diff --git a/CMakeLists.txt b/CMakeLists.txt
index 81afe249..78ab771b 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -447,9 +447,7 @@ if(${RUN_HIT} EQUAL 0)
   execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_COMMON_BIN_DIR}" "${HIP_ROOT_DIR}/bin" RESULT_VARIABLE RUN_HIT ERROR_QUIET)
 endif()
 if(HIP_CATCH_TEST EQUAL "1")
-  enable_testing()
-  set(HIP_PATH ${HIP_ROOT_DIR})
-  add_subdirectory(${HIP_COMMON_DIR}/tests/catch ${PROJECT_BINARY_DIR}/catch)
+  message(STATUS "Building of catch tests through hipamd is no longer supported. Testing targets will not be available. catch tests have been moved to an independent github project hip-tests. Please refer to hip-tests Readme for build instructions! ")
 else()
   if(${RUN_HIT} EQUAL 0)
     set(CMAKE_MODULE_PATH "${HIP_ROOT_DIR}/cmake" ${CMAKE_MODULE_PATH})
diff --git a/hip-config.cmake.in b/hip-config.cmake.in
index 7c4fe7f9..b96803ee 100755
--- a/hip-config.cmake.in
+++ b/hip-config.cmake.in
@@ -206,16 +206,6 @@ if(HIP_COMPILER STREQUAL "clang")
   if (NOT compilePropIsSet)
     hip_add_interface_compile_flags(hip::device -x hip)
-    if (NOT EXISTS ${AMD_DEVICE_LIBS_PREFIX}/amdgcn/bitcode)
-      # This path is to support an older build of the device library
-      # TODO: To be removed in the future.
-      if(WIN32)
-        hip_add_interface_compile_flags(hip::device -fms-extensions -fms-compatibility)
-        hip_add_interface_compile_flags(hip::device --hip-device-lib-path=\"${HIP_PATH}/lib/bitcode\")
-      else()
-        hip_add_interface_compile_flags(hip::device --hip-device-lib-path=\"${AMD_DEVICE_LIBS_PREFIX}/lib\")
-      endif()
-    endif()
   endif()
 
   hip_add_interface_link_flags(hip::device --hip-link)
diff --git a/include/hip/amd_detail/amd_device_functions.h b/include/hip/amd_detail/amd_device_functions.h
index ce421c63..57576f57 100644
--- a/include/hip/amd_detail/amd_device_functions.h
+++ b/include/hip/amd_detail/amd_device_functions.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -444,7 +444,8 @@ __device__ static inline unsigned long long int __double2ull_ru(double x) {
 __device__ static inline unsigned long long int __double2ull_rz(double x) {
     return (unsigned long long int)x;
 }
-
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
 __device__ static inline long long int __double_as_longlong(double x) {
     static_assert(sizeof(long long) == sizeof(double), "");
@@ -453,6 +454,7 @@ __device__ static inline long long int __double_as_longlong(double x) {
     return tmp;
 }
+#pragma clang diagnostic pop
 
 /*
 __device__ unsigned short __float2half_rn(float x);
@@ -775,21 +777,21 @@
 __device__
 inline
 static void __threadfence() {
-  __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_device);
+  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
 }
 
 __device__
 inline
 static void __threadfence_block() {
-  __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_work_group);
+  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
 }
 
 __device__
 inline
 static void __threadfence_system() {
-  __atomic_work_item_fence(0, __memory_order_seq_cst, __memory_scope_all_svm_devices);
+  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
 }
 
 // abort
@@ -862,14 +864,11 @@ void __assertfail()
 }
 #endif /* defined(_WIN32) || defined(_WIN64) */
 
-__device__
-inline
-static void __work_group_barrier(__cl_mem_fence_flags flags, __memory_scope scope)
-{
+__device__ inline static void __work_group_barrier(__cl_mem_fence_flags flags) {
   if (flags) {
-    __atomic_work_item_fence(flags, __memory_order_release, scope);
+    __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
     __builtin_amdgcn_s_barrier();
-    __atomic_work_item_fence(flags, __memory_order_acquire, scope);
+    __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
   } else {
     __builtin_amdgcn_s_barrier();
   }
@@ -879,7 +878,7 @@
 __device__
 inline
 static void __barrier(int n) {
-  __work_group_barrier((__cl_mem_fence_flags)n, __memory_scope_work_group);
+  __work_group_barrier((__cl_mem_fence_flags)n);
 }
 
 __device__
@@ -922,7 +921,7 @@ int __syncthreads_or(int predicate)
   PIPE_ID    7:6   Pipeline from which the wave was dispatched.
   CU_ID      11:8  Compute Unit the wave is assigned to.
   SH_ID      12    Shader Array (within an SE) the wave is assigned to.
-  SE_ID      14:13 Shader Engine the wave is assigned to.
+  SE_ID      15:13 Shader Engine the wave is assigned to.
   TG_ID      19:16 Thread-group ID
   VM_ID      23:20 Virtual Memory ID
   QUEUE_ID   26:24 Queue from which this wave was dispatched.
@@ -935,7 +934,7 @@ int __syncthreads_or(int predicate)
 #define HW_ID_CU_ID_SIZE   4
 #define HW_ID_CU_ID_OFFSET 8
 
-#define HW_ID_SE_ID_SIZE   2
+#define HW_ID_SE_ID_SIZE   3
 #define HW_ID_SE_ID_OFFSET 13
 
 /*
diff --git a/include/hip/amd_detail/amd_hip_bfloat16.h b/include/hip/amd_detail/amd_hip_bfloat16.h
index 8c7f7839..deb3bfb7 100644
--- a/include/hip/amd_detail/amd_hip_bfloat16.h
+++ b/include/hip/amd_detail/amd_hip_bfloat16.h
@@ -32,10 +32,8 @@
 #include "host_defines.h"
 #if defined(__HIPCC_RTC__)
   #define __HOST_DEVICE__ __device__
-  #define HIP_OSTREAM __hip_internal::ostream
 #else
   #define __HOST_DEVICE__ __host__ __device__
-  #define HIP_OSTREAM std::ostream
 #endif
 
 #if __cplusplus < 201103L || !defined(__HIPCC__)
@@ -181,12 +179,12 @@ static_assert(__hip_internal::is_trivial<hip_bfloat16>{},
 static_assert(sizeof(hip_bfloat16) == sizeof(hip_bfloat16_public) &&
               offsetof(hip_bfloat16, data) == offsetof(hip_bfloat16_public, data),
               "internal hip_bfloat16 does not match public hip_bfloat16");
-#endif
 
-inline HIP_OSTREAM& operator<<(HIP_OSTREAM& os, const hip_bfloat16& bf16)
+inline std::ostream& operator<<(std::ostream& os, const hip_bfloat16& bf16)
 {
-    return os << bf16;
+    return os << float(bf16);
 }
+#endif
 
 inline __HOST_DEVICE__ hip_bfloat16 operator+(hip_bfloat16 a)
 {
diff --git a/include/hip/amd_detail/amd_hip_complex.h b/include/hip/amd_detail/amd_hip_complex.h
index eba6eb53..9d9dfd5e 100644
--- a/include/hip/amd_detail/amd_hip_complex.h
+++ b/include/hip/amd_detail/amd_hip_complex.h
@@ -106,15 +106,20 @@ THE SOFTWARE.
         return lhs;                                                                                \
     }
 
-#define COMPLEX_MUL_PREOP_OVERLOAD(type)                                                           \
-    __HOST_DEVICE__ static inline type& operator*=(type& lhs, const type& rhs) {                   \
-        lhs = lhs * rhs;                                                                           \
-        return lhs;                                                                                \
+#define COMPLEX_MUL_PREOP_OVERLOAD(type)                                                           \
+    __HOST_DEVICE__ static inline type& operator*=(type& lhs, const type& rhs) {                   \
+        type temp{lhs};                                                                            \
+        lhs.x = rhs.x * temp.x - rhs.y * temp.y;                                                   \
+        lhs.y = rhs.y * temp.x + rhs.x * temp.y;                                                   \
+        return lhs;                                                                                \
     }
 
 #define COMPLEX_DIV_PREOP_OVERLOAD(type)                                                           \
-    __HOST_DEVICE__ static inline type& operator/=(type& lhs, const type& rhs) {                   \
-        lhs = lhs / rhs;                                                                           \
+    __HOST_DEVICE__ static inline type& operator/=(type& lhs, const type& rhs) {                   \
+        type temp;                                                                                 \
+        temp.x = (lhs.x*rhs.x + lhs.y * rhs.y) / (rhs.x*rhs.x + rhs.y*rhs.y);                      \
+        temp.y = (lhs.y * rhs.x - lhs.x * rhs.y) / (rhs.x*rhs.x + rhs.y*rhs.y);                    \
+        lhs = temp;                                                                                \
         return lhs;                                                                                \
     }
diff --git a/include/hip/amd_detail/amd_hip_cooperative_groups.h b/include/hip/amd_detail/amd_hip_cooperative_groups.h
index 747f65a4..575a9f8e 100644
--- a/include/hip/amd_detail/amd_hip_cooperative_groups.h
+++ b/include/hip/amd_detail/amd_hip_cooperative_groups.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -32,6 +32,13 @@ THE SOFTWARE.
 #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
 #define HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
 
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wc++98-compat"
+#pragma clang diagnostic ignored "-Wsign-conversion"
+#pragma clang diagnostic ignored "-Wunused-parameter"
+#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
+#pragma clang diagnostic ignored "-Wpadded"
+
 #if __cplusplus
 #if !defined(__HIPCC_RTC__)
 #include
@@ -69,8 +76,8 @@ class thread_group {
   // only when the group is supposed to contain only the calling the thread
   // (throurh the API - `this_thread()`), and in all other cases, this thread
   // group object is a sub-object of some other derived thread group object
-  __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size = (uint64_t)0,
-                                uint64_t mask = (uint64_t)0) {
+  __CG_QUALIFIER__ thread_group(internal::group_type type, uint32_t size = static_cast<uint64_t>(0),
+                                uint64_t mask = static_cast<uint64_t>(0)) {
     _type = type;
     _size = size;
     _mask = mask;
@@ -199,7 +206,7 @@ class thread_block : public thread_group {
     const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
     // Invalid tile size, assert
     if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
-      __hip_assert(false && "invalid tile size");
+      __hip_assert(false && "invalid tile size")
     }
 
     thread_group tiledGroup = thread_group(internal::cg_tiled_group, tile_size);
@@ -246,7 +253,7 @@ class tiled_group : public thread_group {
     const bool pow2 = ((tile_size & (tile_size - 1)) == 0);
 
     if (!tile_size || (tile_size > __AMDGCN_WAVEFRONT_SIZE) || !pow2) {
-      __hip_assert(false && "invalid tile size");
+      __hip_assert(false && "invalid tile size")
     }
 
     if (size() <= tile_size) {
@@ -282,7 +289,6 @@ class tiled_group : public thread_group {
  * \details Represents a active thread group in a wavefront.
  *          This group type also supports sub-wave level intrinsics.
  */
-
 class coalesced_group : public thread_group {
  private:
   friend __CG_QUALIFIER__ coalesced_group coalesced_threads();
@@ -300,8 +306,8 @@ class coalesced_group : public thread_group {
       // prepare a mask for further partitioning it so that it stays coalesced.
       if (coalesced_info.tiled_info.is_tiled) {
         unsigned int base_offset = (thread_rank() & (~(tile_size - 1)));
-        unsigned int masklength = min((unsigned int)size() - base_offset, tile_size);
-        lane_mask member_mask = (lane_mask)(-1) >> (__AMDGCN_WAVEFRONT_SIZE - masklength);
+        unsigned int masklength = min(static_cast<unsigned int>(size()) - base_offset, tile_size);
+        lane_mask member_mask = static_cast<lane_mask>(-1) >> (__AMDGCN_WAVEFRONT_SIZE - masklength);
 
         member_mask <<= (__lane_id() & ~(tile_size - 1));
         coalesced_group coalesced_tile = coalesced_group(member_mask);
@@ -358,7 +364,7 @@ class coalesced_group : public thread_group {
   __CG_QUALIFIER__ T shfl(T var, int srcRank) const {
     static_assert(is_valid_type<T>::value, "Neither an integer or float type.");
 
-    srcRank = srcRank % size();
+    srcRank = srcRank % static_cast<int>(size());
 
     int lane = (size() == __AMDGCN_WAVEFRONT_SIZE) ? srcRank
         : (__AMDGCN_WAVEFRONT_SIZE == 64) ?
           __fns64(coalesced_info.member_mask, 0, (srcRank + 1))
@@ -452,7 +458,7 @@ __CG_QUALIFIER__ uint32_t thread_group::thread_rank() const {
       return (static_cast<const coalesced_group*>(this)->thread_rank());
     }
     default: {
-      __hip_assert(false && "invalid cooperative group type");
+      __hip_assert(false && "invalid cooperative group type")
       return -1;
     }
   }
@@ -476,7 +482,7 @@ __CG_QUALIFIER__ bool thread_group::is_valid() const {
       return (static_cast<const coalesced_group*>(this)->is_valid());
     }
     default: {
-      __hip_assert(false && "invalid cooperative group type");
+      __hip_assert(false && "invalid cooperative group type")
       return false;
     }
   }
@@ -505,7 +511,7 @@ __CG_QUALIFIER__ void thread_group::sync() const {
       break;
     }
     default: {
-      __hip_assert(false && "invalid cooperative group type");
+      __hip_assert(false && "invalid cooperative group type")
     }
   }
 }
@@ -697,6 +703,6 @@ __CG_QUALIFIER__ thread_block_tile<size, ParentCGTy> tiled_partition(const ParentCGTy& g) {
   return impl::tiled_partition_internal<size, ParentCGTy>(g);
 }
 }  // namespace cooperative_groups
-
+#pragma clang diagnostic pop
 #endif  // __cplusplus
 #endif  // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_H
diff --git a/include/hip/amd_detail/amd_hip_fp16.h b/include/hip/amd_detail/amd_hip_fp16.h
index fb07cfb6..fa58c3e4 100644
--- a/include/hip/amd_detail/amd_hip_fp16.h
+++ b/include/hip/amd_detail/amd_hip_fp16.h
@@ -1569,7 +1569,7 @@ THE SOFTWARE.
     __half hrcp(__half x)
     {
         return __half_raw{
-            __llvm_amdgcn_rcp_f16(static_cast<__half_raw>(x).data)};
+            static_cast<_Float16>(__builtin_amdgcn_rcph(static_cast<__half_raw>(x).data))};
     }
 
     inline
     __device__
@@ -1672,7 +1672,10 @@ THE SOFTWARE.
     __half2 h2log10(__half2 x) { return __ocml_log10_2f16(x); }
     inline
     __HOST_DEVICE__
-    __half2 h2rcp(__half2 x) { return __llvm_amdgcn_rcp_2f16(x); }
+    __half2 h2rcp(__half2 x) {
+        return _Float16_2{static_cast<_Float16>(__builtin_amdgcn_rcph(x.x)),
+                          static_cast<_Float16>(__builtin_amdgcn_rcph(x.y))};
+    }
     inline
     __HOST_DEVICE__
     __half2 h2rsqrt(__half2 x) { return __ocml_rsqrt_2f16(x); }
@@ -1707,6 +1710,66 @@ THE SOFTWARE.
     using half = __half;
     using half2 = __half2;
 #endif
+    #if !defined(__HIPCC_RTC__)
+    #include "amd_device_functions.h"
+    #include "amd_warp_functions.h"
+    __device__
+    inline
+    __half __shfl(__half var, int src_lane, int width = warpSize) {
+       union { int i; __half h; } tmp; tmp.h = var;
+       tmp.i = __shfl(tmp.i, src_lane, width);
+       return tmp.h;
+    }
+    __device__
+    inline
+    __half2 __shfl(__half2 var, int src_lane, int width = warpSize) {
+       union { int i; __half2 h; } tmp; tmp.h = var;
+       tmp.i = __shfl(tmp.i, src_lane, width);
+       return tmp.h;
+    }
+    __device__
+    inline
+    __half __shfl_up(__half var, unsigned int lane_delta, int width = warpSize) {
+       union { int i; __half h; } tmp; tmp.h = var;
+       tmp.i = __shfl_up(tmp.i, lane_delta, width);
+       return tmp.h;
+    }
+    __device__
+    inline
+    __half2 __shfl_up(__half2 var, unsigned int lane_delta, int width = warpSize) {
+       union { int i; __half2 h; } tmp; tmp.h = var;
+       tmp.i = __shfl_up(tmp.i, lane_delta, width);
+       return tmp.h;
+    }
+    __device__
+    inline
+    __half __shfl_down(__half var, unsigned int lane_delta, int width = warpSize) {
+       union { int i; __half h; } tmp; tmp.h = var;
+       tmp.i = __shfl_down(tmp.i, lane_delta, width);
+       return tmp.h;
+    }
+    __device__
+    inline
+    __half2 __shfl_down(__half2 var, unsigned int lane_delta, int width = warpSize) {
+       union { int i; __half2 h; } tmp; tmp.h = var;
+       tmp.i = __shfl_down(tmp.i, lane_delta, width);
+       return tmp.h;
+    }
+    __device__
+    inline
+    __half __shfl_xor(__half var, int lane_mask, int width = warpSize) {
+       union { int i; __half h; } tmp; tmp.h = var;
+       tmp.i = __shfl_xor(tmp.i, lane_mask, width);
+       return tmp.h;
+    }
+    __device__
+    inline
+    __half2 __shfl_xor(__half2 var, int lane_mask, int width = warpSize) {
+       union { int i; __half2 h; } tmp; tmp.h = var;
+       tmp.i = __shfl_xor(tmp.i, lane_mask, width);
+       return tmp.h;
+    }
+    #endif // !defined(__HIPCC_RTC__)
 #endif // defined(__cplusplus)
 #elif defined(__GNUC__)
 #include "hip_fp16_gcc.h"
diff --git a/include/hip/amd_detail/amd_hip_unsafe_atomics.h b/include/hip/amd_detail/amd_hip_unsafe_atomics.h
index 243b5a64..0100e99e 100644
--- a/include/hip/amd_detail/amd_hip_unsafe_atomics.h
+++ b/include/hip/amd_detail/amd_hip_unsafe_atomics.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2021 - Present Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -23,6 +23,9 @@ THE SOFTWARE.
 #pragma once
 
 #ifdef __cplusplus
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wold-style-cast"
+
 /**
  * @brief Unsafe floating point rmw atomic add.
  *
@@ -175,7 +178,7 @@ __device__ inline float unsafeAtomicMin(float* addr, float val) {
  * @return Original value contained in \p addr.
 */
 __device__ inline double unsafeAtomicAdd(double* addr, double value) {
-#if (defined(__gfx90a__) || defined(__gfx940_)) && \
+#if (defined(__gfx90a__) || defined(__gfx940__)) && \
     __has_builtin(__builtin_amdgcn_flat_atomic_fadd_f64)
   return __builtin_amdgcn_flat_atomic_fadd_f64(addr, value);
 #elif defined (__hip_atomic_fetch_add)
@@ -307,7 +310,7 @@ __device__ inline double unsafeAtomicMin(double* addr, double val) {
  */
 __device__ inline float safeAtomicAdd(float* addr, float value) {
 #if defined(__gfx908__) || \
-    (defined(__gfx90a) && !__has_builtin(__hip_atomic_fetch_add))
+    (defined(__gfx90a__) && !__has_builtin(__hip_atomic_fetch_add))
   // On gfx908, we can generate unsafe FP32 atomic add that does not follow all
   // IEEE rules when -munsafe-fp-atomics is passed. Do a CAS loop emulation instead.
   // On gfx90a, if we do not have the __hip_atomic_fetch_add builtin, we need to
@@ -563,4 +566,5 @@ __device__ inline double safeAtomicMin(double* addr, double val) {
 #endif
 }
 
+#pragma clang diagnostic pop
 #endif
diff --git a/include/hip/amd_detail/amd_hip_vector_types.h b/include/hip/amd_detail/amd_hip_vector_types.h
index dfd3b39a..8215fb02 100644
--- a/include/hip/amd_detail/amd_hip_vector_types.h
+++ b/include/hip/amd_detail/amd_hip_vector_types.h
@@ -544,6 +544,13 @@ template struct is_scalar : public integral_constant struct is_scalar : public integral_constant struct is_scalar : public integral_constant
         return HIP_vector_type<T, n>{x} -= y;
     }
-    template<typename T, unsigned int n>
-    __HOST_DEVICE__
-    inline
-    constexpr
-    HIP_vector_type<T, n> operator*(
-        const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
-    {
-        return HIP_vector_type<T, n>{x} *= y;
-    }
     template<typename T, unsigned int n>
     __HOST_DEVICE__
     inline
@@ -737,15 +741,6 @@ template struct is_scalar : public integral_constant
         return HIP_vector_type<T, n>{x} *= y;
     }
-    template<typename T, unsigned int n>
-    __HOST_DEVICE__
-    inline
-    constexpr
-    HIP_vector_type<T, n> operator/(
-        const HIP_vector_type<T, n>& x, const HIP_vector_type<T, n>& y) noexcept
-    {
-        return HIP_vector_type<T, n>{x} /= y;
-    }
     template<typename T, unsigned int n>
     __HOST_DEVICE__
     inline
diff --git a/include/hip/amd_detail/amd_math_functions.h b/include/hip/amd_detail/amd_math_functions.h
index 471f6ee3..3c17d298 100644
--- a/include/hip/amd_detail/amd_math_functions.h
+++ b/include/hip/amd_detail/amd_math_functions.h
@@ -640,22 +640,22 @@ inline
 float __fmul_rz(float x, float y) { return __ocml_mul_rtz_f32(x, y); }
 
 __DEVICE__
 inline
-float __frcp_rd(float x) { return __llvm_amdgcn_rcp_f32(x); }
+float __frcp_rd(float x) { return __builtin_amdgcn_rcpf(x); }
 #endif
 
 __DEVICE__
 inline
-float __frcp_rn(float x) { return __llvm_amdgcn_rcp_f32(x); }
+float __frcp_rn(float x) { return __builtin_amdgcn_rcpf(x); }
 
 #if defined OCML_BASIC_ROUNDED_OPERATIONS
 __DEVICE__
 inline
-float __frcp_ru(float x) { return __llvm_amdgcn_rcp_f32(x); }
+float __frcp_ru(float x) { return __builtin_amdgcn_rcpf(x); }
 
 __DEVICE__
 inline
-float __frcp_rz(float x) { return __llvm_amdgcn_rcp_f32(x); }
+float __frcp_rz(float x) { return __builtin_amdgcn_rcpf(x); }
 #endif
 
 __DEVICE__
 inline
-float __frsqrt_rn(float x) { return __llvm_amdgcn_rsq_f32(x); }
+float __frsqrt_rn(float x) { return __builtin_amdgcn_rsqf(x); }
 
 #if defined OCML_BASIC_ROUNDED_OPERATIONS
 __DEVICE__
 inline
@@ -1155,18 +1155,18 @@ inline
 double __dmul_rz(double x, double y) { return __ocml_mul_rtz_f64(x, y); }
 
 __DEVICE__
 inline
-double __drcp_rd(double x) { return __llvm_amdgcn_rcp_f64(x); }
+double __drcp_rd(double x) { return __builtin_amdgcn_rcp(x); }
 #endif
 
 __DEVICE__
 inline
-double __drcp_rn(double x) { return __llvm_amdgcn_rcp_f64(x); }
+double __drcp_rn(double x) { return __builtin_amdgcn_rcp(x); }
 
 #if defined OCML_BASIC_ROUNDED_OPERATIONS
 __DEVICE__
 inline
-double __drcp_ru(double x) { return __llvm_amdgcn_rcp_f64(x); }
+double __drcp_ru(double x) { return __builtin_amdgcn_rcp(x); }
 
 __DEVICE__
 inline
-double __drcp_rz(double x) { return __llvm_amdgcn_rcp_f64(x); }
+double __drcp_rz(double x) { return __builtin_amdgcn_rcp(x); }
 
 __DEVICE__
 inline
 double __dsqrt_rd(double x) { return __ocml_sqrt_rtn_f64(x); }
diff --git a/include/hip/amd_detail/amd_warp_functions.h b/include/hip/amd_detail/amd_warp_functions.h
index b18ff5f5..fb6065b1 100644
--- a/include/hip/amd_detail/amd_warp_functions.h
+++ b/include/hip/amd_detail/amd_warp_functions.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -23,6 +23,14 @@ THE SOFTWARE.
 #ifndef HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
 #define HIP_INCLUDE_HIP_AMD_DETAIL_WARP_FUNCTIONS_H
 
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wreserved-identifier"
+#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
+#pragma clang diagnostic ignored "-Wsign-conversion"
+#pragma clang diagnostic ignored "-Wold-style-cast"
+#pragma clang diagnostic ignored "-Wc++98-compat"
+#pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
+
 __device__ static inline unsigned __hip_ds_bpermute(int index, unsigned src) {
   union { int i; unsigned u; float f; } tmp; tmp.u = src;
   tmp.i = __builtin_amdgcn_ds_bpermute(index, tmp.i);
@@ -491,4 +499,5 @@ unsigned long long __shfl_xor(unsigned long long var, int lane_mask, int width = warpSize)
   return tmp1;
 }
 
+#pragma clang diagnostic pop
 #endif
diff --git a/include/hip/amd_detail/device_library_decls.h b/include/hip/amd_detail/device_library_decls.h
index 8add4fa2..02228705 100644
--- a/include/hip/amd_detail/device_library_decls.h
+++ b/include/hip/amd_detail/device_library_decls.h
@@ -128,26 +128,4 @@ __device__ inline static __local void* __to_local(unsigned x) { return (__local
 #define __CLK_LOCAL_MEM_FENCE 0x01
 typedef unsigned __cl_mem_fence_flags;
 
-typedef enum __memory_scope {
-  __memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
-  __memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
-  __memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
-  __memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
-  __memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
-} __memory_scope;
-
-// enum values aligned with what clang uses in EmitAtomicExpr()
-typedef enum __memory_order
-{
-  __memory_order_relaxed = __ATOMIC_RELAXED,
-  __memory_order_acquire = __ATOMIC_ACQUIRE,
-  __memory_order_release = __ATOMIC_RELEASE,
-  __memory_order_acq_rel = __ATOMIC_ACQ_REL,
-  __memory_order_seq_cst = __ATOMIC_SEQ_CST
-} __memory_order;
-
-// Linked from hip.amdgcn.bc
-extern "C" __device__ void
-__atomic_work_item_fence(__cl_mem_fence_flags, __memory_order, __memory_scope);
-
 #endif
diff --git a/include/hip/amd_detail/hip_cooperative_groups_helper.h b/include/hip/amd_detail/hip_cooperative_groups_helper.h
index a90f0a3a..877c6a43 100644
--- a/include/hip/amd_detail/hip_cooperative_groups_helper.h
+++ b/include/hip/amd_detail/hip_cooperative_groups_helper.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -39,6 +39,12 @@ THE SOFTWARE.
 #define __align__(x) __attribute__((aligned(x)))
 #endif
 
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wreserved-macro-identifier"
+#pragma clang diagnostic ignored "-Wc++98-compat"
+#pragma clang diagnostic ignored "-Wc++98-compat-pedantic"
+#pragma clang diagnostic ignored "-Wshorten-64-to-32"
+
 #if !defined(__CG_QUALIFIER__)
 #define __CG_QUALIFIER__ __device__ __forceinline__
 #endif
@@ -92,15 +98,18 @@ typedef enum {
  */
 namespace multi_grid {
 
-__CG_STATIC_QUALIFIER__ uint32_t num_grids() { return (uint32_t)__ockl_multi_grid_num_grids(); }
+__CG_STATIC_QUALIFIER__ uint32_t num_grids() {
+  return static_cast<uint32_t>(__ockl_multi_grid_num_grids()); }
 
-__CG_STATIC_QUALIFIER__ uint32_t grid_rank() { return (uint32_t)__ockl_multi_grid_grid_rank(); }
+__CG_STATIC_QUALIFIER__ uint32_t grid_rank() {
+  return static_cast<uint32_t>(__ockl_multi_grid_grid_rank()); }
 
-__CG_STATIC_QUALIFIER__ uint32_t size() { return (uint32_t)__ockl_multi_grid_size(); }
+__CG_STATIC_QUALIFIER__ uint32_t size() { return static_cast<uint32_t>(__ockl_multi_grid_size()); }
 
-__CG_STATIC_QUALIFIER__ uint32_t thread_rank() { return (uint32_t)__ockl_multi_grid_thread_rank(); }
+__CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
+  return static_cast<uint32_t>(__ockl_multi_grid_thread_rank()); }
 
-__CG_STATIC_QUALIFIER__ bool is_valid() { return (bool)__ockl_multi_grid_is_valid(); }
+__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast<bool>(__ockl_multi_grid_is_valid()); }
 
 __CG_STATIC_QUALIFIER__ void sync() { __ockl_multi_grid_sync(); }
 
@@ -112,28 +121,28 @@ __CG_STATIC_QUALIFIER__ void sync() { __ockl_multi_grid_sync(); }
 namespace grid {
 
 __CG_STATIC_QUALIFIER__ uint32_t size() {
-  return (uint32_t)((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y) *
+  return static_cast<uint32_t>((blockDim.z * gridDim.z) * (blockDim.y * gridDim.y) *
                     (blockDim.x * gridDim.x));
 }
 
 __CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
   // Compute global id of the workgroup to which the current thread belongs to
-  uint32_t blkIdx = (uint32_t)((blockIdx.z * gridDim.y * gridDim.x) +
+  uint32_t blkIdx = static_cast<uint32_t>((blockIdx.z * gridDim.y * gridDim.x) +
                                (blockIdx.y * gridDim.x) + (blockIdx.x));
 
   // Compute total number of threads being passed to reach current workgroup
   // within grid
   uint32_t num_threads_till_current_workgroup =
-      (uint32_t)(blkIdx * (blockDim.x * blockDim.y * blockDim.z));
+      static_cast<uint32_t>(blkIdx * (blockDim.x * blockDim.y * blockDim.z));
 
   // Compute thread local rank within current workgroup
-  uint32_t local_thread_rank = (uint32_t)((threadIdx.z * blockDim.y * blockDim.x) +
+  uint32_t local_thread_rank = static_cast<uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
                                           (threadIdx.y * blockDim.x) + (threadIdx.x));
 
   return (num_threads_till_current_workgroup + local_thread_rank);
 }
 
-__CG_STATIC_QUALIFIER__ bool is_valid() { return (bool)__ockl_grid_is_valid(); }
+__CG_STATIC_QUALIFIER__ bool is_valid() { return static_cast<bool>(__ockl_grid_is_valid()); }
 
 __CG_STATIC_QUALIFIER__ void sync() { __ockl_grid_sync(); }
 
@@ -146,19 +155,21 @@ __CG_STATIC_QUALIFIER__ void sync() { __ockl_grid_sync(); }
 namespace workgroup {
 
 __CG_STATIC_QUALIFIER__ dim3 group_index() {
-  return (dim3((uint32_t)blockIdx.x, (uint32_t)blockIdx.y, (uint32_t)blockIdx.z));
+  return (dim3(static_cast<uint32_t>(blockIdx.x), static_cast<uint32_t>(blockIdx.y),
+               static_cast<uint32_t>(blockIdx.z)));
 }
 
 __CG_STATIC_QUALIFIER__ dim3 thread_index() {
-  return (dim3((uint32_t)threadIdx.x, (uint32_t)threadIdx.y, (uint32_t)threadIdx.z));
+  return (dim3(static_cast<uint32_t>(threadIdx.x), static_cast<uint32_t>(threadIdx.y),
+               static_cast<uint32_t>(threadIdx.z)));
 }
 
 __CG_STATIC_QUALIFIER__ uint32_t size() {
-  return ((uint32_t)(blockDim.x * blockDim.y * blockDim.z));
+  return (static_cast<uint32_t>(blockDim.x * blockDim.y * blockDim.z));
 }
 
 __CG_STATIC_QUALIFIER__ uint32_t thread_rank() {
-  return ((uint32_t)((threadIdx.z * blockDim.y * blockDim.x) +
+  return (static_cast<uint32_t>((threadIdx.z * blockDim.y * blockDim.x) +
                      (threadIdx.y * blockDim.x) + (threadIdx.x)));
 }
 
@@ -187,8 +198,8 @@ __CG_STATIC_QUALIFIER__ void sync() { __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "workgroup"); }
 
 //
 // For each thread, this function returns the number of active threads which
 // have i-th bit of x set and come before the current thread.
-__device__ unsigned int masked_bit_count(lane_mask x, unsigned int add = 0) {
-  int counter=0;
+__CG_STATIC_QUALIFIER__ unsigned int masked_bit_count(lane_mask x, unsigned int add = 0) {
+  unsigned int counter=0;
 #if __AMDGCN_WAVEFRONT_SIZE == 32
   counter = __builtin_amdgcn_mbcnt_lo(x, add);
 #else
@@ -206,5 +217,6 @@ __device__ unsigned int masked_bit_count(lane_mask x, unsigned int add = 0) {
 
 }  // namespace cooperative_groups
 
+#pragma clang diagnostic pop
 #endif  // __cplusplus
 #endif  // HIP_INCLUDE_HIP_AMD_DETAIL_HIP_COOPERATIVE_GROUPS_HELPER_H
diff --git a/include/hip/amd_detail/hip_fp16_math_fwd.h b/include/hip/amd_detail/hip_fp16_math_fwd.h
index 36942c1a..caf6ec75 100644
--- a/include/hip/amd_detail/hip_fp16_math_fwd.h
+++ b/include/hip/amd_detail/hip_fp16_math_fwd.h
@@ -44,7 +44,6 @@ extern "C"
     __device__ __attribute__((pure)) _Float16 __ocml_log10_f16(_Float16);
     __device__ __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16);
     __device__ __attribute__((pure)) _Float16 __ocml_pown_f16(_Float16, int);
-    __device__ __attribute__((const)) _Float16 __llvm_amdgcn_rcp_f16(_Float16);
     __device__ __attribute__((const)) _Float16 __ocml_rint_f16(_Float16);
     __device__ __attribute__((const)) _Float16 __ocml_rsqrt_f16(_Float16);
     __device__ _Float16 __ocml_sin_f16(_Float16);
@@ -73,11 +72,6 @@ extern "C"
     __device__ __attribute__((pure)) __2f16 __ocml_log_2f16(__2f16);
     __device__ __attribute__((pure)) __2f16 __ocml_log10_2f16(__2f16);
     __device__ __attribute__((pure)) __2f16 __ocml_log2_2f16(__2f16);
-    __device__ inline
-    __2f16 __llvm_amdgcn_rcp_2f16(__2f16 x) // Not currently exposed by ROCDL.
-    {
-        return __2f16{__llvm_amdgcn_rcp_f16(x.x), __llvm_amdgcn_rcp_f16(x.y)};
-    }
     __device__ __attribute__((const)) __2f16 __ocml_rint_2f16(__2f16);
     __device__ __attribute__((const)) __2f16 __ocml_rsqrt_2f16(__2f16);
     __device__ __2f16 __ocml_sin_2f16(__2f16);
diff --git a/include/hip/amd_detail/hip_prof_str.h b/include/hip/amd_detail/hip_prof_str.h
index 7ec70fdc..d0b24d01 100644
--- a/include/hip/amd_detail/hip_prof_str.h
+++ b/include/hip/amd_detail/hip_prof_str.h
@@ -370,10 +370,12 @@ enum hip_api_id_t {
   HIP_API_ID_hipGraphMemFreeNodeGetParams = 357,
   HIP_API_ID_hipModuleLaunchCooperativeKernel = 358,
   HIP_API_ID_hipModuleLaunchCooperativeKernelMultiDevice = 359,
-  HIP_API_ID_LAST = 359,
+  HIP_API_ID_hipArray3DGetDescriptor = 360,
+  HIP_API_ID_hipArrayGetDescriptor = 361,
+  HIP_API_ID_hipArrayGetInfo = 362,
+  HIP_API_ID_hipStreamGetDevice = 363,
+  HIP_API_ID_LAST = 363,
 
-  HIP_API_ID_hipArray3DGetDescriptor = HIP_API_ID_NONE,
-  HIP_API_ID_hipArrayGetDescriptor = HIP_API_ID_NONE,
   HIP_API_ID_hipBindTexture = HIP_API_ID_NONE,
   HIP_API_ID_hipBindTexture2D = HIP_API_ID_NONE,
   HIP_API_ID_hipBindTextureToArray = HIP_API_ID_NONE,
@@ -420,8 +422,11 @@ static inline const char* hip_api_name(const uint32_t id) {
   case HIP_API_ID___hipPopCallConfiguration: return "__hipPopCallConfiguration";
   case HIP_API_ID___hipPushCallConfiguration: return "__hipPushCallConfiguration";
   case HIP_API_ID_hipArray3DCreate: return "hipArray3DCreate";
+  case HIP_API_ID_hipArray3DGetDescriptor: return "hipArray3DGetDescriptor";
   case HIP_API_ID_hipArrayCreate: return "hipArrayCreate";
   case HIP_API_ID_hipArrayDestroy: return "hipArrayDestroy";
+  case HIP_API_ID_hipArrayGetDescriptor: return "hipArrayGetDescriptor";
+  case HIP_API_ID_hipArrayGetInfo: return "hipArrayGetInfo";
   case HIP_API_ID_hipChooseDevice: return "hipChooseDevice";
   case HIP_API_ID_hipConfigureCall: return "hipConfigureCall";
   case HIP_API_ID_hipCreateSurfaceObject: return "hipCreateSurfaceObject";
@@ -739,6 +744,7 @@ static inline const char* hip_api_name(const uint32_t id) {
   case HIP_API_ID_hipStreamEndCapture: return "hipStreamEndCapture";
   case HIP_API_ID_hipStreamGetCaptureInfo: return "hipStreamGetCaptureInfo";
   case HIP_API_ID_hipStreamGetCaptureInfo_v2: return "hipStreamGetCaptureInfo_v2";
+  case HIP_API_ID_hipStreamGetDevice: return "hipStreamGetDevice";
   case HIP_API_ID_hipStreamGetFlags: return "hipStreamGetFlags";
   case HIP_API_ID_hipStreamGetPriority: return "hipStreamGetPriority";
   case HIP_API_ID_hipStreamIsCapturing: return "hipStreamIsCapturing";
@@ -782,8 +788,11 @@ static inline uint32_t hipApiIdByName(const char* name) {
   if (strcmp("__hipPopCallConfiguration", name) == 0) return HIP_API_ID___hipPopCallConfiguration;
   if (strcmp("__hipPushCallConfiguration", name) == 0) return HIP_API_ID___hipPushCallConfiguration;
   if (strcmp("hipArray3DCreate", name) == 0) return HIP_API_ID_hipArray3DCreate;
+  if (strcmp("hipArray3DGetDescriptor", name) == 0) return HIP_API_ID_hipArray3DGetDescriptor;
   if (strcmp("hipArrayCreate", name) == 0) return HIP_API_ID_hipArrayCreate;
   if (strcmp("hipArrayDestroy", name) == 0) return HIP_API_ID_hipArrayDestroy;
+  if (strcmp("hipArrayGetDescriptor", name) == 0) return HIP_API_ID_hipArrayGetDescriptor;
+  if (strcmp("hipArrayGetInfo", name) == 0) return HIP_API_ID_hipArrayGetInfo;
   if (strcmp("hipChooseDevice", name) == 0) return HIP_API_ID_hipChooseDevice;
   if (strcmp("hipConfigureCall", name) == 0) return HIP_API_ID_hipConfigureCall;
   if (strcmp("hipCreateSurfaceObject", name) == 0) return HIP_API_ID_hipCreateSurfaceObject;
@@ -1101,6 +1110,7 @@ static inline uint32_t hipApiIdByName(const char* name) {
   if (strcmp("hipStreamEndCapture", name) == 0) return HIP_API_ID_hipStreamEndCapture;
   if (strcmp("hipStreamGetCaptureInfo", name) == 0) return HIP_API_ID_hipStreamGetCaptureInfo;
   if (strcmp("hipStreamGetCaptureInfo_v2", name) == 0) return HIP_API_ID_hipStreamGetCaptureInfo_v2;
+  if (strcmp("hipStreamGetDevice", name) == 0) return HIP_API_ID_hipStreamGetDevice;
   if (strcmp("hipStreamGetFlags", name) == 0) return HIP_API_ID_hipStreamGetFlags;
   if (strcmp("hipStreamGetPriority", name) == 0) return HIP_API_ID_hipStreamGetPriority;
   if (strcmp("hipStreamIsCapturing", name) == 0) return HIP_API_ID_hipStreamIsCapturing;
@@ -1164,6 +1174,12 @@ typedef struct hip_api_data_s {
       const HIP_ARRAY3D_DESCRIPTOR* pAllocateArray;
       HIP_ARRAY3D_DESCRIPTOR pAllocateArray__val;
     } hipArray3DCreate;
+    struct {
+      HIP_ARRAY3D_DESCRIPTOR* pArrayDescriptor;
+      HIP_ARRAY3D_DESCRIPTOR pArrayDescriptor__val;
+      hipArray* array;
+      hipArray array__val;
+    } hipArray3DGetDescriptor;
     struct {
       hipArray** pHandle;
       hipArray* pHandle__val;
@@ -1174,6 +1190,22 @@ typedef struct hip_api_data_s {
       hipArray* array;
      hipArray array__val;
     } hipArrayDestroy;
+    struct {
+      HIP_ARRAY_DESCRIPTOR* pArrayDescriptor;
+      HIP_ARRAY_DESCRIPTOR pArrayDescriptor__val;
+      hipArray* array;
+      hipArray array__val;
+    } hipArrayGetDescriptor;
+    struct {
+      hipChannelFormatDesc* desc;
+      hipChannelFormatDesc desc__val;
+      hipExtent* extent;
+      hipExtent extent__val;
+      unsigned int* flags;
+      unsigned int flags__val;
+      hipArray* array;
+      hipArray array__val;
+    } hipArrayGetInfo;
     struct {
       int* device;
       int device__val;
@@ -3033,6 +3065,11 @@ typedef struct hip_api_data_s {
       size_t* numDependencies_out;
       size_t numDependencies_out__val;
     } hipStreamGetCaptureInfo_v2;
+    struct {
+      hipStream_t stream;
+      hipDevice_t* device;
+      hipDevice_t device__val;
+    } hipStreamGetDevice;
     struct {
       hipStream_t stream;
       unsigned int* flags;
@@ -3252,6 +3289,11 @@ typedef struct hip_api_data_s {
   cb_data.args.hipArray3DCreate.array = (hipArray**)array; \
   cb_data.args.hipArray3DCreate.pAllocateArray = (const HIP_ARRAY3D_DESCRIPTOR*)pAllocateArray; \
 };
+// hipArray3DGetDescriptor[('HIP_ARRAY3D_DESCRIPTOR*', 'pArrayDescriptor'), ('hipArray*', 'array')]
+#define INIT_hipArray3DGetDescriptor_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipArray3DGetDescriptor.pArrayDescriptor = (HIP_ARRAY3D_DESCRIPTOR*)pArrayDescriptor; \
+  cb_data.args.hipArray3DGetDescriptor.array = (hipArray*)array; \
+};
 // hipArrayCreate[('hipArray**', 'pHandle'), ('const HIP_ARRAY_DESCRIPTOR*', 'pAllocateArray')]
 #define INIT_hipArrayCreate_CB_ARGS_DATA(cb_data) { \
   cb_data.args.hipArrayCreate.pHandle = (hipArray**)array; \
@@ -3261,6 +3303,18 @@ typedef struct hip_api_data_s {
 #define INIT_hipArrayDestroy_CB_ARGS_DATA(cb_data) { \
   cb_data.args.hipArrayDestroy.array = (hipArray*)array; \
 };
+// hipArrayGetDescriptor[('HIP_ARRAY_DESCRIPTOR*', 'pArrayDescriptor'), ('hipArray*', 'array')]
+#define INIT_hipArrayGetDescriptor_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipArrayGetDescriptor.pArrayDescriptor = (HIP_ARRAY_DESCRIPTOR*)pArrayDescriptor; \
+  cb_data.args.hipArrayGetDescriptor.array = (hipArray*)array; \
+};
+// hipArrayGetInfo[('hipChannelFormatDesc*', 'desc'), ('hipExtent*', 'extent'), ('unsigned int*', 'flags'), ('hipArray*', 'array')]
+#define INIT_hipArrayGetInfo_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipArrayGetInfo.desc = (hipChannelFormatDesc*)desc; \
+  cb_data.args.hipArrayGetInfo.extent = (hipExtent*)extent; \
+  cb_data.args.hipArrayGetInfo.flags = (unsigned int*)flags; \
+  cb_data.args.hipArrayGetInfo.array = (hipArray*)array; \
+};
 // hipChooseDevice[('int*', 'device'), ('const hipDeviceProp_t*', 'prop')]
 #define INIT_hipChooseDevice_CB_ARGS_DATA(cb_data) { \
   cb_data.args.hipChooseDevice.device = (int*)device; \
@@ -5185,6 +5239,11 @@ typedef struct hip_api_data_s {
   cb_data.args.hipStreamGetCaptureInfo_v2.dependencies_out = (const hipGraphNode_t**)dependencies_out; \
   cb_data.args.hipStreamGetCaptureInfo_v2.numDependencies_out = (size_t*)numDependencies_out; \
 };
+// hipStreamGetDevice[('hipStream_t', 'stream'), ('hipDevice_t*', 'device')]
+#define INIT_hipStreamGetDevice_CB_ARGS_DATA(cb_data) { \
+  cb_data.args.hipStreamGetDevice.stream = (hipStream_t)stream; \
+  cb_data.args.hipStreamGetDevice.device = (hipDevice_t*)device; \
+};
 // hipStreamGetFlags[('hipStream_t', 'stream'), ('unsigned int*', 'flags')]
 #define INIT_hipStreamGetFlags_CB_ARGS_DATA(cb_data) { \
   cb_data.args.hipStreamGetFlags.stream = (hipStream_t)stream; \
@@ -5376,10 +5435,6 @@ typedef struct hip_api_data_s {
 #define INIT_CB_ARGS_DATA(cb_id, cb_data) INIT_##cb_id##_CB_ARGS_DATA(cb_data)
 
 // Macros for non-public API primitives
-// hipArray3DGetDescriptor()
-#define INIT_hipArray3DGetDescriptor_CB_ARGS_DATA(cb_data) {};
-// hipArrayGetDescriptor()
-#define INIT_hipArrayGetDescriptor_CB_ARGS_DATA(cb_data) {};
 // hipBindTexture()
 #define INIT_hipBindTexture_CB_ARGS_DATA(cb_data) {};
 // hipBindTexture2D()
@@ -5478,6 +5533,11 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) {
     if (data->args.hipArray3DCreate.array) data->args.hipArray3DCreate.array__val = *(data->args.hipArray3DCreate.array);
     if (data->args.hipArray3DCreate.pAllocateArray) data->args.hipArray3DCreate.pAllocateArray__val = *(data->args.hipArray3DCreate.pAllocateArray);
     break;
+// hipArray3DGetDescriptor[('HIP_ARRAY3D_DESCRIPTOR*', 'pArrayDescriptor'), ('hipArray*', 'array')]
+  case HIP_API_ID_hipArray3DGetDescriptor:
+    if (data->args.hipArray3DGetDescriptor.pArrayDescriptor) data->args.hipArray3DGetDescriptor.pArrayDescriptor__val = *(data->args.hipArray3DGetDescriptor.pArrayDescriptor);
+    if (data->args.hipArray3DGetDescriptor.array) data->args.hipArray3DGetDescriptor.array__val = *(data->args.hipArray3DGetDescriptor.array);
+    break;
 // hipArrayCreate[('hipArray**', 'pHandle'), ('const HIP_ARRAY_DESCRIPTOR*', 'pAllocateArray')]
   case HIP_API_ID_hipArrayCreate:
     if (data->args.hipArrayCreate.pHandle) data->args.hipArrayCreate.pHandle__val = *(data->args.hipArrayCreate.pHandle);
@@ -5487,6 +5547,18 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) {
   case HIP_API_ID_hipArrayDestroy:
     if (data->args.hipArrayDestroy.array) data->args.hipArrayDestroy.array__val = *(data->args.hipArrayDestroy.array);
     break;
+// hipArrayGetDescriptor[('HIP_ARRAY_DESCRIPTOR*', 'pArrayDescriptor'), ('hipArray*', 'array')]
+  case HIP_API_ID_hipArrayGetDescriptor:
+    if (data->args.hipArrayGetDescriptor.pArrayDescriptor) data->args.hipArrayGetDescriptor.pArrayDescriptor__val = *(data->args.hipArrayGetDescriptor.pArrayDescriptor);
+    if (data->args.hipArrayGetDescriptor.array) data->args.hipArrayGetDescriptor.array__val = *(data->args.hipArrayGetDescriptor.array);
+    break;
+// hipArrayGetInfo[('hipChannelFormatDesc*', 'desc'), ('hipExtent*', 'extent'), ('unsigned int*', 'flags'), ('hipArray*', 'array')]
+  case HIP_API_ID_hipArrayGetInfo:
+    if (data->args.hipArrayGetInfo.desc) data->args.hipArrayGetInfo.desc__val = *(data->args.hipArrayGetInfo.desc);
+    if (data->args.hipArrayGetInfo.extent) data->args.hipArrayGetInfo.extent__val = *(data->args.hipArrayGetInfo.extent);
+    if (data->args.hipArrayGetInfo.flags) data->args.hipArrayGetInfo.flags__val = *(data->args.hipArrayGetInfo.flags);
+    if (data->args.hipArrayGetInfo.array) data->args.hipArrayGetInfo.array__val = *(data->args.hipArrayGetInfo.array);
+    break;
 // hipChooseDevice[('int*', 'device'), ('const hipDeviceProp_t*', 'prop')]
   case HIP_API_ID_hipChooseDevice:
     if (data->args.hipChooseDevice.device) data->args.hipChooseDevice.device__val = *(data->args.hipChooseDevice.device);
@@ -6706,6 +6778,10 @@ static inline void hipApiArgsInit(hip_api_id_t id, hip_api_data_t* data) {
     if (data->args.hipStreamGetCaptureInfo_v2.dependencies_out) data->args.hipStreamGetCaptureInfo_v2.dependencies_out__val = *(data->args.hipStreamGetCaptureInfo_v2.dependencies_out);
     if (data->args.hipStreamGetCaptureInfo_v2.numDependencies_out) data->args.hipStreamGetCaptureInfo_v2.numDependencies_out__val = *(data->args.hipStreamGetCaptureInfo_v2.numDependencies_out);
     break;
+// hipStreamGetDevice[('hipStream_t', 'stream'), ('hipDevice_t*', 'device')]
+  case HIP_API_ID_hipStreamGetDevice:
+    if (data->args.hipStreamGetDevice.device) data->args.hipStreamGetDevice.device__val = *(data->args.hipStreamGetDevice.device);
+    break;
 // hipStreamGetFlags[('hipStream_t', 'stream'), ('unsigned int*', 'flags')]
   case HIP_API_ID_hipStreamGetFlags:
    if (data->args.hipStreamGetFlags.flags) data->args.hipStreamGetFlags.flags__val = *(data->args.hipStreamGetFlags.flags);
@@ -6881,6 +6957,14 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* data) {
     else { oss << ", pAllocateArray="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArray3DCreate.pAllocateArray__val); }
     oss << ")";
     break;
+  case HIP_API_ID_hipArray3DGetDescriptor:
+    oss << "hipArray3DGetDescriptor(";
+    if (data->args.hipArray3DGetDescriptor.pArrayDescriptor == NULL) oss << "pArrayDescriptor=NULL";
+    else { oss << "pArrayDescriptor="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArray3DGetDescriptor.pArrayDescriptor__val); }
+    if (data->args.hipArray3DGetDescriptor.array == NULL) oss << ", array=NULL";
+    else { oss << ", array="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArray3DGetDescriptor.array__val); }
+    oss << ")";
+    break;
   case HIP_API_ID_hipArrayCreate:
     oss << "hipArrayCreate(";
     if (data->args.hipArrayCreate.pHandle == NULL) oss << "pHandle=NULL";
@@ -6895,6 +6979,26 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* data) {
     else { oss << "array="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayDestroy.array__val); }
     oss << ")";
     break;
+  case HIP_API_ID_hipArrayGetDescriptor:
+    oss << "hipArrayGetDescriptor(";
+    if (data->args.hipArrayGetDescriptor.pArrayDescriptor == NULL) oss << "pArrayDescriptor=NULL";
+    else { oss << "pArrayDescriptor="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayGetDescriptor.pArrayDescriptor__val); }
+    if (data->args.hipArrayGetDescriptor.array == NULL) oss << ", array=NULL";
+    else { oss << ", array="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayGetDescriptor.array__val); }
+    oss << ")";
+    break;
+  case HIP_API_ID_hipArrayGetInfo:
+    oss << "hipArrayGetInfo(";
+    if (data->args.hipArrayGetInfo.desc == NULL) oss << "desc=NULL";
+    else { oss << "desc="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayGetInfo.desc__val); }
+    if (data->args.hipArrayGetInfo.extent == NULL) oss << ", extent=NULL";
+    else { oss << ", extent="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayGetInfo.extent__val); }
+    if (data->args.hipArrayGetInfo.flags == NULL) oss << ", flags=NULL";
+    else { oss << ", flags="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayGetInfo.flags__val); }
+    if (data->args.hipArrayGetInfo.array == NULL) oss << ", array=NULL";
+    else { oss << ", array="; roctracer::hip_support::detail::operator<<(oss, data->args.hipArrayGetInfo.array__val); }
+    oss << ")";
+    break;
   case HIP_API_ID_hipChooseDevice:
     oss << "hipChooseDevice(";
     if (data->args.hipChooseDevice.device == NULL) oss << "device=NULL";
@@ -9404,6 +9508,13 @@ static inline const char* hipApiString(hip_api_id_t id, const hip_api_data_t* data) {
     else { oss << ", numDependencies_out="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetCaptureInfo_v2.numDependencies_out__val); }
     oss << ")";
     break;
+  case HIP_API_ID_hipStreamGetDevice:
+    oss << "hipStreamGetDevice(";
+    oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetDevice.stream);
+    if (data->args.hipStreamGetDevice.device == NULL) oss << ", device=NULL";
+    else { oss << ", device="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetDevice.device__val); }
+    oss << ")";
+    break;
   case HIP_API_ID_hipStreamGetFlags:
     oss << "hipStreamGetFlags(";
     oss << "stream="; roctracer::hip_support::detail::operator<<(oss, data->args.hipStreamGetFlags.stream);
diff --git a/include/hip/amd_detail/math_fwd.h b/include/hip/amd_detail/math_fwd.h
index 050c88c1..9e999268 100644
--- a/include/hip/amd_detail/math_fwd.h
+++ b/include/hip/amd_detail/math_fwd.h
@@ -386,19 +386,6 @@ float __ocml_fma_rtp_f32(float, float, float);
 __device__
 __attribute__((const))
 float __ocml_fma_rtz_f32(float, float, float);
-
-__device__
-__attribute__((const))
-float __llvm_amdgcn_cos_f32(float) __asm("llvm.amdgcn.cos.f32");
-__device__
-__attribute__((const))
-float __llvm_amdgcn_rcp_f32(float) __asm("llvm.amdgcn.rcp.f32");
-__device__
-__attribute__((const))
-float __llvm_amdgcn_rsq_f32(float) __asm("llvm.amdgcn.rsq.f32");
-__device__
-__attribute__((const))
-float __llvm_amdgcn_sin_f32(float) __asm("llvm.amdgcn.sin.f32");
 // END INTRINSICS
 // END FLOAT
@@ -697,13 +684,6 @@ double __ocml_fma_rtp_f64(double, double, double);
 __device__
 __attribute__((const))
 double __ocml_fma_rtz_f64(double, double, double);
-
-__device__
-__attribute__((const))
-double __llvm_amdgcn_rcp_f64(double) __asm("llvm.amdgcn.rcp.f64");
-__device__
-__attribute__((const))
-double __llvm_amdgcn_rsq_f64(double) __asm("llvm.amdgcn.rsq.f64");
 // END INTRINSICS
 // END DOUBLE
diff --git a/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 0c492b7c..4c8be9af 100644
--- a/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -39,6 +39,7 @@ THE SOFTWARE.
 #define CUDA_11030 11030
 #define CUDA_11040 11040
 #define CUDA_11060 11060
+#define CUDA_12000 12000
 
 #ifdef __cplusplus
 extern "C" {
@@ -2506,6 +2507,20 @@ inline static hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallback_t callback,
         cudaStreamAddCallback(stream, (cudaStreamCallback_t)callback, userData, flags));
 }
 
+inline static hipError_t hipStreamGetDevice(hipStream_t stream, hipDevice_t* device) {
+    hipCtx_t context;
+    auto err = hipCUResultTohipError(cuStreamGetCtx(stream, &context));
+    if (err != hipSuccess) return err;
+
+    err = hipCUResultTohipError(cuCtxPushCurrent(context));
+    if (err != hipSuccess) return err;
+
+    err = hipCUResultTohipError(cuCtxGetDevice(device));
+    if (err != hipSuccess) return err;
+
+    return hipCUResultTohipError(cuCtxPopCurrent(&context));
+}
+
 inline static hipError_t hipDriverGetVersion(int* driverVersion) {
     return hipCUDAErrorTohipError(cudaDriverGetVersion(driverVersion));
 }
@@ -2773,6 +2788,7 @@ inline static hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t cacheConfig) {
     return hipCUDAErrorTohipError(cudaFuncSetCacheConfig(func, cacheConfig));
 }
 
+#if CUDA_VERSION < CUDA_12000
 __HIP_DEPRECATED
 inline static hipError_t hipBindTexture(size_t* offset, struct textureReference* tex, const void* devPtr,
@@ -2786,6 +2802,8 @@ __HIP_DEPRECATED inline static hipError_t hipBindTexture2D(
     const hipChannelFormatDesc* desc, size_t width, size_t height, size_t pitch) {
     return hipCUDAErrorTohipError(cudaBindTexture2D(offset, tex, devPtr, desc, width, height, pitch));
 }
+#endif // CUDA_VERSION < CUDA_12000
+
 inline static hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w,
                                                         hipChannelFormatKind f) {
@@ -2818,10 +2836,12 @@ inline static hipError_t hipGetTextureObjectResourceDesc(hipResourceDesc* pResDesc,
     return hipCUDAErrorTohipError(cudaGetTextureObjectResourceDesc( pResDesc, textureObject));
 }
 
+#if CUDA_VERSION < CUDA_12000
 __HIP_DEPRECATED inline static hipError_t hipGetTextureAlignmentOffset(
     size_t* offset, const struct textureReference* texref) {
     return hipCUDAErrorTohipError(cudaGetTextureAlignmentOffset(offset,texref));
 }
+#endif
 
 inline static hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, hipArray_const_t array)
 {
@@ -3067,6 +3087,7 @@ inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
                                                                blockSize, dynamicSMemSize, flags));
 }
 
+#if CUDA_VERSION < CUDA_12000
 template <class T, int dim, enum cudaTextureReadMode readMode>
 inline static hipError_t hipBindTexture(size_t* offset, const struct texture<T, dim, readMode>& tex,
                                         const void* devPtr, size_t size = UINT_MAX) {
@@ -3109,6 +3130,7 @@ __HIP_DEPRECATED inline static hipError_t hipBindTextureToArray(
     struct texture<T, dim, readMode>& tex, hipArray_const_t array) {
     return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array));
 }
+#endif // CUDA_VERSION < CUDA_12000
 
 template <typename T>
 inline static hipChannelFormatDesc hipCreateChannelDesc() {
@@ -3186,6 +3208,21 @@ inline static hipError_t hipArray3DCreate(hiparray* pHandle,
                                           const HIP_ARRAY3D_DESCRIPTOR* pAllocateArray) {
     return hipCUResultTohipError(cuArray3DCreate(pHandle, pAllocateArray));
 }
 
+inline static hipError_t hipArrayGetInfo(hipChannelFormatDesc* desc, hipExtent* extent,
+                                         unsigned int* flags, hipArray* array) {
+    return hipCUDAErrorTohipError(cudaArrayGetInfo(desc, extent, flags, array));
+}
+
+inline static hipError_t hipArrayGetDescriptor(HIP_ARRAY_DESCRIPTOR* pArrayDescriptor,
+                                               hipArray* array) {
+    return hipCUResultTohipError(cuArrayGetDescriptor(pArrayDescriptor, (CUarray)array));
+}
+
+inline static hipError_t hipArray3DGetDescriptor(HIP_ARRAY3D_DESCRIPTOR* pArrayDescriptor,
+                                                 hipArray* array) {
+    return hipCUResultTohipError(cuArray3DGetDescriptor(pArrayDescriptor, (CUarray)array));
+}
+
 inline static hipError_t hipStreamBeginCapture(hipStream_t stream, hipStreamCaptureMode mode) {
     return hipCUDAErrorTohipError(cudaStreamBeginCapture(stream, mode));
 }
diff --git a/src/amd_hsa_elf.hpp b/src/amd_hsa_elf.hpp
index 45cf5c22..ca22fd1f 100644
--- a/src/amd_hsa_elf.hpp
+++ b/src/amd_hsa_elf.hpp
@@ -22,6 +22,9 @@ THE SOFTWARE.
 
 #pragma once
 
+// This header file is partially copied from
+// https://github.com/llvm/llvm-project/blob/main/llvm/include/llvm/BinaryFormat/ELF.h
+
 // AMDGPU OS for HSA compatible compute kernels.
 enum { ELFOSABI_AMDGPU_HSA = 64, ELFOSABI_AMDGPU_PAL = 65, ELFOSABI_AMDGPU_MESA3D = 66 };
@@ -57,38 +60,51 @@ enum : unsigned {
   EF_AMDGPU_MACH_R600_FIRST = EF_AMDGPU_MACH_R600_R600,
   EF_AMDGPU_MACH_R600_LAST = EF_AMDGPU_MACH_R600_TURKS,
 
-  EF_AMDGPU_MACH_AMDGCN_GFX600 = 0x020,
-  EF_AMDGPU_MACH_AMDGCN_GFX601 = 0x021,
-  EF_AMDGPU_MACH_AMDGCN_GFX700 = 0x022,
-  EF_AMDGPU_MACH_AMDGCN_GFX701 = 0x023,
-  EF_AMDGPU_MACH_AMDGCN_GFX702 = 0x024,
-  EF_AMDGPU_MACH_AMDGCN_GFX703 = 0x025,
-  EF_AMDGPU_MACH_AMDGCN_GFX704 = 0x026,
-  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X027 = 0x027,
-  EF_AMDGPU_MACH_AMDGCN_GFX801 = 0x028,
-  EF_AMDGPU_MACH_AMDGCN_GFX802 = 0x029,
-  EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
-  EF_AMDGPU_MACH_AMDGCN_GFX810 = 0x02b,
-  EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
-  EF_AMDGPU_MACH_AMDGCN_GFX902 = 0x02d,
-  EF_AMDGPU_MACH_AMDGCN_GFX904 = 0x02e,
-  EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
-  EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030,
-  EF_AMDGPU_MACH_AMDGCN_GFX909 = 0x031,
-  EF_AMDGPU_MACH_AMDGCN_GFX90C = 0x032,
-  EF_AMDGPU_MACH_AMDGCN_GFX1010 = 0x033,
-  EF_AMDGPU_MACH_AMDGCN_GFX1011 = 0x034,
-  EF_AMDGPU_MACH_AMDGCN_GFX1012 = 0x035,
-  EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036,
-  EF_AMDGPU_MACH_AMDGCN_GFX1031 = 0x037,
-  EF_AMDGPU_MACH_AMDGCN_GFX1032 = 0x038,
-  EF_AMDGPU_MACH_AMDGCN_GFX1033 = 0x039,
-  EF_AMDGPU_MACH_AMDGCN_GFX602 = 0x03a,
-  EF_AMDGPU_MACH_AMDGCN_GFX705 = 0x03b,
-  EF_AMDGPU_MACH_AMDGCN_GFX805 = 0x03c,
-  EF_AMDGPU_MACH_AMDGCN_GFX90A = 0x03f,
+  // AMDGCN-based processors.
+  EF_AMDGPU_MACH_AMDGCN_GFX600 = 0x020,
+  EF_AMDGPU_MACH_AMDGCN_GFX601 = 0x021,
+  EF_AMDGPU_MACH_AMDGCN_GFX700 = 0x022,
+  EF_AMDGPU_MACH_AMDGCN_GFX701 = 0x023,
+  EF_AMDGPU_MACH_AMDGCN_GFX702 = 0x024,
+  EF_AMDGPU_MACH_AMDGCN_GFX703 = 0x025,
+  EF_AMDGPU_MACH_AMDGCN_GFX704 = 0x026,
+  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X27 = 0x027,
+  EF_AMDGPU_MACH_AMDGCN_GFX801 = 0x028,
+  EF_AMDGPU_MACH_AMDGCN_GFX802 = 0x029,
+  EF_AMDGPU_MACH_AMDGCN_GFX803 = 0x02a,
+  EF_AMDGPU_MACH_AMDGCN_GFX810 = 0x02b,
+  EF_AMDGPU_MACH_AMDGCN_GFX900 = 0x02c,
+  EF_AMDGPU_MACH_AMDGCN_GFX902 = 0x02d,
+  EF_AMDGPU_MACH_AMDGCN_GFX904 = 0x02e,
+  EF_AMDGPU_MACH_AMDGCN_GFX906 = 0x02f,
+  EF_AMDGPU_MACH_AMDGCN_GFX908 = 0x030,
+  EF_AMDGPU_MACH_AMDGCN_GFX909 = 0x031,
+  EF_AMDGPU_MACH_AMDGCN_GFX90C = 0x032,
+  EF_AMDGPU_MACH_AMDGCN_GFX1010 = 0x033,
+  EF_AMDGPU_MACH_AMDGCN_GFX1011 = 0x034,
+  EF_AMDGPU_MACH_AMDGCN_GFX1012 = 0x035,
+  EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036,
+  EF_AMDGPU_MACH_AMDGCN_GFX1031 = 0x037,
+  EF_AMDGPU_MACH_AMDGCN_GFX1032 = 0x038,
+  EF_AMDGPU_MACH_AMDGCN_GFX1033 = 0x039,
+  EF_AMDGPU_MACH_AMDGCN_GFX602 = 0x03a,
+  EF_AMDGPU_MACH_AMDGCN_GFX705 = 0x03b,
+  EF_AMDGPU_MACH_AMDGCN_GFX805 = 0x03c,
+  EF_AMDGPU_MACH_AMDGCN_GFX1035 = 0x03d,
+  EF_AMDGPU_MACH_AMDGCN_GFX1034 = 0x03e,
+  EF_AMDGPU_MACH_AMDGCN_GFX90A = 0x03f,
+  EF_AMDGPU_MACH_AMDGCN_GFX940 = 0x040,
+  EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041,
+  EF_AMDGPU_MACH_AMDGCN_GFX1013 = 0x042,
+  EF_AMDGPU_MACH_AMDGCN_RESERVED_0X43 = 0x043,
+  EF_AMDGPU_MACH_AMDGCN_GFX1103 = 0x044,
+  EF_AMDGPU_MACH_AMDGCN_GFX1036 = 0x045,
+  EF_AMDGPU_MACH_AMDGCN_GFX1101 = 0x046,
+  EF_AMDGPU_MACH_AMDGCN_GFX1102 = 0x047,
+
+  // First/last AMDGCN-based processors.
   EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600,
-  EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX90A,
+  EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX1102,
 
   // Indicates if the "xnack" target feature is enabled for all code contained
   // in the object.
diff --git a/src/amdhip.def b/src/amdhip.def
index 9a1d3e6e..ffaff7f5 100644
--- a/src/amdhip.def
+++ b/src/amdhip.def
@@ -103,6 +103,9 @@ hipMemPoolImportPointer
 hipArrayCreate
 hipArray3DCreate
 hipArrayDestroy
+hipArrayGetInfo
+hipArrayGetDescriptor
+hipArray3DGetDescriptor
 hipMallocArray
 hipMemAdvise
 hipMemAllocPitch
@@ -190,6 +193,7 @@ hipStreamCreate
 hipStreamCreateWithFlags
 hipStreamCreateWithPriority
 hipStreamDestroy
+hipStreamGetDevice
 hipStreamGetFlags
 hipStreamQuery
 hipStreamSynchronize
diff --git a/src/hip_code_object.cpp b/src/hip_code_object.cpp
index 0d5fb1b4..dd9de637 100644
--- a/src/hip_code_object.cpp
+++ b/src/hip_code_object.cpp
@@ -31,8 +31,6 @@ THE SOFTWARE.
#include "platform/program.hpp" #include -hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, - amd::HostQueue& queue, bool isAsync = false); hipError_t ihipFree(void* ptr); // forward declaration of methods required for managed variables hipError_t ihipMallocManaged(void** ptr, size_t size, unsigned int align = 0); @@ -172,6 +170,11 @@ static bool getProcName(uint32_t EFlags, std::string& proc_name, bool& xnackSupp sramEccSupported = false; proc_name = "gfx90c"; break; + case EF_AMDGPU_MACH_AMDGCN_GFX940: + xnackSupported = true; + sramEccSupported = true; + proc_name = "gfx940"; + break; case EF_AMDGPU_MACH_AMDGCN_GFX1010: xnackSupported = true; sramEccSupported = false; @@ -187,6 +190,11 @@ static bool getProcName(uint32_t EFlags, std::string& proc_name, bool& xnackSupp sramEccSupported = false; proc_name = "gfx1012"; break; + case EF_AMDGPU_MACH_AMDGCN_GFX1013: + xnackSupported = true; + sramEccSupported = false; + proc_name = "gfx1013"; + break; case EF_AMDGPU_MACH_AMDGCN_GFX1030: xnackSupported = false; sramEccSupported = false; @@ -207,6 +215,41 @@ static bool getProcName(uint32_t EFlags, std::string& proc_name, bool& xnackSupp sramEccSupported = false; proc_name = "gfx1033"; break; + case EF_AMDGPU_MACH_AMDGCN_GFX1034: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1034"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1035: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1035"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1036: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1036"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1100: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1100"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1101: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1101"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1102: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1102"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1103: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1103"; + break; default: return false; } @@ -590,10 +633,10 @@ hipError_t DynCO::initDynManagedVars(const std::string& managedVar) { it->second->setManagedVarInfo(pointer, dvar->size()); // copy initial value to the managed variable to the managed memory allocated - amd::HostQueue* queue = hip::getNullStream(); - if (queue != nullptr) { + hip::Stream* stream = hip::getNullStream(); + if (stream != nullptr) { status = ihipMemcpy(pointer, reinterpret_cast
(dvar->device_ptr()), dvar->size(), - hipMemcpyDeviceToDevice, *queue); + hipMemcpyDeviceToDevice, *stream); if (status != hipSuccess) { ClPrint(amd::LOG_ERROR, amd::LOG_API, "Status %d, failed to copy device ptr:%s", status, managedVar.c_str()); @@ -613,7 +656,7 @@ hipError_t DynCO::initDynManagedVars(const std::string& managedVar) { } // copy managed memory pointer to the managed device variable status = ihipMemcpy(reinterpret_cast
(dvar->device_ptr()), &pointer, dvar->size(), - hipMemcpyHostToDevice, *queue); + hipMemcpyHostToDevice, *stream); if (status != hipSuccess) { ClPrint(amd::LOG_ERROR, amd::LOG_API, "Status %d, failed to copy device ptr:%s", status, managedVar.c_str()); @@ -850,10 +893,10 @@ hipError_t StatCO::initStatManagedVarDevicePtr(int deviceId) { DeviceVar* dvar = nullptr; IHIP_RETURN_ONFAIL(var->getStatDeviceVar(&dvar, deviceId)); - amd::HostQueue* queue = g_devices.at(deviceId)->NullStream(); - if (queue != nullptr) { + hip::Stream* stream = g_devices.at(deviceId)->NullStream(); + if (stream != nullptr) { err = ihipMemcpy(reinterpret_cast
(dvar->device_ptr()), var->getManagedVarPtr(), - dvar->size(), hipMemcpyHostToDevice, *queue); + dvar->size(), hipMemcpyHostToDevice, *stream); } else { ClPrint(amd::LOG_ERROR, amd::LOG_API, "Host Queue is NULL"); return hipErrorInvalidResourceHandle; diff --git a/src/hip_context.cpp b/src/hip_context.cpp index 824e6959..f639d4ff 100644 --- a/src/hip_context.cpp +++ b/src/hip_context.cpp @@ -29,7 +29,7 @@ std::vector g_devices; namespace hip { thread_local TlsAggregator tls; -Device* host_device = nullptr; +amd::Context* host_context = nullptr; //init() is only to be called from the HIP_INIT macro only once bool init() { @@ -44,7 +44,7 @@ bool init() { if (!amd::Runtime::init()) { return false; } - LogPrintfInfo("Direct Dispatch: %d", AMD_DIRECT_DISPATCH); + ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Direct Dispatch: %d", AMD_DIRECT_DISPATCH); const std::vector& devices = amd::Device::getDevices(CL_DEVICE_TYPE_GPU, false); @@ -74,7 +74,7 @@ bool init() { if (CL_SUCCESS != hContext->create(nullptr)) { hContext->release(); } - host_device = new Device(hContext, -1); + host_context = hContext; PlatformState::instance().init(); return true; @@ -91,21 +91,21 @@ void setCurrentDevice(unsigned int index) { amd::Os::setPreferredNumaNode(preferredNumaNode); } -amd::HostQueue* getQueue(hipStream_t stream) { +hip::Stream* getStream(hipStream_t stream) { if (stream == nullptr) { return getNullStream(); } else { - amd::HostQueue* queue = reinterpret_cast(stream)->asHostQueue(); - if (!(reinterpret_cast(stream)->Flags() & hipStreamNonBlocking)) { + hip::Stream* hip_stream = reinterpret_cast(stream); + if (!(hip_stream->Flags() & hipStreamNonBlocking)) { constexpr bool WaitNullStreamOnly = true; - iHipWaitActiveStreams(queue, WaitNullStreamOnly); + iHipWaitActiveStreams(hip_stream, WaitNullStreamOnly); } - return queue; + return hip_stream; } } // ================================================================================================ -amd::HostQueue* getNullStream(amd::Context& ctx) { +hip::Stream* getNullStream(amd::Context& ctx) { for (auto& it : g_devices) { if (it->asContext() == &ctx) { return it->NullStream(); @@ -113,7 +113,7 @@ amd::HostQueue* getNullStream(amd::Context& ctx) { } // If it's a pure SVM allocation with system memory access, then it shouldn't matter which device // runtime selects by default - if (hip::host_device->asContext() == &ctx) { + if (hip::host_context == &ctx) { // Return current... return getNullStream(); } @@ -131,7 +131,7 @@ int getDeviceID(amd::Context& ctx) { } // ================================================================================================ -amd::HostQueue* getNullStream() { +hip::Stream* getNullStream() { Device* device = getCurrentDevice(); return device ? 
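Note: getProcName() above is the runtime's decode table from a code object's ELF e_flags machine field to a gfx ISA name plus its xnack/sramecc capability. A stand-alone sketch of the same decode, for illustration only — the ELF64 e_flags offset (48) and the conventional EF_AMDGPU_MACH mask (0x0ff) come from the ELF/AMDGPU ABI, not from this patch, and only a few table entries are shown:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Hypothetical helper mirroring a few entries of the table added above.
    static const char* MachToName(uint32_t mach) {
      switch (mach) {
        case 0x040: return "gfx940";   // EF_AMDGPU_MACH_AMDGCN_GFX940
        case 0x042: return "gfx1013";  // EF_AMDGPU_MACH_AMDGCN_GFX1013
        case 0x041: return "gfx1100";  // EF_AMDGPU_MACH_AMDGCN_GFX1100
        case 0x047: return "gfx1102";  // EF_AMDGPU_MACH_AMDGCN_GFX1102
        default:    return "unknown";
      }
    }

    static void PrintCodeObjectTarget(const uint8_t* elf_image) {
      uint32_t e_flags;
      memcpy(&e_flags, elf_image + 48, sizeof(e_flags));  // e_flags offset in Elf64_Ehdr
      printf("code object targets %s\n", MachToName(e_flags & 0x0ffu));
    }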
diff --git a/src/hip_context.cpp b/src/hip_context.cpp
index 824e6959..f639d4ff 100644
--- a/src/hip_context.cpp
+++ b/src/hip_context.cpp
@@ -29,7 +29,7 @@ std::vector<hip::Device*> g_devices;
 namespace hip {
 thread_local TlsAggregator tls;
-Device* host_device = nullptr;
+amd::Context* host_context = nullptr;
 //init() is only to be called from the HIP_INIT macro only once
 bool init() {
@@ -44,7 +44,7 @@ bool init() {
   if (!amd::Runtime::init()) {
     return false;
   }
-  LogPrintfInfo("Direct Dispatch: %d", AMD_DIRECT_DISPATCH);
+  ClPrint(amd::LOG_INFO, amd::LOG_INIT, "Direct Dispatch: %d", AMD_DIRECT_DISPATCH);

   const std::vector<amd::Device*>& devices = amd::Device::getDevices(CL_DEVICE_TYPE_GPU, false);
@@ -74,7 +74,7 @@ bool init() {
   if (CL_SUCCESS != hContext->create(nullptr)) {
     hContext->release();
   }
-  host_device = new Device(hContext, -1);
+  host_context = hContext;

   PlatformState::instance().init();
   return true;
@@ -91,21 +91,21 @@ void setCurrentDevice(unsigned int index) {
   amd::Os::setPreferredNumaNode(preferredNumaNode);
 }

-amd::HostQueue* getQueue(hipStream_t stream) {
+hip::Stream* getStream(hipStream_t stream) {
   if (stream == nullptr) {
     return getNullStream();
   } else {
-    amd::HostQueue* queue = reinterpret_cast<hip::Stream*>(stream)->asHostQueue();
-    if (!(reinterpret_cast<hip::Stream*>(stream)->Flags() & hipStreamNonBlocking)) {
+    hip::Stream* hip_stream = reinterpret_cast<hip::Stream*>(stream);
+    if (!(hip_stream->Flags() & hipStreamNonBlocking)) {
       constexpr bool WaitNullStreamOnly = true;
-      iHipWaitActiveStreams(queue, WaitNullStreamOnly);
+      iHipWaitActiveStreams(hip_stream, WaitNullStreamOnly);
     }
-    return queue;
+    return hip_stream;
   }
 }

 // ================================================================================================
-amd::HostQueue* getNullStream(amd::Context& ctx) {
+hip::Stream* getNullStream(amd::Context& ctx) {
   for (auto& it : g_devices) {
     if (it->asContext() == &ctx) {
       return it->NullStream();
@@ -113,7 +113,7 @@ amd::HostQueue* getNullStream(amd::Context& ctx) {
   }
   // If it's a pure SVM allocation with system memory access, then it shouldn't matter which device
   // runtime selects by default
-  if (hip::host_device->asContext() == &ctx) {
+  if (hip::host_context == &ctx) {
     // Return current...
     return getNullStream();
   }
@@ -131,7 +131,7 @@ int getDeviceID(amd::Context& ctx) {
 }

 // ================================================================================================
-amd::HostQueue* getNullStream() {
+hip::Stream* getNullStream() {
   Device* device = getCurrentDevice();
   return device ? device->NullStream() : nullptr;
 }
diff --git a/src/hip_conversions.hpp b/src/hip_conversions.hpp
index 757ccc79..ef928225 100644
--- a/src/hip_conversions.hpp
+++ b/src/hip_conversions.hpp
@@ -168,6 +168,12 @@ hipArray_Format getCL2hipArrayFormat(const cl_channel_type type) {
   case CL_SIGNED_INT8:
     return HIP_AD_FORMAT_SIGNED_INT8;

+  case CL_UNSIGNED_INT16:
+    return HIP_AD_FORMAT_UNSIGNED_INT16;
+
+  case CL_SIGNED_INT16:
+    return HIP_AD_FORMAT_SIGNED_INT16;
+
   case CL_SIGNED_INT32:
     return HIP_AD_FORMAT_SIGNED_INT32;
diff --git a/src/hip_device.cpp b/src/hip_device.cpp
index fedd62e5..8782b6c3 100644
--- a/src/hip_device.cpp
+++ b/src/hip_device.cpp
@@ -26,25 +26,17 @@ namespace hip {

 // ================================================================================================
-amd::HostQueue* Device::NullStream(bool skip_alloc) {
-  amd::HostQueue* null_queue = null_stream_.asHostQueue(skip_alloc);
-  if (null_queue == nullptr) {
-    return nullptr;
+hip::Stream* Device::NullStream(bool skip_alloc) {
+  if (null_stream_ == nullptr && !skip_alloc) {
+    null_stream_ = new Stream(this, Stream::Priority::Normal, 0, true);
   }
-  // Wait for all active streams before executing commands on the default
-  iHipWaitActiveStreams(null_queue);
-  return null_queue;
-}
-
-// ================================================================================================
-Stream* Device::GetNullStream() {
-  amd::HostQueue* null_queue = null_stream_.asHostQueue();
-  if (null_queue == nullptr) {
+  if (null_stream_ == nullptr) {
     return nullptr;
   }
   // Wait for all active streams before executing commands on the default
-  iHipWaitActiveStreams(null_queue);
-  return &null_stream_;
+  iHipWaitActiveStreams(null_stream_);
+  return null_stream_;
 }

 // ================================================================================================
@@ -54,6 +46,18 @@ bool Device::Create() {
   if (default_mem_pool_ == nullptr) {
     return false;
   }
+
+  // Create graph memory pool
+  graph_mem_pool_ = new MemoryPool(this);
+  if (graph_mem_pool_ == nullptr) {
+    return false;
+  }
+
+  uint64_t max_size = std::numeric_limits<uint64_t>::max();
+  // Use maximum value to hold memory, because current implementation doesn't support VM
+  // Note: the call for the threshold is always successful
+  auto error = graph_mem_pool_->SetAttribute(hipMemPoolAttrReleaseThreshold, &max_size);
+
   // Current is default pool after device creation
   current_mem_pool_ = default_mem_pool_;
   return true;
@@ -79,7 +83,7 @@ void Device::RemoveMemoryPool(MemoryPool* pool) {
 bool Device::FreeMemory(amd::Memory* memory, Stream* stream) {
   amd::ScopedLock lock(lock_);
   // Search for memory in the entire list of pools
-  for (auto& it : mem_pools_) {
+  for (auto it : mem_pools_) {
     if (it->FreeMemory(memory, stream)) {
       return true;
     }
@@ -91,7 +95,7 @@ bool Device::FreeMemory(amd::Memory* memory, Stream* stream) {
 void Device::ReleaseFreedMemory(Stream* stream) {
   amd::ScopedLock lock(lock_);
   // Search for memory in the entire list of pools
-  for (auto& it : mem_pools_) {
+  for (auto it : mem_pools_) {
     it->ReleaseFreedMemory(stream);
   }
 }
@@ -100,20 +104,23 @@ void Device::ReleaseFreedMemory(Stream* stream) {
 void Device::RemoveStreamFromPools(Stream* stream) {
   amd::ScopedLock lock(lock_);
   // Update all pools with the destroyed stream
-  for (auto& it : mem_pools_) {
+  for (auto it : mem_pools_) {
     it->RemoveStream(stream);
   }
 }

 // ================================================================================================
 void Device::Reset() {
-  auto it = mem_pools_.begin();
-  while (it != mem_pools_.end()) {
-    auto current = it++;
-    (*current)->ReleaseAllMemory();
-    delete *current;
+  {
+    amd::ScopedLock lock(lock_);
+    auto it = mem_pools_.begin();
+    while (it != mem_pools_.end()) {
+      auto current = it++;
+      (*current)->ReleaseAllMemory();
+      delete *current;
+    }
+    mem_pools_.clear();
   }
-  mem_pools_.clear();
   flags_ = hipDeviceScheduleSpin;
   hip::Stream::destroyAllStreams(deviceId_);
   amd::MemObjMap::Purge(devices()[0]);
@@ -125,6 +132,14 @@ Device::~Device() {
   if (default_mem_pool_ != nullptr) {
     default_mem_pool_->release();
   }
+
+  if (graph_mem_pool_ != nullptr) {
+    graph_mem_pool_->release();
+  }
+
+  if (null_stream_ != nullptr) {
+    null_stream_->release();
+  }
 }
 }
diff --git a/src/hip_device_runtime.cpp b/src/hip_device_runtime.cpp
index def9bcb0..19bed5ef 100644
--- a/src/hip_device_runtime.cpp
+++ b/src/hip_device_runtime.cpp
@@ -512,9 +512,9 @@ hipError_t hipDeviceSetSharedMemConfig ( hipSharedMemConfig config ) {
 hipError_t hipDeviceSynchronize ( void ) {
   HIP_INIT_API(hipDeviceSynchronize);
-  amd::HostQueue* queue = hip::getNullStream();
+  hip::Stream* stream = hip::getNullStream();

-  if (!queue) {
+  if (!stream) {
     HIP_RETURN(hipErrorOutOfMemory);
   }
@@ -522,7 +522,7 @@ hipError_t hipDeviceSynchronize ( void ) {
     HIP_RETURN(hipErrorStreamCaptureUnsupported);
   }

-  queue->finish();
+  stream->finish();

   hip::Stream::syncNonBlockingStreams(hip::getCurrentDevice()->deviceId());
@@ -602,7 +602,7 @@ hipError_t hipSetDeviceFlags ( unsigned int flags ) {
     switch (scheduleFlag) {
       case hipDeviceScheduleAuto:
         // Current behavior is different from the spec, due to MT usage in runtime
-        if (hip::host_device->devices().size() >= std::thread::hardware_concurrency()) {
+        if (hip::host_context->devices().size() >= std::thread::hardware_concurrency()) {
          device->SetActiveWait(false);
          break;
        }
diff --git a/src/hip_event.cpp b/src/hip_event.cpp
index f556cabe..747b69ce 100644
--- a/src/hip_event.cpp
+++ b/src/hip_event.cpp
@@ -177,12 +177,12 @@ int64_t EventDD::time(bool getStartTs) const {
   }
 }

-hipError_t Event::streamWaitCommand(amd::Command*& command, amd::HostQueue* queue) {
+hipError_t Event::streamWaitCommand(amd::Command*& command, hip::Stream* stream) {
   amd::Command::EventWaitList eventWaitList;
   if (event_ != nullptr) {
     eventWaitList.push_back(event_);
   }
-  command = new amd::Marker(*queue, kMarkerDisableFlush, eventWaitList);
+  command = new amd::Marker(*stream, kMarkerDisableFlush, eventWaitList);

   if (command == NULL) {
     return hipErrorOutOfMemory;
@@ -196,17 +196,17 @@ hipError_t Event::enqueueStreamWaitCommand(hipStream_t stream, amd::Command* com
 }

 hipError_t Event::streamWait(hipStream_t stream, uint flags) {
-  amd::HostQueue* queue = hip::getQueue(stream);
+  hip::Stream* hip_stream = hip::getStream(stream);
   // Access to event_ object must be lock protected
   amd::ScopedLock lock(lock_);
-  if ((event_ == nullptr) || (event_->command().queue() == queue) || ready()) {
+  if ((event_ == nullptr) || (event_->command().queue() == hip_stream) || ready()) {
     return hipSuccess;
   }
   if (!event_->notifyCmdQueue()) {
     return hipErrorLaunchOutOfResources;
   }
   amd::Command* command;
-  hipError_t status = streamWaitCommand(command, queue);
+  hipError_t status = streamWaitCommand(command, hip_stream);
   if (status != hipSuccess) {
     return status;
   }
@@ -218,20 +218,19 @@ hipError_t Event::streamWait(hipStream_t stream, uint flags) {
   return hipSuccess;
 }

-hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* queue,
+hipError_t Event::recordCommand(amd::Command*& command, amd::HostQueue* stream,
                                 uint32_t ext_flags ) {
   if (command == nullptr) {
     int32_t releaseFlags = ((ext_flags == 0) ? flags : ext_flags) &
-        (hipEventReleaseToSystem | hipEventReleaseToDevice);
-    if (releaseFlags & hipEventReleaseToDevice) {
-      releaseFlags = amd::Device::kCacheStateAgent;
-    } else if (releaseFlags & hipEventReleaseToSystem) {
-      releaseFlags = amd::Device::kCacheStateSystem;
-    } else {
+        (hipEventReleaseToDevice | hipEventReleaseToSystem |
+         hipEventDisableSystemFence);
+    if (releaseFlags & hipEventDisableSystemFence) {
       releaseFlags = amd::Device::kCacheStateIgnore;
+    } else {
+      releaseFlags = amd::Device::kCacheStateInvalid;
     }
     // Always submit a EventMarker.
-    command = new hip::EventMarker(*queue, !kMarkerDisableFlush, true, releaseFlags);
+    command = new hip::EventMarker(*stream, !kMarkerDisableFlush, true, releaseFlags);
   }
   return hipSuccess;
 }
@@ -249,10 +248,10 @@ hipError_t Event::enqueueRecordCommand(hipStream_t stream, amd::Command* command
 }

 hipError_t Event::addMarker(hipStream_t stream, amd::Command* command, bool record) {
-  amd::HostQueue* queue = hip::getQueue(stream);
+  hip::Stream* hip_stream = hip::getStream(stream);
   // Keep the lock always at the beginning of this to avoid a race. SWDEV-277847
   amd::ScopedLock lock(lock_);
-  hipError_t status = recordCommand(command, queue);
+  hipError_t status = recordCommand(command, hip_stream);
   if (status != hipSuccess) {
     return hipSuccess;
   }
@@ -279,14 +278,21 @@ bool isValid(hipEvent_t event) {
 // ================================================================================================
 hipError_t ihipEventCreateWithFlags(hipEvent_t* event, unsigned flags) {
   unsigned supportedFlags = hipEventDefault | hipEventBlockingSync | hipEventDisableTiming |
-      hipEventReleaseToDevice | hipEventReleaseToSystem | hipEventInterprocess;
+      hipEventReleaseToDevice | hipEventReleaseToSystem |
+      hipEventInterprocess | hipEventDisableSystemFence;

-  const unsigned releaseFlags = (hipEventReleaseToDevice | hipEventReleaseToSystem);
+  const unsigned releaseFlags = (hipEventReleaseToDevice | hipEventReleaseToSystem |
+                                 hipEventDisableSystemFence);
   // can't set any unsupported flags.
-  // can't set both release flags
+  // can set only one of the release flags.
   // if hipEventInterprocess flag is set, then hipEventDisableTiming flag also must be set
   const bool illegalFlags = (flags & ~supportedFlags) ||
-      ((flags & releaseFlags) == releaseFlags) ||
+      ([](unsigned int num) {
+        unsigned int bitcount;
+        for (bitcount = 0; num; bitcount++) {
+          num &= num - 1;
+        }
+        return bitcount; } (flags & releaseFlags) > 1) ||
      ((flags & hipEventInterprocess) && !(flags & hipEventDisableTiming));
   if (!illegalFlags) {
     hip::Event* e = nullptr;
@@ -379,8 +385,8 @@ hipError_t hipEventRecord_common(hipEvent_t event, hipStream_t stream) {
     return hipErrorInvalidHandle;
   }
   hip::Event* e = reinterpret_cast<hip::Event*>(event);
-  amd::HostQueue* queue = hip::getQueue(stream);
-  if (g_devices[e->deviceId()]->devices()[0] != &queue->device()) {
+  hip::Stream* hip_stream = hip::getStream(stream);
+  if (g_devices[e->deviceId()]->devices()[0] != &hip_stream->device()) {
     return hipErrorInvalidHandle;
   }
   return e->addMarker(stream, nullptr, true);
@@ -404,9 +410,6 @@ hipError_t hipEventSynchronize(hipEvent_t event) {
     HIP_RETURN(hipErrorInvalidHandle);
   }

-  if (hip::Stream::StreamCaptureOngoing() == true) {
-    HIP_RETURN(hipErrorStreamCaptureUnsupported);
-  }
   hip::Event* e = reinterpret_cast<hip::Event*>(event);
   HIP_RETURN(e->synchronize());
 }
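Note: the immediately-invoked lambda in ihipEventCreateWithFlags is a population count (Kernighan's bit trick): now that hipEventDisableSystemFence joins the two release flags, "at most one of them" replaces the old "not both" test. A stand-alone sketch of the same check; the flag values below are placeholders for illustration, not the real HIP constants:

    #include <cassert>

    // Kernighan's trick: clearing the lowest set bit each iteration means
    // the loop count equals the number of set bits. (C++20's std::popcount
    // from <bit> would do the same.)
    static unsigned BitCount(unsigned int num) {
      unsigned int bitcount = 0;
      for (; num; ++bitcount) {
        num &= num - 1;  // clear the lowest set bit
      }
      return bitcount;
    }

    int main() {
      const unsigned kToDevice = 0x1, kToSystem = 0x2, kNoFence = 0x4;  // illustrative values
      const unsigned releaseFlags = kToDevice | kToSystem | kNoFence;
      assert(BitCount(kToDevice & releaseFlags) <= 1);                // accepted
      assert(BitCount((kToDevice | kToSystem) & releaseFlags) > 1);   // rejected
      return 0;
    }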
diff --git a/src/hip_event.hpp b/src/hip_event.hpp
index e08ea33f..91a8193d 100644
--- a/src/hip_event.hpp
+++ b/src/hip_event.hpp
@@ -78,9 +78,9 @@ typedef struct ihipIpcEventShmem_s {
 class EventMarker : public amd::Marker {
  public:
-  EventMarker(amd::HostQueue& queue, bool disableFlush, bool markerTs = false,
+  EventMarker(amd::HostQueue& stream, bool disableFlush, bool markerTs = false,
               int32_t scope = amd::Device::kCacheStateInvalid)
-      : amd::Marker(queue, disableFlush) {
+      : amd::Marker(stream, disableFlush) {
     profilingInfo_.enabled_ = true;
     profilingInfo_.callback_ = nullptr;
     profilingInfo_.marker_ts_ = markerTs;
@@ -116,11 +116,11 @@ class Event {
   virtual hipError_t synchronize();
   hipError_t elapsedTime(Event& eStop, float& ms);

-  virtual hipError_t streamWaitCommand(amd::Command*& command, amd::HostQueue* queue);
+  virtual hipError_t streamWaitCommand(amd::Command*& command, hip::Stream* stream);
   virtual hipError_t enqueueStreamWaitCommand(hipStream_t stream, amd::Command* command);
   virtual hipError_t streamWait(hipStream_t stream, uint flags);

-  virtual hipError_t recordCommand(amd::Command*& command, amd::HostQueue* queue,
+  virtual hipError_t recordCommand(amd::Command*& command, amd::HostQueue* stream,
                                    uint32_t flags = 0);
   virtual hipError_t enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record);
   hipError_t addMarker(hipStream_t stream, amd::Command* command, bool record);
@@ -175,7 +175,7 @@ class Event {

  protected:
   amd::Monitor lock_;
-  amd::HostQueue* stream_;
+  hip::Stream* stream_;
   amd::Event* event_;
   int device_id_;
   //! Flag to indicate hipEventRecord has not been called. This is needed for
@@ -224,7 +224,7 @@ class IPCEvent : public Event {
   hipError_t synchronize();
   hipError_t query();

-  hipError_t streamWaitCommand(amd::Command*& command, amd::HostQueue* queue);
+  hipError_t streamWaitCommand(amd::Command*& command, hip::Stream* stream);
   hipError_t enqueueStreamWaitCommand(hipStream_t stream, amd::Command* command);
   hipError_t streamWait(hipStream_t stream, uint flags);
diff --git a/src/hip_event_ipc.cpp b/src/hip_event_ipc.cpp
index 7385566b..706b3d44 100644
--- a/src/hip_event_ipc.cpp
+++ b/src/hip_event_ipc.cpp
@@ -102,8 +102,8 @@ hipError_t IPCEvent::synchronize() {
   return hipSuccess;
 }

-hipError_t IPCEvent::streamWaitCommand(amd::Command*& command, amd::HostQueue* queue) {
-  command = new amd::Marker(*queue, false);
+hipError_t IPCEvent::streamWaitCommand(amd::Command*& command, hip::Stream* stream) {
+  command = new amd::Marker(*stream, false);
   if (command == NULL) {
     return hipErrorOutOfMemory;
   }
@@ -125,12 +125,12 @@ hipError_t IPCEvent::enqueueStreamWaitCommand(hipStream_t stream, amd::Command*
 }

 hipError_t IPCEvent::streamWait(hipStream_t stream, uint flags) {
-  amd::HostQueue* queue = hip::getQueue(stream);
+  hip::Stream* hip_stream = hip::getStream(stream);
   amd::ScopedLock lock(lock_);

   if(query() != hipSuccess) {
     amd::Command* command;
-    hipError_t status = streamWaitCommand(command, queue);
+    hipError_t status = streamWaitCommand(command, hip_stream);
     if (status != hipSuccess) {
       return status;
     }
@@ -140,18 +140,17 @@ hipError_t IPCEvent::streamWait(hipStream_t stream, uint flags) {
   return hipSuccess;
 }

-hipError_t IPCEvent::recordCommand(amd::Command*& command, amd::HostQueue* queue, uint32_t flags) {
+hipError_t IPCEvent::recordCommand(amd::Command*& command, amd::HostQueue* stream, uint32_t flags) {
   bool unrecorded = isUnRecorded();
   if (unrecorded) {
-    command = new amd::Marker(*queue, kMarkerDisableFlush);
+    command = new amd::Marker(*stream, kMarkerDisableFlush);
   } else {
-    return Event::recordCommand(command, queue);
+    return Event::recordCommand(command, stream);
   }
   return hipSuccess;
 }

 hipError_t IPCEvent::enqueueRecordCommand(hipStream_t stream, amd::Command* command, bool record) {
-  amd::HostQueue* queue = hip::getQueue(stream);
   bool unrecorded = isUnRecorded();
   if (unrecorded) {
     amd::Event& tEvent = command->event();
diff --git a/src/hip_fatbin.cpp b/src/hip_fatbin.cpp
index bafc7436..c2b7ff75 100644
--- a/src/hip_fatbin.cpp
+++ b/src/hip_fatbin.cpp
@@ -23,13 +23,15 @@ FatBinaryInfo::FatBinaryInfo(const char* fname, const void* image) : fdesc_(amd:
     fname_ = std::string();
   }

-  fatbin_dev_info_.resize(g_devices.size());
+  fatbin_dev_info_.resize(g_devices.size(), nullptr);
 }

 FatBinaryInfo::~FatBinaryInfo() {
-  for (auto& fbd: fatbin_dev_info_) {
-    delete fbd;
+  for (auto* fbd: fatbin_dev_info_) {
+    if (fbd != nullptr) {
+      delete fbd;
+    }
   }

   if (fdesc_ > 0) {
@@ -298,6 +300,10 @@ hipError_t FatBinaryInfo::AddDevProgram(const int device_id) {
   DeviceIdCheck(device_id);
   FatBinaryDeviceInfo* fbd_info = fatbin_dev_info_[device_id];
+  if (fbd_info == nullptr) {
+    return hipErrorInvalidKernelFile;
+  }
+
   // If fat binary was already added, skip this step and return success
   if (fbd_info->add_dev_prog_ == false) {
     amd::Context* ctx = g_devices[device_id]->asContext();
diff --git a/src/hip_gl.cpp b/src/hip_gl.cpp
index 216a2cb4..ce692753 100644
--- a/src/hip_gl.cpp
+++ b/src/hip_gl.cpp
@@ -637,13 +637,12 @@ hipError_t hipGraphicsMapResources(int count, hipGraphicsResource_t* resources,
     HIP_RETURN(hipErrorUnknown);
   }

-  amd::HostQueue* queue = hip::getQueue(stream);
-  if (nullptr == queue) {
+  hip::Stream* hip_stream = hip::getStream(stream);
+  if (nullptr == hip_stream) {
     HIP_RETURN(hipErrorUnknown);
   }

-  amd::HostQueue& hostQueue = *queue;
-  if (!hostQueue.context().glenv() || !hostQueue.context().glenv()->isAssociated()) {
+  if (!hip_stream->context().glenv() || !hip_stream->context().glenv()->isAssociated()) {
     LogWarning("\"amdContext\" is not created from GL context or share list");
     HIP_RETURN(hipErrorUnknown);
   }
@@ -658,7 +657,7 @@ hipError_t hipGraphicsMapResources(int count, hipGraphicsResource_t* resources,

   //! Now create command and enqueue
   amd::AcquireExtObjectsCommand* command = new amd::AcquireExtObjectsCommand(
-      hostQueue, nullWaitList, count, memObjects, CL_COMMAND_ACQUIRE_GL_OBJECTS);
+      *hip_stream, nullWaitList, count, memObjects, CL_COMMAND_ACQUIRE_GL_OBJECTS);
   if (command == nullptr) {
     HIP_RETURN(hipErrorUnknown);
   }
@@ -712,13 +711,12 @@ hipError_t hipGraphicsUnmapResources(int count, hipGraphicsResource_t* resources
   }

   // Wait for the current host queue
-  hip::getQueue(stream)->finish();
+  hip::getStream(stream)->finish();

-  amd::HostQueue* queue = hip::getQueue(stream);
-  if (nullptr == queue) {
+  hip::Stream* hip_stream = hip::getStream(stream);
+  if (nullptr == hip_stream) {
     HIP_RETURN(hipErrorUnknown);
   }
-  amd::HostQueue& hostQueue = *queue;

   std::vector<amd::Memory*> memObjects;
   hipError_t err = hipSetInteropObjects(count, reinterpret_cast<void**>(resources), memObjects);
@@ -730,7 +728,7 @@ hipError_t hipGraphicsUnmapResources(int count, hipGraphicsResource_t* resources

   // Now create command and enqueue
   amd::ReleaseExtObjectsCommand* command = new amd::ReleaseExtObjectsCommand(
-      hostQueue, nullWaitList, count, memObjects, CL_COMMAND_RELEASE_GL_OBJECTS);
+      *hip_stream, nullWaitList, count, memObjects, CL_COMMAND_RELEASE_GL_OBJECTS);
   if (command == nullptr) {
     HIP_RETURN(hipErrorUnknown);
   }
diff --git a/src/hip_graph.cpp b/src/hip_graph.cpp
index 4b5d9e15..fddc49b0 100644
--- a/src/hip_graph.cpp
+++ b/src/hip_graph.cpp
@@ -36,11 +36,16 @@ inline hipError_t ihipGraphAddNode(hipGraphNode_t graphNode, hipGraph_t graph,
                                    const hipGraphNode_t* pDependencies, size_t numDependencies,
                                    bool capture = true) {
   graph->AddNode(graphNode);
+  std::unordered_set<hipGraphNode_t> DuplicateDep;
   for (size_t i = 0; i < numDependencies; i++) {
     if ((!hipGraphNode::isNodeValid(pDependencies[i])) ||
         (graph != pDependencies[i]->GetParentGraph())) {
       return hipErrorInvalidValue;
     }
+    if (DuplicateDep.find(pDependencies[i]) != DuplicateDep.end()) {
+      return hipErrorInvalidValue;
+    }
+    DuplicateDep.insert(pDependencies[i]);
     pDependencies[i]->AddEdge(graphNode);
   }
   if (capture == false) {
@@ -1210,7 +1215,8 @@ hipError_t hipGraphAddChildGraphNode(hipGraphNode_t* pGraphNode, hipGraph_t grap
   HIP_RETURN(status);
 }

-hipError_t ihipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph) {
+hipError_t ihipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph,
+                                uint64_t flags = 0) {
   if (pGraphExec == nullptr || graph == nullptr) {
     HIP_RETURN(hipErrorInvalidValue);
   }
@@ -1227,7 +1233,8 @@ hipError_t ihipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph) {
   clonedGraph->LevelOrder(levelOrder);
   clonedGraph->GetUserObjs(graphExeUserObj);
   *pGraphExec =
-      new hipGraphExec(levelOrder, parallelLists, nodeWaitLists, clonedNodes, graphExeUserObj);
+      new hipGraphExec(levelOrder, parallelLists, nodeWaitLists, clonedNodes,
+                       graphExeUserObj, flags);
   if (*pGraphExec != nullptr) {
     return (*pGraphExec)->Init();
   } else {
@@ -1242,7 +1249,7 @@ hipError_t hipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph,
 }

 hipError_t hipGraphInstantiateWithFlags(hipGraphExec_t* pGraphExec, hipGraph_t graph,
-                                        unsigned long long flags) {
+                                        unsigned long long flags = 0) {
   HIP_INIT_API(hipGraphInstantiateWithFlags, pGraphExec, graph, flags);
   if (pGraphExec == nullptr || graph == nullptr) {
     HIP_RETURN(hipErrorInvalidValue);
   }
@@ -1674,13 +1681,19 @@ hipError_t hipStreamUpdateCaptureDependencies(hipStream_t stream, hipGraphNode_t
   if (s->GetCaptureStatus() == hipStreamCaptureStatusNone) {
     HIP_RETURN(hipErrorIllegalState);
   }
-  if ((numDependencies > 0 && dependencies == nullptr) ||
+  if ((s->GetCaptureGraph()->GetNodeCount() < numDependencies) ||
+      (numDependencies > 0 && dependencies == nullptr) ||
       (flags != 0 && flags != hipStreamAddCaptureDependencies &&
        flags != hipStreamSetCaptureDependencies)) {
     HIP_RETURN(hipErrorInvalidValue);
   }
   std::vector<hipGraphNode_t> depNodes;
+  const std::vector<Node>& graphNodes = s->GetCaptureGraph()->GetNodes();
   for (int i = 0; i < numDependencies; i++) {
+    if ((dependencies[i] == nullptr) ||
+        std::find(std::begin(graphNodes), std::end(graphNodes), dependencies[i]) == std::end(graphNodes)) {
+      HIP_RETURN(hipErrorInvalidValue);
+    }
     depNodes.push_back(dependencies[i]);
   }
   if (flags == hipStreamAddCaptureDependencies) {
@@ -1700,7 +1713,7 @@ hipError_t hipGraphRemoveDependencies(hipGraph_t graph, const hipGraphNode_t* fr
   }
   for (size_t i = 0; i < numDependencies; i++) {
     if (to[i]->GetParentGraph() != graph || from[i]->GetParentGraph() != graph ||
-        from[i]->RemoveEdge(to[i]) == false) {
+        from[i]->RemoveUpdateEdge(to[i]) == false) {
       HIP_RETURN(hipErrorInvalidValue);
     }
   }
@@ -2203,54 +2216,63 @@ hipError_t hipDeviceGetGraphMemAttribute(int device, hipGraphMemAttributeType at
   if ((static_cast<size_t>(device) >= g_devices.size()) || device < 0 || value == nullptr) {
     HIP_RETURN(hipErrorInvalidDevice);
   }
-  // later use this to access memory pool
-  auto* deviceHandle = g_devices[device]->devices()[0];
+  hipError_t result = hipErrorInvalidValue;
   switch (attr) {
     case hipGraphMemAttrUsedMemCurrent:
-      *reinterpret_cast(value) = 0;
+      result = g_devices[device]->GetGraphMemoryPool()->GetAttribute(
+          hipMemPoolAttrUsedMemCurrent, value);
       break;
     case hipGraphMemAttrUsedMemHigh:
-      *reinterpret_cast(value) = 0;
+      result = g_devices[device]->GetGraphMemoryPool()->GetAttribute(
+          hipMemPoolAttrUsedMemHigh, value);
       break;
     case hipGraphMemAttrReservedMemCurrent:
-      *reinterpret_cast(value) = 0;
+      result = g_devices[device]->GetGraphMemoryPool()->GetAttribute(
+          hipMemPoolAttrReservedMemCurrent, value);
       break;
     case hipGraphMemAttrReservedMemHigh:
-      *reinterpret_cast(value) = 0;
+      result = g_devices[device]->GetGraphMemoryPool()->GetAttribute(
+          hipMemPoolAttrReservedMemHigh, value);
       break;
     default:
-      return HIP_RETURN(hipErrorInvalidValue);
+      break;
   }
-  return HIP_RETURN(hipSuccess);
+  return HIP_RETURN(result);
 }

+// ================================================================================================
 hipError_t hipDeviceSetGraphMemAttribute(int device, hipGraphMemAttributeType attr, void* value) {
   HIP_INIT_API(hipDeviceSetGraphMemAttribute, device, attr, value);
   if ((static_cast<size_t>(device) >= g_devices.size()) || device < 0 || value == nullptr) {
     HIP_RETURN(hipErrorInvalidDevice);
   }
-  // later use this to access memory pool
-  auto* deviceHandle = g_devices[device]->devices()[0];
+  hipError_t result = hipErrorInvalidValue;
   switch (attr) {
     case hipGraphMemAttrUsedMemHigh:
+      result = g_devices[device]->GetGraphMemoryPool()->SetAttribute(
+          hipMemPoolAttrUsedMemHigh, value);
       break;
     case hipGraphMemAttrReservedMemHigh:
+      result = g_devices[device]->GetGraphMemoryPool()->SetAttribute(
+          hipMemPoolAttrReservedMemHigh, value);
       break;
     default:
-      return HIP_RETURN(hipErrorInvalidValue);
+      break;
   }
-  return HIP_RETURN(hipSuccess);
+  return HIP_RETURN(result);
 }

+// ================================================================================================
 hipError_t hipDeviceGraphMemTrim(int device) {
   HIP_INIT_API(hipDeviceGraphMemTrim, device);
   if ((static_cast<size_t>(device) >= g_devices.size()) || device < 0) {
     HIP_RETURN(hipErrorInvalidDevice);
   }
-  // not implemented yet
+  g_devices[device]->GetGraphMemoryPool()->TrimTo(0);
   return HIP_RETURN(hipSuccess);
 }

+// ================================================================================================
 hipError_t hipUserObjectCreate(hipUserObject_t* object_out, void* ptr, hipHostFn_t destroy,
                                unsigned int initialRefcount, unsigned int flags) {
   HIP_INIT_API(hipUserObjectCreate, object_out, ptr, destroy, initialRefcount, flags);
@@ -2275,6 +2297,10 @@ hipError_t hipUserObjectRelease(hipUserObject_t object, unsigned int count) {
   if (object->referenceCount() < count || !hipUserObject::isUserObjvalid(object)) {
     HIP_RETURN(hipSuccess);
   }
+  //! If all the counts are gone, no longer need the obj in the list
+  if (object->referenceCount() == count) {
+    hipUserObject::removeUSerObj(object);
+  }
   object->decreaseRefCount(count);
   HIP_RETURN(hipSuccess);
 }
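Note: with the per-device graph memory pool in place, the attribute getters and setters above stop returning hard-coded zeros and forward to the pool. A hedged usage sketch from the application side; it assumes the attribute values are reported as 64-bit byte counts, mirroring the hipMemPoolAttr* queries they forward to:

    #include <hip/hip_runtime.h>
    #include <cstdint>
    #include <cstdio>

    int main() {
      int device = 0;
      (void)hipSetDevice(device);
      uint64_t used = 0, reserved = 0;
      // Query how much memory graph launches currently hold on this device.
      (void)hipDeviceGetGraphMemAttribute(device, hipGraphMemAttrUsedMemCurrent, &used);
      (void)hipDeviceGetGraphMemAttribute(device, hipGraphMemAttrReservedMemCurrent, &reserved);
      printf("graph pool: %llu bytes used, %llu bytes reserved\n",
             (unsigned long long)used, (unsigned long long)reserved);
      // Release memory the graph pool is holding but not using (TrimTo(0) above).
      (void)hipDeviceGraphMemTrim(device);
      return 0;
    }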
diff --git a/src/hip_graph_helper.hpp b/src/hip_graph_helper.hpp
index 69780338..20d01165 100644
--- a/src/hip_graph_helper.hpp
+++ b/src/hip_graph_helper.hpp
@@ -5,9 +5,9 @@ hipError_t ihipMemcpy3D_validate(const hipMemcpy3DParms* p);
 hipError_t ihipMemcpy_validate(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind);

 hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, size_t sizeBytes,
-                             hipMemcpyKind kind, amd::HostQueue& queue, bool isAsync = false);
+                             hipMemcpyKind kind, hip::Stream& stream, bool isAsync = false);

-void ihipHtoHMemcpy(void* dst, const void* src, size_t sizeBytes, amd::HostQueue& queue);
+void ihipHtoHMemcpy(void* dst, const void* src, size_t sizeBytes, hip::Stream& stream);

 bool IsHtoHMemcpy(void* dst, const void* src, hipMemcpyKind kind);
@@ -26,19 +26,19 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f,
                                    uint32_t globalWorkSizeX, uint32_t globalWorkSizeY,
                                    uint32_t globalWorkSizeZ, uint32_t blockDimX, uint32_t blockDimY,
                                    uint32_t blockDimZ, uint32_t sharedMemBytes,
-                                   amd::HostQueue* queue, void** kernelParams, void** extra,
+                                   hip::Stream* stream, void** kernelParams, void** extra,
                                    hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags,
                                    uint32_t params, uint32_t gridId, uint32_t numGrids,
                                    uint64_t prevGridSum, uint64_t allGridSum, uint32_t firstDevice);

 hipError_t ihipMemcpy3DCommand(amd::Command*& command, const hipMemcpy3DParms* p,
-                               amd::HostQueue* queue);
+                               hip::Stream* stream);

 hipError_t ihipMemsetCommand(std::vector<amd::Command*>& commands, void* dst, int64_t value,
-                             size_t valueSize, size_t sizeBytes, amd::HostQueue* queue);
+                             size_t valueSize, size_t sizeBytes, hip::Stream* stream);

 hipError_t ihipMemset3DCommand(std::vector<amd::Command*>& commands, hipPitchedPtr pitchedDevPtr,
-                               int value, hipExtent extent, amd::HostQueue* queue, size_t elementSize = 1);
+                               int value, hipExtent extent, hip::Stream* stream, size_t elementSize = 1);

 hipError_t ihipMemcpySymbol_validate(const void* symbol, size_t sizeBytes, size_t offset,
                                      size_t& sym_size, hipDeviceptr_t& device_ptr);
diff --git a/src/hip_graph_internal.cpp b/src/hip_graph_internal.cpp
index 3ce74cab..f4060a0f 100644
--- a/src/hip_graph_internal.cpp
+++ b/src/hip_graph_internal.cpp
@@ -77,14 +77,14 @@ hipError_t hipGraphMemcpyNode1D::ValidateParams(void* dst, const void* src, size
     if (origDstMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0]) {
       return hipErrorInvalidValue;
     }
-    if (kind != hipMemcpyHostToDevice) {
+    if ((kind != hipMemcpyHostToDevice) && (kind != hipMemcpyDefault)) {
       return hipErrorInvalidValue;
     }
   } else if ((srcMemory != nullptr) && (dstMemory == nullptr)) {  // device to host
     if (origSrcMemory->getContext().devices()[0] != srcMemory->getContext().devices()[0]) {
       return hipErrorInvalidValue;
     }
-    if (kind != hipMemcpyDeviceToHost) {
+    if ((kind != hipMemcpyDeviceToHost) && (kind != hipMemcpyDefault)) {
       return hipErrorInvalidValue;
     }
   } else if ((srcMemory != nullptr) && (dstMemory != nullptr)) {
@@ -98,56 +98,6 @@ hipError_t hipGraphMemcpyNode1D::ValidateParams(void* dst, const void* src, size
   return hipSuccess;
 }

-hipError_t hipGraphMemcpyNode1D::SetCommandParams(void* dst, const void* src, size_t count,
-                                                  hipMemcpyKind kind) {
-  hipError_t status = ihipMemcpy_validate(dst, src, count, kind);
-  if (status != hipSuccess) {
-    return status;
-  }
-  size_t sOffsetOrig = 0;
-  amd::Memory* origSrcMemory = getMemoryObject(src, sOffsetOrig);
-  size_t dOffsetOrig = 0;
-  amd::Memory* origDstMemory = getMemoryObject(dst, dOffsetOrig);
-
-  size_t sOffset = 0;
-  amd::Memory* srcMemory = getMemoryObject(src, sOffset);
-  size_t dOffset = 0;
-  amd::Memory* dstMemory = getMemoryObject(dst, dOffset);
-
-  if ((srcMemory == nullptr) && (dstMemory != nullptr)) {
-    if (origDstMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0]) {
-      return hipErrorInvalidValue;
-    }
-    amd::WriteMemoryCommand* command = reinterpret_cast<amd::WriteMemoryCommand*>(commands_[0]);
-    command->setParams(*dstMemory->asBuffer(), dOffset, count, src);
-  } else if ((srcMemory != nullptr) && (dstMemory == nullptr)) {
-    if (origSrcMemory->getContext().devices()[0] != srcMemory->getContext().devices()[0]) {
-      return hipErrorInvalidValue;
-    }
-    amd::ReadMemoryCommand* command = reinterpret_cast<amd::ReadMemoryCommand*>(commands_[0]);
-    command->setParams(*srcMemory->asBuffer(), sOffset, count, dst);
-  } else if ((srcMemory != nullptr) && (dstMemory != nullptr)) {
-    if (origDstMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0]) {
-      return hipErrorInvalidValue;
-    }
-    if (origSrcMemory->getContext().devices()[0] != srcMemory->getContext().devices()[0]) {
-      return hipErrorInvalidValue;
-    }
-    amd::CopyMemoryP2PCommand* command = reinterpret_cast<amd::CopyMemoryP2PCommand*>(commands_[0]);
-    command->setParams(*srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset, dOffset, count);
-    // Make sure runtime has valid memory for the command execution. P2P access
-    // requires page table mapping on the current device to another GPU memory
-    if (!static_cast<amd::CopyMemoryP2PCommand*>(command)->validateMemory()) {
-      delete command;
-      return hipErrorInvalidValue;
-    }
-  } else {
-    amd::CopyMemoryCommand* command = reinterpret_cast<amd::CopyMemoryCommand*>(commands_[0]);
-    command->setParams(*srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset, dOffset, count);
-  }
-  return hipSuccess;
-}
-
 hipError_t hipGraphMemcpyNode::ValidateParams(const hipMemcpy3DParms* pNodeParams) {
   hipError_t status = ihipMemcpy3D_validate(pNodeParams);
   if (status != hipSuccess) {
@@ -297,185 +247,6 @@ hipError_t hipGraphMemcpyNode::ValidateParams(const hipMemcpy3DParms* pNodeParam
   return hipSuccess;
 }

-hipError_t hipGraphMemcpyNode::SetCommandParams(const hipMemcpy3DParms* pNodeParams) {
-  hipError_t status = ihipMemcpy3D_validate(pNodeParams);
-  if (status != hipSuccess) {
-    return status;
-  }
-  const HIP_MEMCPY3D pCopy = hip::getDrvMemcpy3DDesc(*pNodeParams);
-  // If {src/dst}MemoryType is hipMemoryTypeUnified, {src/dst}Device and {src/dst}Pitch specify the
-  // (unified virtual address space) base address of the source data and the bytes per row to apply.
-  // {src/dst}Array is ignored.
-  hipMemoryType srcMemoryType = pCopy.srcMemoryType;
-  if (srcMemoryType == hipMemoryTypeUnified) {
-    srcMemoryType =
-        amd::MemObjMap::FindMemObj(pCopy.srcDevice) ? hipMemoryTypeDevice : hipMemoryTypeHost;
-    if (srcMemoryType == hipMemoryTypeHost) {
-      // {src/dst}Host may be unitialized. Copy over {src/dst}Device into it if we detect system
-      // memory.
-      const_cast<HIP_MEMCPY3D*>(&pCopy)->srcHost = pCopy.srcDevice;
-    }
-  }
-  hipMemoryType dstMemoryType = pCopy.dstMemoryType;
-  if (dstMemoryType == hipMemoryTypeUnified) {
-    dstMemoryType =
-        amd::MemObjMap::FindMemObj(pCopy.dstDevice) ? hipMemoryTypeDevice : hipMemoryTypeHost;
-    if (srcMemoryType == hipMemoryTypeHost) {
-      const_cast<HIP_MEMCPY3D*>(&pCopy)->dstHost = pCopy.dstDevice;
-    }
-  }
-
-  // If {src/dst}MemoryType is hipMemoryTypeHost, check if the memory was prepinned.
-  // In that case upgrade the copy type to hipMemoryTypeDevice to avoid extra pinning.
-  if (srcMemoryType == hipMemoryTypeHost) {
-    amd::Memory* mem = amd::MemObjMap::FindMemObj(pCopy.srcHost);
-    srcMemoryType = mem ? hipMemoryTypeDevice : hipMemoryTypeHost;
-    if (srcMemoryType == hipMemoryTypeDevice) {
-      const_cast<HIP_MEMCPY3D*>(&pCopy)->srcDevice = const_cast<void*>(pCopy.srcHost);
-    }
-  }
-  if (dstMemoryType == hipMemoryTypeHost) {
-    amd::Memory* mem = amd::MemObjMap::FindMemObj(pCopy.dstHost);
-    dstMemoryType = mem ? hipMemoryTypeDevice : hipMemoryTypeHost;
-    if (dstMemoryType == hipMemoryTypeDevice) {
-      const_cast<HIP_MEMCPY3D*>(&pCopy)->dstDevice = const_cast<void*>(pCopy.dstDevice);
-    }
-  }
-
-  amd::Coord3D srcOrigin = {pCopy.srcXInBytes, pCopy.srcY, pCopy.srcZ};
-  amd::Coord3D dstOrigin = {pCopy.dstXInBytes, pCopy.dstY, pCopy.dstZ};
-  amd::Coord3D copyRegion = {pCopy.WidthInBytes, pCopy.Height, pCopy.Depth};
-
-  if ((srcMemoryType == hipMemoryTypeHost) && (dstMemoryType == hipMemoryTypeDevice)) {
-    // Host to Device.
-    amd::Memory* dstMemory;
-    amd::BufferRect srcRect;
-    amd::BufferRect dstRect;
-
-    status =
-        ihipMemcpyHtoDValidate(pCopy.srcHost, pCopy.dstDevice, srcOrigin, dstOrigin, copyRegion,
-                               pCopy.srcPitch, pCopy.srcPitch * pCopy.srcHeight, pCopy.dstPitch,
-                               pCopy.dstPitch * pCopy.dstHeight, dstMemory, srcRect, dstRect);
-    if (status != hipSuccess) {
-      return status;
-    }
-    amd::WriteMemoryCommand* command = reinterpret_cast<amd::WriteMemoryCommand*>(commands_[0]);
-    command->setParams(*dstMemory, {dstRect.start_, 0, 0}, copyRegion, pCopy.srcHost, dstRect,
-                       srcRect);
-  } else if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeHost)) {
-    // Device to Host.
-    amd::Memory* srcMemory;
-    amd::BufferRect srcRect;
-    amd::BufferRect dstRect;
-    status =
-        ihipMemcpyDtoHValidate(pCopy.srcDevice, pCopy.dstHost, srcOrigin, dstOrigin, copyRegion,
-                               pCopy.srcPitch, pCopy.srcPitch * pCopy.srcHeight, pCopy.dstPitch,
-                               pCopy.dstPitch * pCopy.dstHeight, srcMemory, srcRect, dstRect);
-    if (status != hipSuccess) {
-      return status;
-    }
-    amd::ReadMemoryCommand* command = reinterpret_cast<amd::ReadMemoryCommand*>(commands_[0]);
-    command->setParams(*srcMemory, {srcRect.start_, 0, 0}, copyRegion, pCopy.dstHost, srcRect,
-                       dstRect);
-    command->setSource(*srcMemory);
-    command->setOrigin({srcRect.start_, 0, 0});
-    command->setSize(copyRegion);
-    command->setDestination(pCopy.dstHost);
-    command->setBufRect(srcRect);
-    command->setHostRect(dstRect);
-  } else if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeDevice)) {
-    // Device to Device.
-    amd::Memory* srcMemory;
-    amd::Memory* dstMemory;
-    amd::BufferRect srcRect;
-    amd::BufferRect dstRect;
-
-    status = ihipMemcpyDtoDValidate(pCopy.srcDevice, pCopy.dstDevice, srcOrigin, dstOrigin,
-                                    copyRegion, pCopy.srcPitch, pCopy.srcPitch * pCopy.srcHeight,
-                                    pCopy.dstPitch, pCopy.dstPitch * pCopy.dstHeight, srcMemory,
-                                    dstMemory, srcRect, dstRect);
-    if (status != hipSuccess) {
-      return status;
-    }
-    amd::CopyMemoryCommand* command = reinterpret_cast<amd::CopyMemoryCommand*>(commands_[0]);
-    command->setParams(*srcMemory, *dstMemory, {srcRect.start_, 0, 0}, {dstRect.start_, 0, 0},
-                       copyRegion, srcRect, dstRect);
-  } else if ((srcMemoryType == hipMemoryTypeHost) && (dstMemoryType == hipMemoryTypeArray)) {
-    amd::Image* dstImage;
-    amd::BufferRect srcRect;
-
-    status =
-        ihipMemcpyHtoAValidate(pCopy.srcHost, pCopy.dstArray, srcOrigin, dstOrigin, copyRegion,
-                               pCopy.srcPitch, pCopy.srcPitch * pCopy.srcHeight, dstImage, srcRect);
-    if (status != hipSuccess) {
-      return status;
-    }
-    amd::WriteMemoryCommand* command = reinterpret_cast<amd::WriteMemoryCommand*>(commands_[0]);
-    command->setParams(*dstImage, dstOrigin, copyRegion,
-                       static_cast<const char*>(pCopy.srcHost) + srcRect.start_, pCopy.srcPitch,
-                       pCopy.srcPitch * pCopy.srcHeight);
-  } else if ((srcMemoryType == hipMemoryTypeArray) && (dstMemoryType == hipMemoryTypeHost)) {
-    // Image to Host.
-    amd::Image* srcImage;
-    amd::BufferRect dstRect;
-
-    status =
-        ihipMemcpyAtoHValidate(pCopy.srcArray, pCopy.dstHost, srcOrigin, dstOrigin, copyRegion,
-                               pCopy.dstPitch, pCopy.dstPitch * pCopy.dstHeight, srcImage, dstRect);
-    if (status != hipSuccess) {
-      return status;
-    }
-    amd::ReadMemoryCommand* command = reinterpret_cast<amd::ReadMemoryCommand*>(commands_[0]);
-    command->setParams(*srcImage, srcOrigin, copyRegion,
-                       static_cast<char*>(pCopy.dstHost) + dstRect.start_, pCopy.dstPitch,
-                       pCopy.dstPitch * pCopy.dstHeight);
-  } else if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeArray)) {
-    // Device to Image.
-    amd::Image* dstImage;
-    amd::Memory* srcMemory;
-    amd::BufferRect dstRect;
-    amd::BufferRect srcRect;
-    status = ihipMemcpyDtoAValidate(pCopy.srcDevice, pCopy.dstArray, srcOrigin, dstOrigin,
-                                    copyRegion, pCopy.srcPitch, pCopy.srcPitch * pCopy.srcHeight,
-                                    dstImage, srcMemory, dstRect, srcRect);
-    if (status != hipSuccess) {
-      return status;
-    }
-    amd::CopyMemoryCommand* command = reinterpret_cast<amd::CopyMemoryCommand*>(commands_[0]);
-    command->setParams(*srcMemory, *dstImage, srcOrigin, dstOrigin, copyRegion, srcRect, dstRect);
-  } else if ((srcMemoryType == hipMemoryTypeArray) && (dstMemoryType == hipMemoryTypeDevice)) {
-    // Image to Device.
-    amd::BufferRect srcRect;
-    amd::BufferRect dstRect;
-    amd::Memory* dstMemory;
-    amd::Image* srcImage;
-    status = ihipMemcpyAtoDValidate(pCopy.srcArray, pCopy.dstDevice, srcOrigin, dstOrigin,
-                                    copyRegion, pCopy.dstPitch, pCopy.dstPitch * pCopy.dstHeight,
-                                    dstMemory, srcImage, srcRect, dstRect);
-    if (status != hipSuccess) {
-      return status;
-    }
-    amd::CopyMemoryCommand* command = reinterpret_cast<amd::CopyMemoryCommand*>(commands_[0]);
-    command->setParams(*srcImage, *dstMemory, srcOrigin, dstOrigin, copyRegion, srcRect, dstRect);
-  } else if ((srcMemoryType == hipMemoryTypeArray) && (dstMemoryType == hipMemoryTypeArray)) {
-    amd::Image* srcImage;
-    amd::Image* dstImage;
-
-    status = ihipMemcpyAtoAValidate(pCopy.srcArray, pCopy.dstArray, srcOrigin, dstOrigin,
-                                    copyRegion, srcImage, dstImage);
-    if (status != hipSuccess) {
-      return status;
-    }
-    amd::CopyMemoryCommand* command = reinterpret_cast<amd::CopyMemoryCommand*>(commands_[0]);
-    command->setParams(*srcImage, *dstImage, srcOrigin, dstOrigin, copyRegion);
-  } else {
-    return hipErrorInvalidValue;
-  }
-  return hipSuccess;
-}
-
-
 bool ihipGraph::isGraphValid(ihipGraph* pGraph) {
   amd::ScopedLock lock(graphSetLock_);
   if (graphSet_.find(pGraph) == graphSet_.end()) {
@@ -679,41 +450,38 @@ bool hipGraphExec::isGraphExecValid(hipGraphExec* pGraphExec) {
   return true;
 }

-hipError_t hipGraphExec::CreateQueues(size_t numQueues) {
-  parallelQueues_.reserve(numQueues);
-  for (size_t i = 0; i < numQueues; i++) {
-    amd::HostQueue* queue;
-    queue = new amd::HostQueue(
-        *hip::getCurrentDevice()->asContext(), *hip::getCurrentDevice()->devices()[0], 0,
-        amd::CommandQueue::RealTimeDisabled, amd::CommandQueue::Priority::Normal);
-
-    bool result = (queue != nullptr) ? queue->create() : false;
-    // Create a host queue
-    if (result) {
-      parallelQueues_.push_back(queue);
-    } else {
-      ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "[hipGraph] Failed to create host queue\n");
+hipError_t hipGraphExec::CreateStreams(uint32_t num_streams) {
+  parallel_streams_.reserve(num_streams);
+  for (uint32_t i = 0; i < num_streams; ++i) {
+    auto stream = new hip::Stream(hip::getCurrentDevice(),
+                                  hip::Stream::Priority::Normal, hipStreamNonBlocking);
+    if (stream == nullptr || !stream->Create()) {
+      if (stream != nullptr) {
+        stream->release();
+      }
+      ClPrint(amd::LOG_ERROR, amd::LOG_CODE, "[hipGraph] Failed to create parallel stream!\n");
       return hipErrorOutOfMemory;
     }
+    parallel_streams_.push_back(stream);
   }
   return hipSuccess;
 }

 hipError_t hipGraphExec::Init() {
   hipError_t status;
-  size_t reqNumQueues = 1;
+  size_t min_num_streams = 1;
   for (auto& node : levelOrder_) {
-    reqNumQueues += node->GetNumParallelQueues();
+    min_num_streams += node->GetNumParallelStreams();
   }
-  status = CreateQueues(parallelLists_.size() - 1 + reqNumQueues);
+  status = CreateStreams(parallelLists_.size() - 1 + min_num_streams);
   return status;
 }

 hipError_t FillCommands(std::vector<std::vector<Node>>& parallelLists,
                         std::unordered_map<Node, std::vector<Node>>& nodeWaitLists,
                         std::vector<Node>& levelOrder, std::vector<amd::Command*>& rootCommands,
-                        amd::Command*& endCommand, amd::HostQueue* queue) {
+                        amd::Command*& endCommand, hip::Stream* stream) {
   hipError_t status;
   for (auto& node : levelOrder) {
     // TODO: clone commands from next launch
@@ -763,7 +531,7 @@ hipError_t FillCommands(std::vector<std::vector<Node>>& parallelLists,
     }
   }
   if (!graphLastCmdWaitList.empty()) {
-    endCommand = new amd::Marker(*queue, false, graphLastCmdWaitList);
+    endCommand = new amd::Marker(*stream, false, graphLastCmdWaitList);
     if (endCommand == nullptr) {
       return hipErrorOutOfMemory;
     }
@@ -771,19 +539,19 @@ hipError_t FillCommands(std::vector<std::vector<Node>>& parallelLists,
   return hipSuccess;
 }

-void UpdateQueue(std::vector<std::vector<Node>>& parallelLists, amd::HostQueue*& queue,
+void UpdateStream(std::vector<std::vector<Node>>& parallelLists, hip::Stream* stream,
                  hipGraphExec* ptr) {
   int i = 0;
   for (const auto& list : parallelLists) {
     // first parallel list will be launched on the same queue as parent
     if (i == 0) {
       for (auto& node : list) {
-        node->SetQueue(queue, ptr);
+        node->SetStream(stream, ptr);
       }
-    } else {  // New queue for parallel branches
-      amd::HostQueue* paralleQueue = ptr->GetAvailableQueue();
+    } else {  // New stream for parallel branches
+      hip::Stream* stream = ptr->GetAvailableStreams();
       for (auto& node : list) {
-        node->SetQueue(paralleQueue, ptr);
+        node->SetStream(stream, ptr);
       }
     }
     i++;
@@ -792,15 +560,22 @@ void UpdateQueue(std::vector<std::vector<Node>>& parallelLists, amd::HostQueue*&

 hipError_t hipGraphExec::Run(hipStream_t stream) {
   hipError_t status;
-  amd::HostQueue* queue = hip::getQueue(stream);
-  if (queue == nullptr) {
+
+  if (hip::getStream(stream) == nullptr) {
     return hipErrorInvalidResourceHandle;
   }
-  UpdateQueue(parallelLists_, queue, this);
+  if (flags_ == hipGraphInstantiateFlagAutoFreeOnLaunch) {
+    if (!levelOrder_.empty()) {
+      levelOrder_[0]->GetParentGraph()->FreeAllMemory();
+    }
+  }
+  auto hip_stream = (stream == nullptr) ? hip::getCurrentDevice()->NullStream()
+                                        : reinterpret_cast<hip::Stream*>(stream);
+  UpdateStream(parallelLists_, hip_stream, this);
   std::vector<amd::Command*> rootCommands;
   amd::Command* endCommand = nullptr;
   status =
-      FillCommands(parallelLists_, nodeWaitLists_, levelOrder_, rootCommands, endCommand, queue);
+      FillCommands(parallelLists_, nodeWaitLists_, levelOrder_, rootCommands, endCommand, hip_stream);
   if (status != hipSuccess) {
     return status;
   }
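Note: Run() now honors hipGraphInstantiateFlagAutoFreeOnLaunch, which hipGraphInstantiateWithFlags threads through ihipGraphInstantiate into hipGraphExec::flags_. A minimal opt-in sketch from user code; it assumes `graph` was built or captured elsewhere:

    #include <hip/hip_runtime.h>

    // Instantiate with the auto-free flag so memory backing the graph's
    // allocation nodes is released before each relaunch, matching the
    // FreeAllMemory() call added to hipGraphExec::Run above.
    hipError_t LaunchAutoFree(hipGraph_t graph, hipStream_t stream) {
      hipGraphExec_t exec = nullptr;
      hipError_t err = hipGraphInstantiateWithFlags(&exec, graph,
                                                    hipGraphInstantiateFlagAutoFreeOnLaunch);
      if (err != hipSuccess) return err;
      err = hipGraphLaunch(exec, stream);
      if (err == hipSuccess) err = hipStreamSynchronize(stream);
      (void)hipGraphExecDestroy(exec);
      return err;
    }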
diff --git a/src/hip_graph_internal.hpp b/src/hip_graph_internal.hpp
index 6462535d..819125c1 100644
--- a/src/hip_graph_internal.hpp
+++ b/src/hip_graph_internal.hpp
@@ -38,9 +38,9 @@ typedef hipGraphNode* Node;
 hipError_t FillCommands(std::vector<std::vector<Node>>& parallelLists,
                         std::unordered_map<Node, std::vector<Node>>& nodeWaitLists,
                         std::vector<Node>& levelOrder, std::vector<amd::Command*>& rootCommands,
-                        amd::Command*& endCommand, amd::HostQueue* queue);
-void UpdateQueue(std::vector<std::vector<Node>>& parallelLists, amd::HostQueue*& queue,
-                 hipGraphExec* ptr);
+                        amd::Command*& endCommand, hip::Stream* stream);
+void UpdateStream(std::vector<std::vector<Node>>& parallelLists, hip::Stream* stream,
+                  hipGraphExec* ptr);

 struct hipUserObject : public amd::ReferenceCountedObject {
   typedef void (*UserCallbackDestructor)(void* data);
@@ -76,8 +76,8 @@ struct hipUserObject : public amd::ReferenceCountedObject {
   }

   static bool isUserObjvalid(hipUserObject* pUsertObj) {
-    amd::ScopedLock lock(UserObjectLock_);
-    if (ObjectSet_.find(pUsertObj) == ObjectSet_.end()) {
+    auto it = ObjectSet_.find(pUsertObj);
+    if (it == ObjectSet_.end()) {
       return false;
     }
     return true;
@@ -85,8 +85,9 @@ struct hipUserObject : public amd::ReferenceCountedObject {

   static void removeUSerObj(hipUserObject* pUsertObj) {
     amd::ScopedLock lock(UserObjectLock_);
-    if (ObjectSet_.find(pUsertObj) == ObjectSet_.end()) {
-      ObjectSet_.erase(pUsertObj);
+    auto it = ObjectSet_.find(pUsertObj);
+    if (it != ObjectSet_.end()) {
+      ObjectSet_.erase(it);
     }
   }
@@ -154,7 +155,7 @@ struct hipGraphNodeDOTAttribute {

 struct hipGraphNode : public hipGraphNodeDOTAttribute {
  protected:
-  amd::HostQueue* queue_;
+  hip::Stream* stream_ = nullptr;
   uint32_t level_;
   unsigned int id_;
   hipGraphNodeType type_;
@@ -221,13 +222,15 @@ struct hipGraphNode : public hipGraphNodeDOTAttribute {
     return true;
   }

-  amd::HostQueue* GetQueue() { return queue_; }
+  hip::Stream* GetQueue() { return stream_; }

-  virtual void SetQueue(amd::HostQueue* queue, hipGraphExec* ptr = nullptr) { queue_ = queue; }
+  virtual void SetStream(hip::Stream* stream, hipGraphExec* ptr = nullptr) {
+    stream_ = stream;
+  }

   /// Create amd::command for the graph node
-  virtual hipError_t CreateCommand(amd::HostQueue* queue) {
+  virtual hipError_t CreateCommand(hip::Stream* stream) {
     commands_.clear();
-    queue_ = queue;
+    stream_ = stream;
     return hipSuccess;
   }
   /// Return node unique ID
@@ -265,6 +268,9 @@ struct hipGraphNode : public hipGraphNodeDOTAttribute {
     dependencies_.erase(std::remove(dependencies_.begin(), dependencies_.end(), node),
                         dependencies_.end());
   }
+  void RemoveEdge(const Node& childNode) {
+    edges_.erase(std::remove(edges_.begin(), edges_.end(), childNode), edges_.end());
+  }
   /// Return graph node children
   const std::vector<Node>& GetEdges() const { return edges_; }
   /// Updates graph node children
@@ -280,6 +286,12 @@ struct hipGraphNode : public hipGraphNodeDOTAttribute {
       edge->UpdateEdgeLevel();
     }
   }
+  void ReduceEdgeLevel() {
+    for (auto edge : edges_) {
+      edge->SetLevel(std::min(edge->GetLevel(), GetLevel() + 1));
+      edge->ReduceEdgeLevel();
+    }
+  }
   /// Add edge, update parent node outdegree, child node indegree, level and dependency
   void AddEdge(const Node& childNode) {
     edges_.push_back(childNode);
@@ -290,7 +302,7 @@ struct hipGraphNode : public hipGraphNodeDOTAttribute {
     childNode->AddDependency(this);
   }
   /// Remove edge, update parent node outdegree, child node indegree, level and dependency
-  bool RemoveEdge(const Node& childNode) {
+  bool RemoveUpdateEdge(const Node& childNode) {
     // std::remove changes the end() hence saving it before hand for validation
     auto currEdgeEnd = edges_.end();
     auto it = std::remove(edges_.begin(), edges_.end(), childNode);
@@ -301,15 +313,20 @@ struct hipGraphNode : public hipGraphNodeDOTAttribute {
     edges_.erase(it, edges_.end());
     outDegree_--;
     childNode->SetInDegree(childNode->GetInDegree() - 1);
+    childNode->RemoveDependency(this);
     const std::vector<Node>& dependencies = childNode->GetDependencies();
     int32_t level = 0;
     int32_t parentLevel = 0;
+    uint32_t origLevel = 0;
     for (auto parent : dependencies) {
       parentLevel = parent->GetLevel();
       level = std::max(level, (parentLevel + 1));
     }
+    origLevel = childNode->GetLevel();
     childNode->SetLevel(level);
-    childNode->RemoveDependency(this);
+    if (level < origLevel) {
+      childNode->ReduceEdgeLevel();
+    }
     return true;
   }
   /// Get Runlist of the nodes embedded as part of the graphnode(e.g. ChildGraph)
@@ -323,7 +340,7 @@ struct hipGraphNode : public hipGraphNodeDOTAttribute {
       command->updateEventWaitList(waitList);
     }
   }
-  virtual size_t GetNumParallelQueues() { return 0; }
+  virtual size_t GetNumParallelStreams() { return 0; }
   /// Enqueue commands part of the node
   virtual void EnqueueCommands(hipStream_t stream) {
     // If the node is disabled it becomes empty node. To maintain ordering just enqueue marker.
@@ -332,8 +349,8 @@ struct hipGraphNode : public hipGraphNodeDOTAttribute {
         (type_ == hipGraphNodeTypeKernel || type_ == hipGraphNodeTypeMemcpy ||
          type_ == hipGraphNodeTypeMemset)) {
       amd::Command::EventWaitList waitList;
-      amd::HostQueue* queue = hip::getQueue(stream);
-      amd::Command* command = new amd::Marker(*queue, !kMarkerDisableFlush, waitList);
+      hip::Stream* hip_stream = hip::getStream(stream);
+      amd::Command* command = new amd::Marker(*hip_stream, !kMarkerDisableFlush, waitList);
       command->enqueue();
       command->release();
       return;
@@ -365,7 +382,7 @@ struct hipGraphNode : public hipGraphNodeDOTAttribute {
       fout << "\"" << fromNodeName << "\" -> \"" << toNodeName << "\"" << std::endl;
     }
   }
-  virtual std::string GetLabel() { return (std::to_string(id_) + "\n" + label_); }
+  virtual std::string GetLabel(hipGraphDebugDotFlags flag) { return (std::to_string(id_) + "\n" + label_); }
   unsigned int GetEnabled() const { return isEnabled_; }
   void SetEnabled(unsigned int isEnabled) { isEnabled_ = isEnabled; }
 };
@@ -389,17 +406,9 @@ struct ihipGraph {
       , device_(device) {
     amd::ScopedLock lock(graphSetLock_);
     graphSet_.insert(this);
-    if (original == nullptr) {
-      // Create memory pool, associated with the graph
-      mem_pool_ = new hip::MemoryPool(device);
-      uint64_t max_size = std::numeric_limits<uint64_t>::max();
-      // Note: the call for the threshold is always successful
-      auto error = mem_pool_->SetAttribute(hipMemPoolAttrReleaseThreshold, &max_size);
-    } else {
-      mem_pool_ = original->mem_pool_;
-      mem_pool_->retain();
-    }
-  };
+    mem_pool_ = device->GetGraphMemoryPool();
+    mem_pool_->retain();
+  }

   ~ihipGraph() {
     for (auto node : vertices_) {
@@ -414,7 +423,7 @@ struct ihipGraph {
       mem_pool_->release();
     }
-  };
+  }

   void AddManualNodeDuringCapture(hipGraphNode* node) { capturedNodes_.insert(node); }
@@ -478,7 +487,7 @@ struct ihipGraph {
   ihipGraph* clone() const;
   void GenerateDOT(std::ostream& fout, hipGraphDebugDotFlags flag) {
     fout << "subgraph cluster_" << GetID() << " {" << std::endl;
-    fout << "graph[style=\"dashed\" label=\"graph_" << GetID() << "\"];\n";
+    fout << "label=\"graph_" << GetID() << "\"graph[style=\"dashed\"];\n";
     for (auto node : vertices_) {
       node->GenerateDOTNode(GetID(), fout, flag);
     }
@@ -516,6 +525,10 @@ struct ihipGraph {
     }
     return false;
   }
+
+  void FreeAllMemory() {
+    mem_pool_->FreeAllMemory();
+  }
 };

 struct hipGraphExec {
@@ -523,26 +536,28 @@
   std::vector<std::vector<Node>> parallelLists_;
   // level order of the graph doesn't include nodes embedded as part of the child graph
   std::vector<Node> levelOrder_;
   std::unordered_map<Node, std::vector<Node>> nodeWaitLists_;
-  std::vector<amd::HostQueue*> parallelQueues_;
+  std::vector<hip::Stream*> parallel_streams_;
   uint currentQueueIndex_;
   std::unordered_map<Node, Node> clonedNodes_;
   amd::Command* lastEnqueuedCommand_;
   static std::unordered_set<hipGraphExec*> graphExecSet_;
   std::unordered_set<hipUserObject*> graphExeUserObj_;
   static amd::Monitor graphExecSetLock_;
-
+  uint64_t flags_ = 0;
  public:
   hipGraphExec(std::vector<Node>& levelOrder, std::vector<std::vector<Node>>& lists,
                std::unordered_map<Node, std::vector<Node>>& nodeWaitLists,
                std::unordered_map<Node, Node>& clonedNodes,
-               std::unordered_set<hipUserObject*>& userObjs)
+               std::unordered_set<hipUserObject*>& userObjs,
+               uint64_t flags = 0)
       : parallelLists_(lists),
         levelOrder_(levelOrder),
         nodeWaitLists_(nodeWaitLists),
         clonedNodes_(clonedNodes),
         lastEnqueuedCommand_(nullptr),
         graphExeUserObj_(userObjs),
-        currentQueueIndex_(0) {
+        currentQueueIndex_(0),
+        flags_(flags) {
     amd::ScopedLock lock(graphExecSetLock_);
     graphExecSet_.insert(this);
   }
@@ -550,8 +565,10 @@ struct hipGraphExec {
   ~hipGraphExec() {
     // new commands are launched for every launch they are destroyed as and when command is
     // terminated after it complete execution
-    for (auto queue : parallelQueues_) {
-      queue->release();
+    for (auto stream : parallel_streams_) {
+      if (stream != nullptr) {
+        stream->release();
+      }
     }
     for (auto it = clonedNodes_.begin(); it != clonedNodes_.end(); it++) delete it->second;
     amd::ScopedLock lock(graphExecSetLock_);
@@ -576,10 +593,10 @@ struct hipGraphExec {

   std::vector<Node>& GetNodes() { return levelOrder_; }

-  amd::HostQueue* GetAvailableQueue() { return parallelQueues_[currentQueueIndex_++]; }
+  hip::Stream* GetAvailableStreams() { return parallel_streams_[currentQueueIndex_++]; }
   void ResetQueueIndex() { currentQueueIndex_ = 0; }
   hipError_t Init();
-  hipError_t CreateQueues(size_t numQueues);
+  hipError_t CreateStreams(uint32_t num_streams);
   hipError_t Run(hipStream_t stream);
 };
@@ -608,20 +625,20 @@ struct hipChildGraphNode : public hipGraphNode {

   ihipGraph* GetChildGraph() { return childGraph_; }

-  size_t GetNumParallelQueues() {
+  size_t GetNumParallelStreams() {
     LevelOrder(childGraphlevelOrder_);
     size_t num = 0;
     for (auto& node : childGraphlevelOrder_) {
-      num += node->GetNumParallelQueues();
+      num += node->GetNumParallelStreams();
     }
     // returns total number of parallel queues required for child graph nodes to be launched
     // first parallel list will be launched on the same queue as parent
     return num + (parallelLists_.size() - 1);
   }

-  void SetQueue(amd::HostQueue* queue, hipGraphExec* ptr = nullptr) {
-    queue_ = queue;
-    UpdateQueue(parallelLists_, queue, ptr);
+  void SetStream(hip::Stream* stream, hipGraphExec* ptr = nullptr) {
+    stream_ = stream;
+    UpdateStream(parallelLists_, stream, ptr);
   }

   // For nodes that are dependent on the child graph node waitlist is the last node of the first
@@ -629,8 +646,8 @@ struct hipChildGraphNode : public hipGraphNode {
   std::vector<amd::Command*>& GetCommands() { return parallelLists_[0].back()->GetCommands(); }

   // Create child graph node commands and set waitlists
-  hipError_t CreateCommand(amd::HostQueue* queue) {
-    hipError_t status = hipGraphNode::CreateCommand(queue);
+  hipError_t CreateCommand(hip::Stream* stream) {
+    hipError_t status = hipGraphNode::CreateCommand(stream);
     if (status != hipSuccess) {
       return status;
     }
@@ -638,7 +655,7 @@ struct hipChildGraphNode : public hipGraphNode {
     std::vector<amd::Command*> rootCommands;
     amd::Command* endCommand = nullptr;
     status = FillCommands(parallelLists_, nodeWaitLists_, childGraphlevelOrder_, rootCommands,
-                          endCommand, queue);
+                          endCommand, stream);
     for (auto& cmd : rootCommands) {
       commands_.push_back(cmd);
     }
@@ -710,12 +727,31 @@ class hipGraphKernelNode : public hipGraphNode {
   unsigned int kernelAttrInUse_;

  public:
+  void PrintAttributes(std::ostream& out, hipGraphDebugDotFlags flag) {
+    out << "[";
+    out << "style";
+    out << "=\"";
+    out << style_;
+    (flag == hipGraphDebugDotFlagsKernelNodeParams ||
+     flag == hipGraphDebugDotFlagsKernelNodeAttributes) ?
+        out << "\n" : out << "\"";
+    out << "shape";
+    out << "=\"";
+    out << GetShape(flag);
+    out << "\"";
+    out << "label";
+    out << "=\"";
+    out << GetLabel(flag);
+    out << "\"";
+    out << "];";
+  }
+
   std::string GetLabel(hipGraphDebugDotFlags flag) {
     hipFunction_t func = getFunc(*pKernelParams_, ihipGetDevice());
     hip::DeviceFunc* function = hip::DeviceFunc::asFunction(func);
     std::string label;
-    if (flag == hipGraphDebugDotFlagsKernelNodeParams || flag == hipGraphDebugDotFlagsVerbose) {
-      char buffer[500];
+    char buffer[500];
+    if (flag == hipGraphDebugDotFlagsVerbose) {
       sprintf(buffer,
              "{\n%s\n| {ID | %d | %s\\<\\<\\<(%u,%u,%u),(%u,%u,%u),%u\\>\\>\\>}\n| {{node "
              "handle | func handle} | {%p | %p}}\n| {accessPolicyWindow | {base_ptr | num_bytes | "
@@ -729,8 +765,29 @@ class hipGraphKernelNode : public hipGraphNode {
               kernelAttr_.accessPolicyWindow.hitRatio, kernelAttr_.accessPolicyWindow.hitProp,
               kernelAttr_.accessPolicyWindow.missProp, kernelAttr_.cooperative);
       label = buffer;
-    } else {
-      label = std::to_string(GetID()) + "\n" + function->name();
+    }
+    else if (flag == hipGraphDebugDotFlagsKernelNodeAttributes) {
+      sprintf(buffer,
+              "{\n%s\n| {ID | %d | %s}\n"
+              "| {accessPolicyWindow | {base_ptr | num_bytes | "
+              "hitRatio | hitProp | missProp} |\n| {%p | %ld | %f | %d | %d}}\n| {cooperative | "
+              "%u}\n| {priority | 0}\n}",
+              label_.c_str(), GetID(), function->name().c_str(),
+              kernelAttr_.accessPolicyWindow.base_ptr, kernelAttr_.accessPolicyWindow.num_bytes,
+              kernelAttr_.accessPolicyWindow.hitRatio, kernelAttr_.accessPolicyWindow.hitProp,
+              kernelAttr_.accessPolicyWindow.missProp, kernelAttr_.cooperative);
+      label = buffer;
+    }
+    else if (flag == hipGraphDebugDotFlagsKernelNodeParams) {
+      sprintf(buffer, "%d\n%s\n\\<\\<\\<(%u,%u,%u),(%u,%u,%u),%u\\>\\>\\>",
+              GetID(), function->name().c_str(), pKernelParams_->gridDim.x,
+              pKernelParams_->gridDim.y, pKernelParams_->gridDim.z,
+              pKernelParams_->blockDim.x, pKernelParams_->blockDim.y,
+              pKernelParams_->blockDim.z, pKernelParams_->sharedMemBytes);
+      label = buffer;
+    }
+    else {
+      label = std::to_string(GetID()) + "\n" + function->name() + "\n";
     }
     return label;
   }
@@ -868,14 +925,14 @@ class hipGraphKernelNode : public hipGraphNode {
     return new hipGraphKernelNode(static_cast<const hipGraphKernelNode&>(*this));
   }

-  hipError_t CreateCommand(amd::HostQueue* queue) {
+  hipError_t CreateCommand(hip::Stream* stream) {
     hipFunction_t func = nullptr;
     hipError_t status = validateKernelParams(pKernelParams_, &func,
-                                             queue ? hip::getDeviceID(queue->context()) : -1);
+                                             stream ? hip::getDeviceID(stream->context()) : -1);
     if (hipSuccess != status) {
       return status;
     }
-    status = hipGraphNode::CreateCommand(queue);
+    status = hipGraphNode::CreateCommand(stream);
     if (status != hipSuccess) {
       return status;
     }
@@ -886,7 +943,7 @@ class hipGraphKernelNode : public hipGraphNode {
         pKernelParams_->gridDim.y * pKernelParams_->blockDim.y,
         pKernelParams_->gridDim.z * pKernelParams_->blockDim.z, pKernelParams_->blockDim.x,
         pKernelParams_->blockDim.y, pKernelParams_->blockDim.z, pKernelParams_->sharedMemBytes,
-        queue, pKernelParams_->kernelParams, pKernelParams_->extra, nullptr, nullptr, 0, 0, 0, 0, 0,
+        stream, pKernelParams_->kernelParams, pKernelParams_->extra, nullptr, nullptr, 0, 0, 0, 0, 0,
         0, 0);
     commands_.emplace_back(command);
     return status;
@@ -942,7 +999,7 @@ class hipGraphKernelNode : public hipGraphNode {
   }
   hipError_t GetAttrParams(hipKernelNodeAttrID attr, hipKernelNodeAttrValue* params) {
     // Get kernel attr params
-    if (kernelAttrInUse_ != attr) return hipErrorInvalidValue;
+    if (kernelAttrInUse_ != 0 && kernelAttrInUse_ != attr) return hipErrorInvalidValue;
     if (attr == hipKernelNodeAttributeAccessPolicyWindow) {
       params->accessPolicyWindow.base_ptr = kernelAttr_.accessPolicyWindow.base_ptr;
       params->accessPolicyWindow.hitProp = kernelAttr_.accessPolicyWindow.hitProp;
@@ -979,22 +1036,6 @@ class hipGraphKernelNode : public hipGraphNode {
     }
     return hipSuccess;
   }
-  // ToDo: use this when commands are cloned and command params are to be updated
-  hipError_t SetCommandParams(const hipKernelNodeParams* params) {
-    // updates kernel params
-    hipError_t status = validateKernelParams(params);
-    if (hipSuccess != status) {
-      return status;
-    }
-    size_t globalWorkOffset[3] = {0};
-    size_t globalWorkSize[3] = {params->gridDim.x, params->gridDim.y, params->gridDim.z};
-    size_t localWorkSize[3] = {params->blockDim.x, params->blockDim.y, params->blockDim.z};
-    reinterpret_cast<amd::NDRangeKernelCommand*>(commands_[0])
-        ->setSizes(globalWorkOffset, globalWorkSize, localWorkSize);
-    reinterpret_cast<amd::NDRangeKernelCommand*>(commands_[0])
-        ->setSharedMemBytes(params->sharedMemBytes);
-    return hipSuccess;
-  }

   hipError_t SetParams(hipGraphNode* node) {
     const hipGraphKernelNode* kernelNode = static_cast<const hipGraphKernelNode*>(node);
@@ -1045,17 +1086,17 @@ class hipGraphMemcpyNode : public hipGraphNode {
     return new hipGraphMemcpyNode(static_cast<const hipGraphMemcpyNode&>(*this));
   }

-  hipError_t CreateCommand(amd::HostQueue* queue) {
+  hipError_t CreateCommand(hip::Stream* stream) {
     if (IsHtoHMemcpy(pCopyParams_->dstPtr.ptr, pCopyParams_->srcPtr.ptr, pCopyParams_->kind)) {
       return hipSuccess;
     }
-    hipError_t status = hipGraphNode::CreateCommand(queue);
+    hipError_t status = hipGraphNode::CreateCommand(stream);
     if (status != hipSuccess) {
       return status;
     }
     commands_.reserve(1);
     amd::Command* command;
-    status = ihipMemcpy3DCommand(command, pCopyParams_, queue);
+    status = ihipMemcpy3DCommand(command, pCopyParams_, stream);
     commands_.emplace_back(command);
     return status;
   }
@@ -1064,7 +1105,7 @@ class hipGraphMemcpyNode : public hipGraphNode {
     if (isEnabled_ &&
        IsHtoHMemcpy(pCopyParams_->dstPtr.ptr, pCopyParams_->srcPtr.ptr, pCopyParams_->kind)) {
       ihipHtoHMemcpy(pCopyParams_->dstPtr.ptr, pCopyParams_->srcPtr.ptr,
                      pCopyParams_->extent.width * pCopyParams_->extent.height *
-                     pCopyParams_->extent.depth, *hip::getQueue(stream));
+                     pCopyParams_->extent.depth, *hip::getStream(stream));
       return;
     }
     hipGraphNode::EnqueueCommands(stream);
@@ -1085,8 +1126,6 @@ class hipGraphMemcpyNode : public hipGraphNode {
     const hipGraphMemcpyNode* memcpyNode = static_cast<const hipGraphMemcpyNode*>(node);
     return SetParams(memcpyNode->pCopyParams_);
   }
-  // ToDo: use this when commands are cloned and command params are to be updated
-  hipError_t SetCommandParams(const hipMemcpy3DParms* pNodeParams);
   hipError_t ValidateParams(const hipMemcpy3DParms* pNodeParams);
   std::string GetLabel(hipGraphDebugDotFlags flag) {
     const HIP_MEMCPY3D pCopy = hip::getDrvMemcpy3DDesc(*pCopyParams_);
@@ -1191,17 +1230,17 @@ class hipGraphMemcpyNode1D : public hipGraphNode {
     return new hipGraphMemcpyNode1D(static_cast<const hipGraphMemcpyNode1D&>(*this));
   }

-  virtual hipError_t CreateCommand(amd::HostQueue* queue) {
+  virtual hipError_t CreateCommand(hip::Stream* stream) {
     if (IsHtoHMemcpy(dst_, src_, kind_)) {
       return hipSuccess;
     }
-    hipError_t status = hipGraphNode::CreateCommand(queue);
+    hipError_t status = hipGraphNode::CreateCommand(stream);
     if (status != hipSuccess) {
       return status;
     }
     commands_.reserve(1);
     amd::Command* command = nullptr;
-    status = ihipMemcpyCommand(command, dst_, src_, count_, kind_, *queue);
+    status = ihipMemcpyCommand(command, dst_, src_, count_, kind_, *stream);
     commands_.emplace_back(command);
     return status;
   }
@@ -1216,14 +1255,14 @@ class hipGraphMemcpyNode1D : public hipGraphNode {
     if (isEnabled_) {
       //HtoH
       if (isH2H) {
-        ihipHtoHMemcpy(dst_, src_, count_, *hip::getQueue(stream));
+        ihipHtoHMemcpy(dst_, src_, count_, *hip::getStream(stream));
         return;
       }
       amd::Command* command = commands_[0];
       amd::HostQueue* cmdQueue = command->queue();
-      amd::HostQueue* queue = hip::getQueue(stream);
+      hip::Stream* hip_stream = hip::getStream(stream);

-      if (cmdQueue == queue) {
+      if (cmdQueue == hip_stream) {
        command->enqueue();
        command->release();
        return;
      }
@@ -1231,7 +1270,7 @@ class hipGraphMemcpyNode1D : public hipGraphNode {

       amd::Command::EventWaitList waitList;
       amd::Command* depdentMarker = nullptr;
-      amd::Command* cmd = queue->getLastQueuedCommand(true);
+      amd::Command* cmd = hip_stream->getLastQueuedCommand(true);
       if (cmd != nullptr) {
         waitList.push_back(cmd);
         amd::Command* depdentMarker = new amd::Marker(*cmdQueue, true, waitList);
@@ -1248,7 +1287,7 @@ class hipGraphMemcpyNode1D : public hipGraphNode {
       if (cmd != nullptr) {
         waitList.clear();
         waitList.push_back(cmd);
-        amd::Command* depdentMarker = new amd::Marker(*queue, true, waitList);
+        amd::Command* depdentMarker = new amd::Marker(*hip_stream, true, waitList);
         if (depdentMarker != nullptr) {
           depdentMarker->enqueue();  // Make sure future commands of queue synced with command
           depdentMarker->release();
@@ -1257,8 +1296,8 @@ class hipGraphMemcpyNode1D : public hipGraphNode {
       }
     } else {
       amd::Command::EventWaitList waitList;
-      amd::HostQueue* queue = hip::getQueue(stream);
-      amd::Command* command = new amd::Marker(*queue, !kMarkerDisableFlush, waitList);
+      hip::Stream* hip_stream = hip::getStream(stream);
+      amd::Command* command = new amd::Marker(*hip_stream, !kMarkerDisableFlush, waitList);
       command->enqueue();
       command->release();
     }
@@ -1281,8 +1320,6 @@ class hipGraphMemcpyNode1D : public hipGraphNode {
     return SetParams(memcpy1DNode->dst_, memcpy1DNode->src_, memcpy1DNode->count_,
                      memcpy1DNode->kind_);
   }
-  // ToDo: use this when commands are cloned and command params are to be updated
-  hipError_t SetCommandParams(void* dst, const void* src, size_t count, hipMemcpyKind kind);
   static hipError_t ValidateParams(void* dst, const void* src, size_t count, hipMemcpyKind kind);
   std::string GetLabel(hipGraphDebugDotFlags flag) {
     size_t sOffsetOrig = 0;
@@ -1306,9 +1343,15 @@ class hipGraphMemcpyNode1D : public hipGraphNode {
     if (flag == hipGraphDebugDotFlagsMemcpyNodeParams || flag ==
hipGraphDebugDotFlagsVerbose) { char buffer[500]; sprintf(buffer, - "{\n%s\n| {{ID | node handle | dst | src | count | kind } | {%u | %p | %p | %p | " - "%zu | %s}}}", - label_.c_str(), GetID(), this, dst_, src_, count_, memcpyDirection.c_str()); + "{\n%s\n| {{ID | node handle} | {%u | %p}}\n| {kind | %s}\n| {{srcPtr | dstPtr} | " + "{pitch " + "| ptr | xsize | ysize | pitch | ptr | xsize | size} | {%zu | %p | %zu | %zu | %zu | %p " + "| %zu " + "| %zu}}\n| {{srcPos | {{x | %zu} | {y | %zu} | {z | %zu}}} | {dstPos | {{x | %zu} | {y " + "| " + "%zu} | {z | %zu}}} | {Extent | {{Width | %zu} | {Height | %zu} | {Depth | %zu}}}}\n}", + label_.c_str(), GetID(), this, memcpyDirection.c_str(), (size_t)0, + src_, (size_t)0, (size_t)0, (size_t)0, dst_, (size_t)0, (size_t)0, (size_t)0, (size_t)0, (size_t)0, (size_t)0, (size_t)0, (size_t)0, count_, (size_t)1, (size_t)1); label = buffer; } else { label = std::to_string(GetID()) + "\n" + label_ + "\n(" + memcpyDirection + "," + @@ -1343,8 +1386,8 @@ class hipGraphMemcpyNodeFromSymbol : public hipGraphMemcpyNode1D { static_cast(*this)); } - hipError_t CreateCommand(amd::HostQueue* queue) { - hipError_t status = hipGraphNode::CreateCommand(queue); + hipError_t CreateCommand(hip::Stream* stream) { + hipError_t status = hipGraphNode::CreateCommand(stream); if (status != hipSuccess) { return status; } @@ -1357,7 +1400,7 @@ class hipGraphMemcpyNodeFromSymbol : public hipGraphMemcpyNode1D { if (status != hipSuccess) { return status; } - status = ihipMemcpyCommand(command, dst_, device_ptr, count_, kind_, *queue); + status = ihipMemcpyCommand(command, dst_, device_ptr, count_, kind_, *stream); if (status != hipSuccess) { return status; } @@ -1403,18 +1446,6 @@ class hipGraphMemcpyNodeFromSymbol : public hipGraphMemcpyNode1D { return SetParams(memcpyNode->dst_, memcpyNode->symbol_, memcpyNode->count_, memcpyNode->offset_, memcpyNode->kind_); } - // ToDo: use this when commands are cloned and command params are to be updated - hipError_t SetCommandParams(void* dst, const void* symbol, size_t count, size_t offset, - hipMemcpyKind kind) { - size_t sym_size = 0; - hipDeviceptr_t device_ptr = nullptr; - - hipError_t status = ihipMemcpySymbol_validate(symbol, count, offset, sym_size, device_ptr); - if (status != hipSuccess) { - return status; - } - return hipGraphMemcpyNode1D::SetCommandParams(dst, device_ptr, count, kind); - } }; class hipGraphMemcpyNodeToSymbol : public hipGraphMemcpyNode1D { const void* symbol_; @@ -1433,8 +1464,8 @@ class hipGraphMemcpyNodeToSymbol : public hipGraphMemcpyNode1D { return new hipGraphMemcpyNodeToSymbol(static_cast(*this)); } - hipError_t CreateCommand(amd::HostQueue* queue) { - hipError_t status = hipGraphNode::CreateCommand(queue); + hipError_t CreateCommand(hip::Stream* stream) { + hipError_t status = hipGraphNode::CreateCommand(stream); if (status != hipSuccess) { return status; } @@ -1447,7 +1478,7 @@ class hipGraphMemcpyNodeToSymbol : public hipGraphMemcpyNode1D { if (status != hipSuccess) { return status; } - status = ihipMemcpyCommand(command, device_ptr, src_, count_, kind_, *queue); + status = ihipMemcpyCommand(command, device_ptr, src_, count_, kind_, *stream); if (status != hipSuccess) { return status; } @@ -1491,18 +1522,6 @@ class hipGraphMemcpyNodeToSymbol : public hipGraphMemcpyNode1D { return SetParams(memcpyNode->src_, memcpyNode->symbol_, memcpyNode->count_, memcpyNode->offset_, memcpyNode->kind_); } - // ToDo: use this when commands are cloned and command params are to be updated - hipError_t 
SetCommandParams(const void* symbol, const void* src, size_t count, size_t offset, - hipMemcpyKind kind) { - size_t sym_size = 0; - hipDeviceptr_t device_ptr = nullptr; - - hipError_t status = ihipMemcpySymbol_validate(symbol, count, offset, sym_size, device_ptr); - if (status != hipSuccess) { - return status; - } - return hipGraphMemcpyNode1D::SetCommandParams(device_ptr, src, count, kind); - } }; class hipGraphMemsetNode : public hipGraphNode { @@ -1562,21 +1581,21 @@ class hipGraphMemsetNode : public hipGraphNode { } } - hipError_t CreateCommand(amd::HostQueue* queue) { - hipError_t status = hipGraphNode::CreateCommand(queue); + hipError_t CreateCommand(hip::Stream* stream) { + hipError_t status = hipGraphNode::CreateCommand(stream); if (status != hipSuccess) { return status; } if (pMemsetParams_->height == 1) { size_t sizeBytes = pMemsetParams_->width * pMemsetParams_->elementSize; hipError_t status = ihipMemsetCommand(commands_, pMemsetParams_->dst, pMemsetParams_->value, - pMemsetParams_->elementSize, sizeBytes, queue); + pMemsetParams_->elementSize, sizeBytes, stream); } else { hipError_t status = ihipMemset3DCommand( commands_, {pMemsetParams_->dst, pMemsetParams_->pitch, pMemsetParams_->width * pMemsetParams_->elementSize, pMemsetParams_->height}, - pMemsetParams_->value, {pMemsetParams_->width * pMemsetParams_->elementSize, pMemsetParams_->height, 1}, queue, pMemsetParams_->elementSize); + pMemsetParams_->value, {pMemsetParams_->width * pMemsetParams_->elementSize, pMemsetParams_->height, 1}, stream, pMemsetParams_->elementSize); } return status; } @@ -1635,15 +1654,15 @@ class hipGraphEventRecordNode : public hipGraphNode { return new hipGraphEventRecordNode(static_cast(*this)); } - hipError_t CreateCommand(amd::HostQueue* queue) { - hipError_t status = hipGraphNode::CreateCommand(queue); + hipError_t CreateCommand(hip::Stream* stream) { + hipError_t status = hipGraphNode::CreateCommand(stream); if (status != hipSuccess) { return status; } hip::Event* e = reinterpret_cast(event_); commands_.reserve(1); amd::Command* command = nullptr; - status = e->recordCommand(command, queue); + status = e->recordCommand(command, stream); commands_.emplace_back(command); return status; } @@ -1673,16 +1692,6 @@ class hipGraphEventRecordNode : public hipGraphNode { static_cast(node); return SetParams(eventRecordNode->event_); } - // ToDo: use this when commands are cloned and command params are to be updated - hipError_t SetCommandParams(hipEvent_t event) { - amd::HostQueue* queue; - if (!commands_.empty()) { - queue = commands_[0]->queue(); - commands_[0]->release(); - } - commands_.clear(); - return CreateCommand(queue); - } }; class hipGraphEventWaitNode : public hipGraphNode { @@ -1698,15 +1707,15 @@ class hipGraphEventWaitNode : public hipGraphNode { return new hipGraphEventWaitNode(static_cast(*this)); } - hipError_t CreateCommand(amd::HostQueue* queue) { - hipError_t status = hipGraphNode::CreateCommand(queue); + hipError_t CreateCommand(hip::Stream* stream) { + hipError_t status = hipGraphNode::CreateCommand(stream); if (status != hipSuccess) { return status; } hip::Event* e = reinterpret_cast(event_); commands_.reserve(1); amd::Command* command; - status = e->streamWaitCommand(command, queue); + status = e->streamWaitCommand(command, stream); commands_.emplace_back(command); return status; } @@ -1735,16 +1744,6 @@ class hipGraphEventWaitNode : public hipGraphNode { const hipGraphEventWaitNode* eventWaitNode = static_cast(node); return SetParams(eventWaitNode->event_); } - // ToDo: use 
this when commands are cloned and command params are to be updated - hipError_t SetCommandParams(hipEvent_t event) { - amd::HostQueue* queue; - if (!commands_.empty()) { - queue = commands_[0]->queue(); - commands_[0]->release(); - } - commands_.clear(); - return CreateCommand(queue); - } }; class hipGraphHostNode : public hipGraphNode { @@ -1765,14 +1764,14 @@ class hipGraphHostNode : public hipGraphNode { return new hipGraphHostNode(static_cast(*this)); } - hipError_t CreateCommand(amd::HostQueue* queue) { - hipError_t status = hipGraphNode::CreateCommand(queue); + hipError_t CreateCommand(hip::Stream* stream) { + hipError_t status = hipGraphNode::CreateCommand(stream); if (status != hipSuccess) { return status; } amd::Command::EventWaitList waitList; commands_.reserve(1); - amd::Command* command = new amd::Marker(*queue, !kMarkerDisableFlush, waitList); + amd::Command* command = new amd::Marker(*stream, !kMarkerDisableFlush, waitList); commands_.emplace_back(command); return hipSuccess; } @@ -1814,8 +1813,6 @@ class hipGraphHostNode : public hipGraphNode { const hipGraphHostNode* hostNode = static_cast(node); return SetParams(hostNode->pNodeParams_); } - // ToDo: use this when commands are cloned and command params are to be updated - hipError_t SetCommandParams(const hipHostNodeParams* params); }; class hipGraphEmptyNode : public hipGraphNode { @@ -1827,14 +1824,14 @@ class hipGraphEmptyNode : public hipGraphNode { return new hipGraphEmptyNode(static_cast(*this)); } - hipError_t CreateCommand(amd::HostQueue* queue) { - hipError_t status = hipGraphNode::CreateCommand(queue); + hipError_t CreateCommand(hip::Stream* stream) { + hipError_t status = hipGraphNode::CreateCommand(stream); if (status != hipSuccess) { return status; } amd::Command::EventWaitList waitList; commands_.reserve(1); - amd::Command* command = new amd::Marker(*queue, !kMarkerDisableFlush, waitList); + amd::Command* command = new amd::Marker(*stream, !kMarkerDisableFlush, waitList); commands_.emplace_back(command); return hipSuccess; } @@ -1854,11 +1851,9 @@ class hipGraphMemAllocNode : public hipGraphNode { return new hipGraphMemAllocNode(static_cast(*this)); } - virtual hipError_t CreateCommand(amd::HostQueue* queue) { - auto error = hipGraphNode::CreateCommand(queue); - // Note: memory pool can work with hip::Streams only. It can't accept amd::HostQueue. - // Resource tracking is disabled! - auto ptr = Execute(); + virtual hipError_t CreateCommand(hip::Stream* stream) { + auto error = hipGraphNode::CreateCommand(stream); + auto ptr = Execute(stream_); return error; } @@ -1897,15 +1892,13 @@ class hipGraphMemFreeNode : public hipGraphNode { return new hipGraphMemFreeNode(static_cast(*this)); } - virtual hipError_t CreateCommand(amd::HostQueue* queue) { - auto error = hipGraphNode::CreateCommand(queue); - // Note: memory pool can work with hip::Streams only. It can't accept amd::HostQueue. - // Resource tracking is disabled! 
- Execute(); + virtual hipError_t CreateCommand(hip::Stream* stream) { + auto error = hipGraphNode::CreateCommand(stream); + Execute(stream_); return error; } - void Execute(hip::Stream* stream = nullptr) { + void Execute(hip::Stream* stream) { auto graph = GetParentGraph(); if (graph != nullptr) { graph->FreeMemory(device_ptr_, stream); diff --git a/src/hip_hcc.def.in b/src/hip_hcc.def.in index 187c0fdd..fe219359 100644 --- a/src/hip_hcc.def.in +++ b/src/hip_hcc.def.in @@ -103,6 +103,9 @@ hipMemPoolImportPointer hipArrayCreate hipArray3DCreate hipArrayDestroy +hipArrayGetInfo +hipArrayGetDescriptor +hipArray3DGetDescriptor hipMallocArray hipMemAdvise hipMemAllocPitch @@ -191,6 +194,7 @@ hipStreamCreate hipStreamCreateWithFlags hipStreamCreateWithPriority hipStreamDestroy +hipStreamGetDevice hipStreamGetFlags hipStreamQuery hipStreamSynchronize diff --git a/src/hip_hcc.map.in b/src/hip_hcc.map.in index a9adf372..204b139f 100644 --- a/src/hip_hcc.map.in +++ b/src/hip_hcc.map.in @@ -169,6 +169,7 @@ global: hipStreamCreateWithFlags; hipStreamCreateWithPriority; hipStreamDestroy; + hipStreamGetDevice; hipStreamGetFlags; hipStreamQuery; hipStreamSynchronize; @@ -517,3 +518,12 @@ global: local: *; } hip_5.3; + +hip_5.6 { +global: + hipArrayGetInfo; + hipArrayGetDescriptor; + hipArray3DGetDescriptor; +local: + *; +} hip_5.5; \ No newline at end of file diff --git a/src/hip_hmm.cpp b/src/hip_hmm.cpp index 00687f84..ec201663 100644 --- a/src/hip_hmm.cpp +++ b/src/hip_hmm.cpp @@ -94,7 +94,7 @@ hipError_t hipMemPrefetchAsync(const void* dev_ptr, size_t count, int device, HIP_RETURN(hipErrorInvalidDevice); } - amd::HostQueue* queue = nullptr; + hip::Stream* hip_stream = nullptr; amd::Device* dev = nullptr; bool cpu_access = false; @@ -106,19 +106,19 @@ hipError_t hipMemPrefetchAsync(const void* dev_ptr, size_t count, int device, // Pick the specified stream or Null one from the provided device if (device == hipCpuDeviceId) { cpu_access = true; - queue = (stream == nullptr) ? hip::getCurrentDevice()->NullStream() : hip::getQueue(stream); + hip_stream = (stream == nullptr) ? hip::getCurrentDevice()->NullStream() : hip::getStream(stream); } else { dev = g_devices[device]->devices()[0]; - queue = (stream == nullptr) ? g_devices[device]->NullStream() : hip::getQueue(stream); + hip_stream = (stream == nullptr) ? 
g_devices[device]->NullStream() : hip::getStream(stream); } - if (queue == nullptr) { + if (hip_stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } amd::Command::EventWaitList waitList; amd::SvmPrefetchAsyncCommand* command = - new amd::SvmPrefetchAsyncCommand(*queue, waitList, dev_ptr, count, dev, cpu_access); + new amd::SvmPrefetchAsyncCommand(*hip_stream, waitList, dev_ptr, count, dev, cpu_access); if (command == nullptr) { return hipErrorOutOfMemory; } @@ -233,8 +233,8 @@ hipError_t ihipMallocManaged(void** ptr, size_t size, unsigned int align) { return hipSuccess; } - assert((hip::host_device->asContext()!= nullptr) && "Current host context must be valid"); - amd::Context& ctx = *hip::host_device->asContext(); + assert((hip::host_context != nullptr) && "Current host context must be valid"); + amd::Context& ctx = *hip::host_context; const amd::Device& dev = *ctx.devices()[0]; diff --git a/src/hip_internal.hpp b/src/hip_internal.hpp index 99bdf00d..ca924bd5 100644 --- a/src/hip_internal.hpp +++ b/src/hip_internal.hpp @@ -225,12 +225,11 @@ class stream_per_thread { namespace hip { class Device; class MemoryPool; - class Stream { + class Stream : public amd::HostQueue { public: enum Priority : int { High = -1, Normal = 0, Low = 1 }; private: - amd::HostQueue* queue_; mutable amd::Monitor lock_; Device* device_; Priority priority_; @@ -260,18 +259,20 @@ namespace hip { /// Capture events std::unordered_set captureEvents_; unsigned long long captureID_; + + static inline CommandQueue::Priority convertToQueuePriority(Priority p){ + return p == Priority::High ? amd::CommandQueue::Priority::High : p == Priority::Low ? + amd::CommandQueue::Priority::Low : amd::CommandQueue::Priority::Normal; + } + public: Stream(Device* dev, Priority p = Priority::Normal, unsigned int f = 0, bool null_stream = false, const std::vector& cuMask = {}, hipStreamCaptureStatus captureStatus = hipStreamCaptureStatusNone); - ~Stream(); + /// Creates the hip stream object, including AMD host queue bool Create(); - - /// Get device AMD host queue object. 
The method can allocate the queue - amd::HostQueue* asHostQueue(bool skip_alloc = false); - - void Finish() const; + virtual bool terminate() override; /// Get device ID associated with the current stream; int DeviceId() const; /// Get HIP device associated with the stream @@ -378,18 +379,23 @@ namespace hip { parallelCaptureStreams_.erase(it); } } + static bool existsActiveStreamForDevice(hip::Device* device); + + /// The stream should be destroyed via release() rather than delete + private: + ~Stream() {}; }; /// HIP Device class class Device { - amd::Monitor lock_{"Device lock"}; + amd::Monitor lock_{"Device lock", true}; /// ROCclr context amd::Context* context_; /// Device's ID /// Store it here so we don't have to loop through the device list every time int deviceId_; /// ROCclr host queue for default streams - Stream null_stream_; + Stream* null_stream_ = nullptr; /// Store device flags unsigned int flags_; /// Maintain list of user enabled peers @@ -398,21 +404,21 @@ namespace hip { /// True if this device is active bool isActive_; - std::vector queues_; - MemoryPool* default_mem_pool_; + MemoryPool* default_mem_pool_; //!< Default memory pool for this device MemoryPool* current_mem_pool_; + MemoryPool* graph_mem_pool_; //!< Memory pool, associated with graphs for this device std::set mem_pools_; public: Device(amd::Context* ctx, int devId): context_(ctx), deviceId_(devId), - null_stream_(this, Stream::Priority::Normal, 0, true), - flags_(hipDeviceScheduleSpin), + flags_(hipDeviceScheduleSpin), isActive_(false), default_mem_pool_(nullptr), - current_mem_pool_(nullptr) + current_mem_pool_(nullptr), + graph_mem_pool_(nullptr) { assert(ctx != nullptr); } ~Device(); @@ -445,22 +451,16 @@ namespace hip { void setFlags(unsigned int flags) { flags_ = flags; } void Reset(); - amd::HostQueue* NullStream(bool skip_alloc = false); - Stream* GetNullStream(); + hip::Stream* NullStream(bool skip_alloc = false); + Stream* GetNullStream(); - void SaveQueue(amd::HostQueue* queue) { - amd::ScopedLock lock(lock_); - queues_.push_back(queue); - } bool GetActiveStatus() { amd::ScopedLock lock(lock_); if (isActive_) return true; - for (int i = 0; i < queues_.size(); i++) { - if (queues_[i]->GetQueueStatus()) { - isActive_ = true; - return true; - } + if (Stream::existsActiveStreamForDevice(this)) { + isActive_ = true; + return true; } return false; } @@ -476,6 +476,9 @@ namespace hip { /// Get the default memory pool on the device MemoryPool* GetDefaultMemoryPool() const { return default_mem_pool_; } + /// Get the graph memory pool on the device + MemoryPool* GetGraphMemoryPool() const { return graph_mem_pool_; } + /// Add memory pool to the device void AddMemoryPool(MemoryPool* pool); @@ -490,6 +493,7 @@ namespace hip { /// Removes a destroyed stream from the safe list of memory pools void RemoveStreamFromPools(Stream* stream); + }; /// Thread Local Storage Variables Aggregator Class @@ -513,7 +517,7 @@ namespace hip { extern thread_local TlsAggregator tls; /// Device representing the host - for pinned memory - extern Device* host_device; + extern amd::Context* host_context; extern bool init(); @@ -524,11 +528,11 @@ namespace hip { /// Get ROCclr queue associated with hipStream /// Note: This follows the CUDA spec to sync with default streams /// and Blocking streams - extern amd::HostQueue* getQueue(hipStream_t stream); + extern hip::Stream* getStream(hipStream_t stream); /// Get default stream associated with the ROCclr context - extern amd::HostQueue* getNullStream(amd::Context&); + extern 
hip::Stream* getNullStream(amd::Context&); /// Get default stream of the thread - extern amd::HostQueue* getNullStream(); + extern hip::Stream* getNullStream(); /// Get device ID associated with the ROCclr context int getDeviceID(amd::Context& ctx); /// Check if stream is valid @@ -542,7 +546,7 @@ extern void WaitThenDecrementSignal(hipStream_t stream, hipError_t status, void* /// Wait all active streams on the blocking queue. The method enqueues a wait command and /// doesn't stall the current thread -extern void iHipWaitActiveStreams(amd::HostQueue* blocking_queue, bool wait_null_stream = false); +extern void iHipWaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stream = false); extern std::vector g_devices; extern hipError_t ihipDeviceGetCount(int* count); @@ -561,7 +565,8 @@ extern hipError_t ihipGetDeviceProperties(hipDeviceProp_t* props, hipDevice_t de extern hipError_t ihipDeviceGet(hipDevice_t* device, int deviceId); extern hipError_t ihipStreamOperation(hipStream_t stream, cl_command_type cmdType, void* ptr, uint64_t value, uint64_t mask, unsigned int flags, size_t sizeBytes); - +hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, + hip::Stream& stream, bool isHostAsync = false, bool isGPUAsync = true); constexpr bool kOptionChangeable = true; constexpr bool kNewDevProg = false; diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index f3fcf68f..79295437 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -78,9 +78,9 @@ hipError_t ihipFree(void *ptr) { auto dev = g_devices[device_id]; // Skip stream allocation, since if it wasn't allocated until free, then the device wasn't used constexpr bool SkipStreamAlloc = true; - amd::HostQueue* queue = dev->NullStream(SkipStreamAlloc); - if (queue != nullptr) { - queue->finish(); + hip::Stream* stream = dev->NullStream(SkipStreamAlloc); + if (stream != nullptr) { + stream->finish(); } hip::Stream::syncNonBlockingStreams(device_id); // Find out if memory belongs to any memory pool @@ -195,15 +195,15 @@ hipError_t hipSignalExternalSemaphoresAsync( if (extSemArray == nullptr || paramsArray == nullptr) { HIP_RETURN(hipErrorInvalidValue); } - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } for (unsigned int i = 0; i < numExtSems; i++) { if (extSemArray[i] != nullptr) { amd::ExternalSemaphoreCmd* command = - new amd::ExternalSemaphoreCmd(*queue, extSemArray[i], paramsArray[i].params.fence.value, + new amd::ExternalSemaphoreCmd(*hip_stream, extSemArray[i], paramsArray[i].params.fence.value, amd::ExternalSemaphoreCmd::COMMAND_SIGNAL_EXTSEMAPHORE); if (command == nullptr) { return hipErrorOutOfMemory; @@ -227,15 +227,15 @@ hipError_t hipWaitExternalSemaphoresAsync(const hipExternalSemaphore_t* extSemAr if (extSemArray == nullptr || paramsArray == nullptr) { HIP_RETURN(hipErrorInvalidValue); } - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } for (unsigned int i = 0; i < numExtSems; i++) { if (extSemArray[i] != nullptr) { amd::ExternalSemaphoreCmd* command = - new amd::ExternalSemaphoreCmd(*queue, extSemArray[i], paramsArray[i].params.fence.value, + new amd::ExternalSemaphoreCmd(*hip_stream, extSemArray[i], paramsArray[i].params.fence.value, amd::ExternalSemaphoreCmd::COMMAND_WAIT_EXTSEMAPHORE); if (command == 
nullptr) { return hipErrorOutOfMemory; @@ -274,7 +274,7 @@ hipError_t ihipMalloc(void** ptr, size_t sizeBytes, unsigned int flags) bool useHostDevice = (flags & CL_MEM_SVM_FINE_GRAIN_BUFFER) != 0; amd::Context* curDevContext = hip::getCurrentDevice()->asContext(); - amd::Context* amdContext = useHostDevice ? hip::host_device->asContext() : curDevContext; + amd::Context* amdContext = useHostDevice ? hip::host_context : curDevContext; if (amdContext == nullptr) { return hipErrorOutOfMemory; @@ -343,35 +343,35 @@ hipError_t ihipMemcpy_validate(void* dst, const void* src, size_t sizeBytes, } hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, size_t sizeBytes, - hipMemcpyKind kind, amd::HostQueue& queue, bool isAsync) { + hipMemcpyKind kind, hip::Stream& stream, bool isAsync) { amd::Command::EventWaitList waitList; size_t sOffset = 0; amd::Memory* srcMemory = getMemoryObject(src, sOffset); size_t dOffset = 0; amd::Memory* dstMemory = getMemoryObject(dst, dOffset); - amd::Device* queueDevice = &queue.device(); + amd::Device* queueDevice = &stream.device(); amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); if ((srcMemory == nullptr) && (dstMemory != nullptr)) { - amd::HostQueue* pQueue = &queue; + hip::Stream* pStream = &stream; if (queueDevice != dstMemory->getContext().devices()[0]) { - pQueue = hip::getNullStream(dstMemory->getContext()); - amd::Command* cmd = queue.getLastQueuedCommand(true); + pStream = hip::getNullStream(dstMemory->getContext()); + amd::Command* cmd = stream.getLastQueuedCommand(true); if (cmd != nullptr) { waitList.push_back(cmd); } } - command = new amd::WriteMemoryCommand(*pQueue, CL_COMMAND_WRITE_BUFFER, waitList, + command = new amd::WriteMemoryCommand(*pStream, CL_COMMAND_WRITE_BUFFER, waitList, *dstMemory->asBuffer(), dOffset, sizeBytes, src, 0, 0, copyMetadata); } else if ((srcMemory != nullptr) && (dstMemory == nullptr)) { - amd::HostQueue* pQueue = &queue; + hip::Stream* pStream = &stream; if (queueDevice != srcMemory->getContext().devices()[0]) { - pQueue = hip::getNullStream(srcMemory->getContext()); - amd::Command* cmd = queue.getLastQueuedCommand(true); + pStream = hip::getNullStream(srcMemory->getContext()); + amd::Command* cmd = stream.getLastQueuedCommand(true); if (cmd != nullptr) { waitList.push_back(cmd); } } - command = new amd::ReadMemoryCommand(*pQueue, CL_COMMAND_READ_BUFFER, waitList, + command = new amd::ReadMemoryCommand(*pStream, CL_COMMAND_READ_BUFFER, waitList, *srcMemory->asBuffer(), sOffset, sizeBytes, dst, 0, 0, copyMetadata); } else if ((srcMemory != nullptr) && (dstMemory != nullptr)) { // Check if the queue device doesn't match the device on any memory object. 
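The two branches above handle copies where only one side is backed by a device allocation: when the issuing stream's device does not own that allocation, the command is rebuilt on the null stream of the owning device and ordered against the issuing stream through its last queued command. A minimal caller-level sketch of the behavior this preserves (hypothetical example, not part of the patch; assumes at least two visible devices and omits error checking):

#include <hip/hip_runtime.h>
#include <cstdio>
#include <vector>

int main() {
  int ndev = 0;
  hipGetDeviceCount(&ndev);
  if (ndev < 2) { printf("needs two devices\n"); return 0; }

  hipSetDevice(1);                       // destination lives on device 1
  void* dst = nullptr;
  hipMalloc(&dst, 1 << 20);

  hipSetDevice(0);                       // but the copy is issued on a device-0 stream
  hipStream_t s0;
  hipStreamCreate(&s0);

  std::vector<char> src(1 << 20, 0x5A);  // pageable host source
  hipMemcpyAsync(dst, src.data(), src.size(), hipMemcpyHostToDevice, s0);
  hipStreamSynchronize(s0);

  hipStreamDestroy(s0);
  hipFree(dst);
  return 0;
}

Because the source here is pageable host memory, the srcMemory == nullptr branch applies and ihipMemcpy (see the hunks that follow) downgrades the transfer to host-synchronous, so src is safe to reuse as soon as hipMemcpyAsync returns.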
@@ -380,7 +380,7 @@ hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, if ((srcMemory->getContext().devices()[0] != dstMemory->getContext().devices()[0]) && ((srcMemory->getContext().devices().size() == 1) && (dstMemory->getContext().devices().size() == 1))) { - command = new amd::CopyMemoryP2PCommand(queue, CL_COMMAND_COPY_BUFFER, waitList, + command = new amd::CopyMemoryP2PCommand(stream, CL_COMMAND_COPY_BUFFER, waitList, *srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset, dOffset, sizeBytes); if (command == nullptr) { return hipErrorOutOfMemory; @@ -392,12 +392,12 @@ hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, return hipErrorInvalidValue; } } else { - amd::HostQueue* pQueue = &queue; + hip::Stream* pStream = &stream; if ((srcMemory->getContext().devices()[0] == dstMemory->getContext().devices()[0]) && (queueDevice != srcMemory->getContext().devices()[0])) { copyMetadata.copyEnginePreference_ = amd::CopyMetadata::CopyEnginePreference::NONE; - pQueue = hip::getNullStream(srcMemory->getContext()); - amd::Command* cmd = queue.getLastQueuedCommand(true); + pStream = hip::getNullStream(srcMemory->getContext()); + amd::Command* cmd = stream.getLastQueuedCommand(true); if (cmd != nullptr) { waitList.push_back(cmd); } @@ -405,22 +405,22 @@ hipError_t ihipMemcpyCommand(amd::Command*& command, void* dst, const void* src, // Scenarios such as DtoH where dst is pinned memory if ((queueDevice != srcMemory->getContext().devices()[0]) && (dstMemory->getContext().devices().size() != 1)) { - pQueue = hip::getNullStream(srcMemory->getContext()); - amd::Command* cmd = queue.getLastQueuedCommand(true); + pStream = hip::getNullStream(srcMemory->getContext()); + amd::Command* cmd = stream.getLastQueuedCommand(true); if (cmd != nullptr) { waitList.push_back(cmd); } // Scenarios such as HtoD where src is pinned memory } else if ((queueDevice != dstMemory->getContext().devices()[0]) && (srcMemory->getContext().devices().size() != 1)) { - pQueue = hip::getNullStream(dstMemory->getContext()); - amd::Command* cmd = queue.getLastQueuedCommand(true); + pStream = hip::getNullStream(dstMemory->getContext()); + amd::Command* cmd = stream.getLastQueuedCommand(true); if (cmd != nullptr) { waitList.push_back(cmd); } } } - command = new amd::CopyMemoryCommand(*pQueue, CL_COMMAND_COPY_BUFFER, waitList, + command = new amd::CopyMemoryCommand(*pStream, CL_COMMAND_COPY_BUFFER, waitList, *srcMemory->asBuffer(), *dstMemory->asBuffer(), sOffset, dOffset, sizeBytes, copyMetadata); } @@ -445,13 +445,13 @@ bool IsHtoHMemcpy(void* dst, const void* src, hipMemcpyKind kind) { } return false; } -void ihipHtoHMemcpy(void* dst, const void* src, size_t sizeBytes, amd::HostQueue& queue) { - queue.finish(); +void ihipHtoHMemcpy(void* dst, const void* src, size_t sizeBytes, hip::Stream& stream) { + stream.finish(); memcpy(dst, src, sizeBytes); } // ================================================================================================ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, - amd::HostQueue& queue, bool isAsync = false) { + hip::Stream& stream, bool isHostAsync, bool isGPUAsync) { hipError_t status; if (sizeBytes == 0) { // Skip if nothing needs writing. 
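For the remaining case, where both pointers map to device allocations on two different single-device contexts, the builder above emits amd::CopyMemoryP2PCommand instead of a staged copy. A hedged caller-level sketch that exercises this path (hypothetical example, not part of the patch; peer-access support varies by platform, so the example degrades to a no-op when it is absent):

#include <hip/hip_runtime.h>

int main() {
  int can = 0;
  hipDeviceCanAccessPeer(&can, 0, 1);
  if (!can) return 0;                  // single-device or non-P2P system

  hipSetDevice(0);
  hipDeviceEnablePeerAccess(1, 0);     // let device 0 reach device 1
  void* src = nullptr;
  hipMalloc(&src, 4096);

  hipSetDevice(1);
  void* dst = nullptr;
  hipMalloc(&dst, 4096);

  // A single call; the runtime is expected to select the peer-to-peer
  // command path shown in the hunk above.
  hipMemcpyPeer(dst, 1, src, 0, 4096);

  hipFree(dst);
  hipSetDevice(0);
  hipFree(src);
  return 0;
}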
@@ -469,29 +469,39 @@ hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKin size_t dOffset = 0; amd::Memory* dstMemory = getMemoryObject(dst, dOffset); if (srcMemory == nullptr && dstMemory == nullptr) { - ihipHtoHMemcpy(dst, src, sizeBytes, queue); + ihipHtoHMemcpy(dst, src, sizeBytes, stream); return hipSuccess; } else if ((srcMemory == nullptr) && (dstMemory != nullptr)) { - isAsync = false; + isHostAsync = false; } else if ((srcMemory != nullptr) && (dstMemory == nullptr)) { - isAsync = false; + isHostAsync = false; } + amd::Command* command = nullptr; - status = ihipMemcpyCommand(command, dst, src, sizeBytes, kind, queue, isAsync); + status = ihipMemcpyCommand(command, dst, src, sizeBytes, kind, stream, isHostAsync); if (status != hipSuccess) { return status; } command->enqueue(); - if (!isAsync) { + if (!isHostAsync) { command->awaitCompletion(); + } else if (!isGPUAsync) { + hip::Stream* pStream = hip::getNullStream(dstMemory->getContext()); + amd::Command::EventWaitList waitList; + waitList.push_back(command); + amd::Command* depdentMarker = new amd::Marker(*pStream, false, waitList); + if (depdentMarker != nullptr) { + depdentMarker->enqueue(); + depdentMarker->release(); + } } else { amd::HostQueue* newQueue = command->queue(); - if (newQueue != &queue) { + if (newQueue != &stream) { amd::Command::EventWaitList waitList; amd::Command* cmd = newQueue->getLastQueuedCommand(true); if (cmd != nullptr) { waitList.push_back(cmd); - amd::Command* depdentMarker = new amd::Marker(queue, true, waitList); + amd::Command* depdentMarker = new amd::Marker(stream, true, waitList); if (depdentMarker != nullptr) { depdentMarker->enqueue(); depdentMarker->release(); @@ -512,7 +522,9 @@ hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flag if (flags == hipDeviceMallocDefault) { ihipFlags = 0; } else if (flags == hipDeviceMallocFinegrained) { - ihipFlags = CL_MEM_SVM_ATOMICS | ROCCLR_MEM_HSA_PSEUDO_FINE_GRAIN; + ihipFlags = CL_MEM_SVM_ATOMICS; + } else if (flags == hipDeviceMallocUncached) { + ihipFlags = CL_MEM_SVM_ATOMICS | ROCCLR_MEM_HSA_UNCACHED; } else if (flags == hipMallocSignalMemory) { ihipFlags = CL_MEM_SVM_ATOMICS | CL_MEM_SVM_FINE_GRAIN_BUFFER | ROCCLR_MEM_HSA_SIGNAL_MEMORY; if (sizeBytes != 8) { @@ -594,18 +606,18 @@ hipError_t hipFree(void* ptr) { hipError_t hipMemcpy_common(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream = nullptr) { CHECK_STREAM_CAPTURING(); - amd::HostQueue* queue = nullptr; + hip::Stream* hip_stream = nullptr; if (stream != nullptr) { - queue = hip::getQueue(stream); + hip_stream = hip::getStream(stream); } else { - queue = hip::getNullStream(); + hip_stream = hip::getNullStream(); } - if (queue == nullptr) { + if (hip_stream == nullptr) { return hipErrorInvalidValue; } - return ihipMemcpy(dst, src, sizeBytes, kind, *queue); + return ihipMemcpy(dst, src, sizeBytes, kind, *hip_stream); } hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind) { @@ -626,12 +638,12 @@ hipError_t hipMemcpyWithStream(void* dst, const void* src, size_t sizeBytes, HIP_RETURN(hipErrorContextIsDestroyed); } - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } - HIP_RETURN_DURATION(ihipMemcpy(dst, src, sizeBytes, kind, *queue, false)); + HIP_RETURN_DURATION(ihipMemcpy(dst, src, sizeBytes, kind, *hip_stream, false)); } hipError_t 
hipMemPtrGetInfo(void *ptr, size_t *size) { @@ -680,9 +692,9 @@ hipError_t ihipArrayDestroy(hipArray* array) { } for (auto& dev : g_devices) { - amd::HostQueue* queue = dev->NullStream(true); - if (queue != nullptr) { - queue->finish(); + hip::Stream* stream = dev->NullStream(true); + if (stream != nullptr) { + stream->finish(); } } @@ -1030,6 +1042,7 @@ hipError_t ihipArrayCreate(hipArray** array, (*array)->depth = pAllocateArray->Depth; (*array)->Format = pAllocateArray->Format; (*array)->NumChannels = pAllocateArray->NumChannels; + (*array)->flags = pAllocateArray->Flags; { amd::ScopedLock lock(hip::hipArraySetLock); hip::hipArraySet.insert(*array); @@ -1138,7 +1151,7 @@ hipError_t ihipHostRegister(void* hostPtr, size_t sizeBytes, unsigned int flags) if (hostPtr == nullptr || sizeBytes == 0 || flags > 15) { return hipErrorInvalidValue; } else { - amd::Memory* mem = new (*hip::host_device->asContext()) amd::Buffer(*hip::host_device->asContext(), + amd::Memory* mem = new (*hip::host_context) amd::Buffer(*hip::host_context, CL_MEM_USE_HOST_PTR | CL_MEM_SVM_ATOMICS, sizeBytes); constexpr bool sysMemAlloc = false; @@ -1187,9 +1200,9 @@ hipError_t ihipHostUnregister(void* hostPtr) { // Wait on the device, associated with the current memory object during allocation auto device_id = mem->getUserData().deviceId; - amd::HostQueue* queue = g_devices[device_id]->NullStream(true); - if (queue != nullptr) { - queue->finish(); + hip::Stream* stream = g_devices[device_id]->NullStream(true); + if (stream != nullptr) { + stream->finish(); } amd::MemObjMap::RemoveMemObj(hostPtr); @@ -1374,11 +1387,11 @@ hipError_t hipMemcpyHtoD(hipDeviceptr_t dstDevice, size_t ByteCount) { HIP_INIT_API(hipMemcpyHtoD, dstDevice, srcHost, ByteCount); CHECK_STREAM_CAPTURING(); - amd::HostQueue* queue = hip::getQueue(nullptr); - if (queue == nullptr) { + hip::Stream* stream = hip::getStream(nullptr); + if (stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } - HIP_RETURN_DURATION(ihipMemcpy(dstDevice, srcHost, ByteCount, hipMemcpyHostToDevice, *queue)); + HIP_RETURN_DURATION(ihipMemcpy(dstDevice, srcHost, ByteCount, hipMemcpyHostToDevice, *stream)); } hipError_t hipMemcpyDtoH(void* dstHost, @@ -1386,11 +1399,11 @@ hipError_t hipMemcpyDtoH(void* dstHost, size_t ByteCount) { HIP_INIT_API(hipMemcpyDtoH, dstHost, srcDevice, ByteCount); CHECK_STREAM_CAPTURING(); - amd::HostQueue* queue = hip::getQueue(nullptr); - if (queue == nullptr) { + hip::Stream* stream = hip::getStream(nullptr); + if (stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } - HIP_RETURN_DURATION(ihipMemcpy(dstHost, srcDevice, ByteCount, hipMemcpyDeviceToHost, *queue)); + HIP_RETURN_DURATION(ihipMemcpy(dstHost, srcDevice, ByteCount, hipMemcpyDeviceToHost, *stream)); } hipError_t hipMemcpyDtoD(hipDeviceptr_t dstDevice, @@ -1398,22 +1411,22 @@ hipError_t hipMemcpyDtoD(hipDeviceptr_t dstDevice, size_t ByteCount) { HIP_INIT_API(hipMemcpyDtoD, dstDevice, srcDevice, ByteCount); CHECK_STREAM_CAPTURING(); - amd::HostQueue* queue = hip::getQueue(nullptr); - if (queue == nullptr) { + hip::Stream* stream = hip::getStream(nullptr); + if (stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } - HIP_RETURN_DURATION(ihipMemcpy(dstDevice, srcDevice, ByteCount, hipMemcpyDeviceToDevice, *queue)); + HIP_RETURN_DURATION(ihipMemcpy(dstDevice, srcDevice, ByteCount, hipMemcpyDeviceToDevice, *stream)); } hipError_t hipMemcpyAsync_common(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) { STREAM_CAPTURE(hipMemcpyAsync, stream, dst, src, 
sizeBytes, kind); - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } - return ihipMemcpy(dst, src, sizeBytes, kind, *queue, true); + return ihipMemcpy(dst, src, sizeBytes, kind, *hip_stream, true); } hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes, @@ -1434,12 +1447,12 @@ hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dstDevice, void* srcHost, size_t By HIP_INIT_API(hipMemcpyHtoDAsync, dstDevice, srcHost, ByteCount, stream); hipMemcpyKind kind = hipMemcpyHostToDevice; STREAM_CAPTURE(hipMemcpyHtoDAsync, stream, dstDevice, srcHost, ByteCount, kind); - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } HIP_RETURN_DURATION( - ihipMemcpy(dstDevice, srcHost, ByteCount, kind, *queue, true)); + ihipMemcpy(dstDevice, srcHost, ByteCount, kind, *hip_stream, true)); } hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dstDevice, hipDeviceptr_t srcDevice, size_t ByteCount, @@ -1447,12 +1460,12 @@ hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dstDevice, hipDeviceptr_t srcDevice HIP_INIT_API(hipMemcpyDtoDAsync, dstDevice, srcDevice, ByteCount, stream); hipMemcpyKind kind = hipMemcpyDeviceToDevice; STREAM_CAPTURE(hipMemcpyDtoDAsync, stream, dstDevice, srcDevice, ByteCount, kind); - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } HIP_RETURN_DURATION( - ihipMemcpy(dstDevice, srcDevice, ByteCount, kind, *queue, true)); + ihipMemcpy(dstDevice, srcDevice, ByteCount, kind, *hip_stream, true)); } hipError_t hipMemcpyDtoHAsync(void* dstHost, hipDeviceptr_t srcDevice, size_t ByteCount, @@ -1460,12 +1473,12 @@ hipError_t hipMemcpyDtoHAsync(void* dstHost, hipDeviceptr_t srcDevice, size_t By HIP_INIT_API(hipMemcpyDtoHAsync, dstHost, srcDevice, ByteCount, stream); hipMemcpyKind kind = hipMemcpyDeviceToHost; STREAM_CAPTURE(hipMemcpyDtoHAsync, stream, dstHost, srcDevice, ByteCount, kind); - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { HIP_RETURN(hipErrorInvalidValue); } HIP_RETURN_DURATION( - ihipMemcpy(dstHost, srcDevice, ByteCount, kind, *queue, true)); + ihipMemcpy(dstHost, srcDevice, ByteCount, kind, *hip_stream, true)); } hipError_t ihipMemcpyAtoDValidate(hipArray* srcArray, void* dstDevice, amd::Coord3D& srcOrigin, @@ -1514,7 +1527,7 @@ hipError_t ihipMemcpyAtoDValidate(hipArray* srcArray, void* dstDevice, amd::Coor hipError_t ihipMemcpyAtoDCommand(amd::Command*& command, hipArray* srcArray, void* dstDevice, amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t dstRowPitch, size_t dstSlicePitch, - amd::HostQueue* queue) { + hip::Stream* stream) { amd::BufferRect srcRect; amd::BufferRect dstRect; amd::Memory* dstMemory; @@ -1526,7 +1539,7 @@ hipError_t ihipMemcpyAtoDCommand(amd::Command*& command, hipArray* srcArray, voi return status; } - amd::CopyMemoryCommand* cpyMemCmd = new amd::CopyMemoryCommand(*queue, CL_COMMAND_COPY_IMAGE_TO_BUFFER, + amd::CopyMemoryCommand* cpyMemCmd = new amd::CopyMemoryCommand(*stream, CL_COMMAND_COPY_IMAGE_TO_BUFFER, amd::Command::EventWaitList{}, *srcImage, *dstMemory, srcOrigin, dstOrigin, copyRegion, srcRect, dstRect); 
@@ -1588,7 +1601,7 @@ hipError_t ihipMemcpyDtoAValidate(void* srcDevice, hipArray* dstArray, amd::Coor hipError_t ihipMemcpyDtoACommand(amd::Command*& command, void* srcDevice, hipArray* dstArray, amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, size_t srcSlicePitch, - amd::HostQueue* queue) { + hip::Stream* stream) { amd::Image* dstImage; amd::Memory* srcMemory; amd::BufferRect dstRect; @@ -1599,7 +1612,7 @@ hipError_t ihipMemcpyDtoACommand(amd::Command*& command, void* srcDevice, hipArr if (status != hipSuccess) { return status; } - amd::CopyMemoryCommand* cpyMemCmd = new amd::CopyMemoryCommand(*queue, CL_COMMAND_COPY_BUFFER_TO_IMAGE, + amd::CopyMemoryCommand* cpyMemCmd = new amd::CopyMemoryCommand(*stream, CL_COMMAND_COPY_BUFFER_TO_IMAGE, amd::Command::EventWaitList{}, *srcMemory, *dstImage, srcOrigin, dstOrigin, copyRegion, srcRect, dstRect); @@ -1661,7 +1674,7 @@ hipError_t ihipMemcpyDtoDValidate(void* srcDevice, void* dstDevice, amd::Coord3D hipError_t ihipMemcpyDtoDCommand(amd::Command*& command, void* srcDevice, void* dstDevice, amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, size_t srcSlicePitch, - size_t dstRowPitch, size_t dstSlicePitch, amd::HostQueue* queue) { + size_t dstRowPitch, size_t dstSlicePitch, hip::Stream* stream) { amd::Memory* srcMemory; amd::Memory* dstMemory; amd::BufferRect srcRect; @@ -1676,7 +1689,7 @@ hipError_t ihipMemcpyDtoDCommand(amd::Command*& command, void* srcDevice, void* amd::Coord3D srcStart(srcRect.start_, 0, 0); amd::Coord3D dstStart(dstRect.start_, 0, 0); amd::CopyMemoryCommand* copyCommand = new amd::CopyMemoryCommand( - *queue, CL_COMMAND_COPY_BUFFER_RECT, amd::Command::EventWaitList{}, *srcMemory, *dstMemory, + *stream, CL_COMMAND_COPY_BUFFER_RECT, amd::Command::EventWaitList{}, *srcMemory, *dstMemory, srcStart, dstStart, copyRegion, srcRect, dstRect); if (copyCommand == nullptr) { @@ -1726,7 +1739,7 @@ hipError_t ihipMemcpyDtoHValidate(void* srcDevice, void* dstHost, amd::Coord3D& hipError_t ihipMemcpyDtoHCommand(amd::Command*& command, void* srcDevice, void* dstHost, amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, size_t srcSlicePitch, - size_t dstRowPitch, size_t dstSlicePitch, amd::HostQueue* queue, + size_t dstRowPitch, size_t dstSlicePitch, hip::Stream* stream, bool isAsync = false) { amd::Memory* srcMemory; amd::BufferRect srcRect; @@ -1740,7 +1753,7 @@ hipError_t ihipMemcpyDtoHCommand(amd::Command*& command, void* srcDevice, void* amd::Coord3D srcStart(srcRect.start_, 0, 0); amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); amd::ReadMemoryCommand* readCommand = - new amd::ReadMemoryCommand(*queue, CL_COMMAND_READ_BUFFER_RECT, amd::Command::EventWaitList{}, + new amd::ReadMemoryCommand(*stream, CL_COMMAND_READ_BUFFER_RECT, amd::Command::EventWaitList{}, *srcMemory, srcStart, copyRegion, dstHost, srcRect, dstRect, copyMetadata); @@ -1791,7 +1804,7 @@ hipError_t ihipMemcpyHtoDValidate(const void* srcHost, void* dstDevice, amd::Coo hipError_t ihipMemcpyHtoDCommand(amd::Command*& command, const void* srcHost, void* dstDevice, amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, size_t srcSlicePitch, - size_t dstRowPitch, size_t dstSlicePitch, amd::HostQueue* queue, + size_t dstRowPitch, size_t dstSlicePitch, hip::Stream* stream, bool isAsync = false) { amd::Memory* dstMemory; amd::BufferRect srcRect; @@ -1806,7 +1819,7 @@ hipError_t 
ihipMemcpyHtoDCommand(amd::Command*& command, const void* srcHost, vo amd::Coord3D dstStart(dstRect.start_, 0, 0); amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); amd::WriteMemoryCommand* writeCommand = new amd::WriteMemoryCommand( - *queue, CL_COMMAND_WRITE_BUFFER_RECT, amd::Command::EventWaitList{}, *dstMemory, dstStart, + *stream, CL_COMMAND_WRITE_BUFFER_RECT, amd::Command::EventWaitList{}, *dstMemory, dstStart, copyRegion, srcHost, dstRect, srcRect, copyMetadata); if (writeCommand == nullptr) { @@ -1824,7 +1837,7 @@ hipError_t ihipMemcpyHtoDCommand(amd::Command*& command, const void* srcHost, vo hipError_t ihipMemcpyHtoH(const void* srcHost, void* dstHost, amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, - amd::HostQueue* queue) { + hip::Stream* stream) { if ((srcHost == nullptr) || (dstHost == nullptr)) { return hipErrorInvalidValue; } @@ -1841,8 +1854,8 @@ hipError_t ihipMemcpyHtoH(const void* srcHost, void* dstHost, amd::Coord3D srcOr return hipErrorInvalidValue; } - if (queue) { - queue->finish(); + if (stream) { + stream->finish(); } for (size_t slice = 0; slice < copyRegion[2]; slice++) { @@ -1891,7 +1904,7 @@ hipError_t ihipMemcpyAtoAValidate(hipArray* srcArray, hipArray* dstArray, amd::C hipError_t ihipMemcpyAtoACommand(amd::Command*& command, hipArray* srcArray, hipArray* dstArray, amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, - amd::Coord3D copyRegion, amd::HostQueue* queue) { + amd::Coord3D copyRegion, hip::Stream* stream) { amd::Image* srcImage; amd::Image* dstImage; @@ -1901,7 +1914,7 @@ hipError_t ihipMemcpyAtoACommand(amd::Command*& command, hipArray* srcArray, hip return status; } - amd::CopyMemoryCommand* cpyMemCmd = new amd::CopyMemoryCommand(*queue, CL_COMMAND_COPY_IMAGE, + amd::CopyMemoryCommand* cpyMemCmd = new amd::CopyMemoryCommand(*stream, CL_COMMAND_COPY_IMAGE, amd::Command::EventWaitList{}, *srcImage, *dstImage, srcOrigin, dstOrigin, copyRegion); @@ -1950,7 +1963,7 @@ hipError_t ihipMemcpyHtoAValidate(const void* srcHost, hipArray* dstArray, hipError_t ihipMemcpyHtoACommand(amd::Command*& command, const void* srcHost, hipArray* dstArray, amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, size_t srcSlicePitch, - amd::HostQueue* queue, bool isAsync = false) { + hip::Stream* stream, bool isAsync = false) { amd::Image* dstImage; amd::BufferRect srcRect; @@ -1962,7 +1975,7 @@ hipError_t ihipMemcpyHtoACommand(amd::Command*& command, const void* srcHost, hi amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); amd::WriteMemoryCommand* writeMemCmd = new amd::WriteMemoryCommand( - *queue, CL_COMMAND_WRITE_IMAGE, amd::Command::EventWaitList{}, *dstImage, dstOrigin, + *stream, CL_COMMAND_WRITE_IMAGE, amd::Command::EventWaitList{}, *dstImage, dstOrigin, copyRegion, static_cast(srcHost) + srcRect.start_, srcRowPitch, srcSlicePitch, copyMetadata); @@ -2011,7 +2024,7 @@ hipError_t ihipMemcpyAtoHValidate(hipArray* srcArray, void* dstHost, amd::Coord3 hipError_t ihipMemcpyAtoHCommand(amd::Command*& command, hipArray* srcArray, void* dstHost, amd::Coord3D srcOrigin, amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t dstRowPitch, size_t dstSlicePitch, - amd::HostQueue* queue, bool isAsync = false) { + hip::Stream* stream, bool isAsync = false) { amd::Image* srcImage; amd::BufferRect dstRect; amd::CopyMetadata copyMetadata(isAsync, 
amd::CopyMetadata::CopyEnginePreference::SDMA); @@ -2023,7 +2036,7 @@ hipError_t ihipMemcpyAtoHCommand(amd::Command*& command, hipArray* srcArray, voi } amd::ReadMemoryCommand* readMemCmd = new amd::ReadMemoryCommand( - *queue, CL_COMMAND_READ_IMAGE, amd::Command::EventWaitList{}, *srcImage, srcOrigin, + *stream, CL_COMMAND_READ_IMAGE, amd::Command::EventWaitList{}, *srcImage, srcOrigin, copyRegion, static_cast(dstHost) + dstRect.start_, dstRowPitch, dstSlicePitch, copyMetadata); @@ -2040,7 +2053,7 @@ hipError_t ihipMemcpyAtoHCommand(amd::Command*& command, hipArray* srcArray, voi } hipError_t ihipGetMemcpyParam3DCommand(amd::Command*& command, const HIP_MEMCPY3D* pCopy, - amd::HostQueue* queue) { + hip::Stream* stream) { // If {src/dst}MemoryType is hipMemoryTypeUnified, {src/dst}Device and {src/dst}Pitch specify the // (unified virtual address space) base address of the source data and the bytes per row to apply. // {src/dst}Array is ignored. @@ -2088,41 +2101,41 @@ hipError_t ihipGetMemcpyParam3DCommand(amd::Command*& command, const HIP_MEMCPY3 // Host to Device. return ihipMemcpyHtoDCommand(command, pCopy->srcHost, pCopy->dstDevice, srcOrigin, dstOrigin, copyRegion, pCopy->srcPitch, pCopy->srcPitch * pCopy->srcHeight, - pCopy->dstPitch, pCopy->dstPitch * pCopy->dstHeight, queue); + pCopy->dstPitch, pCopy->dstPitch * pCopy->dstHeight, stream); } else if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeHost)) { // Device to Host. return ihipMemcpyDtoHCommand(command, pCopy->srcDevice, pCopy->dstHost, srcOrigin, dstOrigin, copyRegion, pCopy->srcPitch, pCopy->srcPitch * pCopy->srcHeight, - pCopy->dstPitch, pCopy->dstPitch * pCopy->dstHeight, queue); + pCopy->dstPitch, pCopy->dstPitch * pCopy->dstHeight, stream); } else if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeDevice)) { // Device to Device. return ihipMemcpyDtoDCommand(command, pCopy->srcDevice, pCopy->dstDevice, srcOrigin, dstOrigin, copyRegion, pCopy->srcPitch, pCopy->srcPitch * pCopy->srcHeight, - pCopy->dstPitch, pCopy->dstPitch * pCopy->dstHeight, queue); + pCopy->dstPitch, pCopy->dstPitch * pCopy->dstHeight, stream); } else if ((srcMemoryType == hipMemoryTypeHost) && (dstMemoryType == hipMemoryTypeArray)) { // Host to Image. return ihipMemcpyHtoACommand(command, pCopy->srcHost, pCopy->dstArray, srcOrigin, dstOrigin, copyRegion, pCopy->srcPitch, pCopy->srcPitch * pCopy->srcHeight, - queue); + stream); } else if ((srcMemoryType == hipMemoryTypeArray) && (dstMemoryType == hipMemoryTypeHost)) { // Image to Host. return ihipMemcpyAtoHCommand(command, pCopy->srcArray, pCopy->dstHost, srcOrigin, dstOrigin, copyRegion, pCopy->dstPitch, pCopy->dstPitch * pCopy->dstHeight, - queue); + stream); } else if ((srcMemoryType == hipMemoryTypeDevice) && (dstMemoryType == hipMemoryTypeArray)) { // Device to Image. return ihipMemcpyDtoACommand(command, pCopy->srcDevice, pCopy->dstArray, srcOrigin, dstOrigin, copyRegion, pCopy->srcPitch, pCopy->srcPitch * pCopy->srcHeight, - queue); + stream); } else if ((srcMemoryType == hipMemoryTypeArray) && (dstMemoryType == hipMemoryTypeDevice)) { // Image to Device. return ihipMemcpyAtoDCommand(command, pCopy->srcArray, pCopy->dstDevice, srcOrigin, dstOrigin, copyRegion, pCopy->dstPitch, pCopy->dstPitch * pCopy->dstHeight, - queue); + stream); } else if ((srcMemoryType == hipMemoryTypeArray) && (dstMemoryType == hipMemoryTypeArray)) { // Image to Image. 
return ihipMemcpyAtoACommand(command, pCopy->srcArray, pCopy->dstArray, srcOrigin, dstOrigin, - copyRegion, queue); + copyRegion, stream); } else { ShouldNotReachHere(); } @@ -2194,14 +2207,14 @@ hipError_t ihipMemcpyParam3D(const HIP_MEMCPY3D* pCopy, hipStream_t stream, bool // Host to Host. return ihipMemcpyHtoH(pCopy->srcHost, pCopy->dstHost, srcOrigin, dstOrigin, copyRegion, pCopy->srcPitch, pCopy->srcPitch * pCopy->srcHeight, pCopy->dstPitch, - pCopy->dstPitch * pCopy->dstHeight, hip::getQueue(stream)); + pCopy->dstPitch * pCopy->dstHeight, hip::getStream(stream)); } else { amd::Command* command; - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } - status = ihipGetMemcpyParam3DCommand(command, pCopy, queue); + status = ihipGetMemcpyParam3DCommand(command, pCopy, hip_stream); if (status != hipSuccess) return status; // Transfers from device memory to pageable host memory and transfers from any host memory to any host memory @@ -2489,13 +2502,13 @@ hipError_t ihipMemcpyAtoD(hipArray* srcArray, void* dstDevice, amd::Coord3D srcO amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t dstRowPitch, size_t dstSlicePitch, hipStream_t stream, bool isAsync = false) { amd::Command* command; - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } hipError_t status = ihipMemcpyAtoDCommand(command, srcArray, dstDevice, srcOrigin, dstOrigin, copyRegion, - dstRowPitch, dstSlicePitch, queue); + dstRowPitch, dstSlicePitch, hip_stream); if (status != hipSuccess) return status; return ihipMemcpyCmdEnqueue(command, isAsync); } @@ -2503,13 +2516,13 @@ hipError_t ihipMemcpyDtoA(void* srcDevice, hipArray* dstArray, amd::Coord3D srcO amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, size_t srcSlicePitch, hipStream_t stream, bool isAsync = false) { amd::Command* command; - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } hipError_t status = ihipMemcpyDtoACommand(command, srcDevice, dstArray, srcOrigin, dstOrigin, copyRegion, - srcRowPitch, srcSlicePitch, queue); + srcRowPitch, srcSlicePitch, hip_stream); if (status != hipSuccess) return status; return ihipMemcpyCmdEnqueue(command, isAsync); } @@ -2518,13 +2531,13 @@ hipError_t ihipMemcpyDtoD(void* srcDevice, void* dstDevice, amd::Coord3D srcOrig size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, hipStream_t stream, bool isAsync = false) { amd::Command* command; - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } hipError_t status = ihipMemcpyDtoDCommand(command, srcDevice, dstDevice, srcOrigin, dstOrigin, copyRegion, srcRowPitch, srcSlicePitch, dstRowPitch, - dstSlicePitch, queue); + dstSlicePitch, hip_stream); if (status != hipSuccess) return status; return ihipMemcpyCmdEnqueue(command, isAsync); } @@ -2533,13 +2546,13 @@ hipError_t ihipMemcpyDtoH(void* srcDevice, void* dstHost, amd::Coord3D srcOrigin size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, hipStream_t stream, bool isAsync = false) { amd::Command* command; - amd::HostQueue* queue = hip::getQueue(stream); - if (queue 
== nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } hipError_t status = ihipMemcpyDtoHCommand(command, srcDevice, dstHost, srcOrigin, dstOrigin, copyRegion, srcRowPitch, srcSlicePitch, dstRowPitch, - dstSlicePitch, queue, isAsync); + dstSlicePitch, hip_stream, isAsync); if (status != hipSuccess) return status; return ihipMemcpyCmdEnqueue(command, isAsync); } @@ -2548,13 +2561,13 @@ hipError_t ihipMemcpyHtoD(const void* srcHost, void* dstDevice, amd::Coord3D src size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, hipStream_t stream, bool isAsync = false) { amd::Command* command; - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } hipError_t status = ihipMemcpyHtoDCommand(command, srcHost, dstDevice, srcOrigin, dstOrigin, copyRegion, srcRowPitch, srcSlicePitch, dstRowPitch, - dstSlicePitch, queue, isAsync); + dstSlicePitch, hip_stream, isAsync); if (status != hipSuccess) return status; return ihipMemcpyCmdEnqueue(command, isAsync); } @@ -2562,12 +2575,12 @@ hipError_t ihipMemcpyAtoA(hipArray* srcArray, hipArray* dstArray, amd::Coord3D s amd::Coord3D dstOrigin, amd::Coord3D copyRegion, hipStream_t stream, bool isAsync = false) { amd::Command* command; - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } hipError_t status = ihipMemcpyAtoACommand(command, srcArray, dstArray, srcOrigin, dstOrigin, - copyRegion, queue); + copyRegion, hip_stream); if (status != hipSuccess) return status; return ihipMemcpyCmdEnqueue(command, isAsync); } @@ -2575,13 +2588,13 @@ hipError_t ihipMemcpyHtoA(const void* srcHost, hipArray* dstArray, amd::Coord3D amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t srcRowPitch, size_t srcSlicePitch, hipStream_t stream, bool isAsync = false) { amd::Command* command; - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } hipError_t status = ihipMemcpyHtoACommand(command, srcHost, dstArray, srcOrigin, dstOrigin, copyRegion, - srcRowPitch, srcSlicePitch, queue, isAsync); + srcRowPitch, srcSlicePitch, hip_stream, isAsync); if (status != hipSuccess) return status; return ihipMemcpyCmdEnqueue(command, isAsync); } @@ -2589,13 +2602,13 @@ hipError_t ihipMemcpyAtoH(hipArray* srcArray, void* dstHost, amd::Coord3D srcOri amd::Coord3D dstOrigin, amd::Coord3D copyRegion, size_t dstRowPitch, size_t dstSlicePitch, hipStream_t stream, bool isAsync = false) { amd::Command* command; - amd::HostQueue* queue = hip::getQueue(stream); - if (queue == nullptr) { + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { return hipErrorInvalidValue; } hipError_t status = ihipMemcpyAtoHCommand(command, srcArray, dstHost, srcOrigin, dstOrigin, copyRegion, - dstRowPitch, dstSlicePitch, queue, isAsync); + dstRowPitch, dstSlicePitch, hip_stream, isAsync); if (status != hipSuccess) return status; return ihipMemcpyCmdEnqueue(command, isAsync); } @@ -2655,9 +2668,9 @@ hipError_t ihipMemcpy3D_validate(const hipMemcpy3DParms* p) { } hipError_t ihipMemcpy3DCommand(amd::Command*& command, const hipMemcpy3DParms* p, - amd::HostQueue* queue) { + hip::Stream* stream) { const HIP_MEMCPY3D desc = 
hip::getDrvMemcpy3DDesc(*p); - return ihipGetMemcpyParam3DCommand(command, &desc, queue); + return ihipGetMemcpyParam3DCommand(command, &desc, stream); } hipError_t ihipMemcpy3D(const hipMemcpy3DParms* p, hipStream_t stream, bool isAsync = false) { @@ -2715,8 +2728,8 @@ hipError_t hipDrvMemcpy3DAsync(const HIP_MEMCPY3D* pCopy, hipStream_t stream) { hipError_t packFillMemoryCommand(amd::Command*& command, amd::Memory* memory, size_t offset, int64_t value, size_t valueSize, size_t sizeBytes, - amd::HostQueue* queue) { - if ((memory == nullptr) || (queue == nullptr)) { + hip::Stream* stream) { + if ((memory == nullptr) || (stream == nullptr)) { return hipErrorInvalidValue; } @@ -2726,7 +2739,7 @@ hipError_t packFillMemoryCommand(amd::Command*& command, amd::Memory* memory, si // surface=[pitch, width, height] amd::Coord3D surface(sizeBytes, sizeBytes, 1); amd::FillMemoryCommand* fillMemCommand = - new amd::FillMemoryCommand(*queue, CL_COMMAND_FILL_BUFFER, waitList, *memory->asBuffer(), + new amd::FillMemoryCommand(*stream, CL_COMMAND_FILL_BUFFER, waitList, *memory->asBuffer(), &value, valueSize, fillOffset, fillSize, surface); if (fillMemCommand == nullptr) { return hipErrorOutOfMemory; @@ -2792,7 +2805,7 @@ hipError_t ihipGraphMemsetParams_validate(const hipMemsetParams* pNodeParams) { } hipError_t ihipMemsetCommand(std::vector<amd::Command*>& commands, void* dst, int64_t value, - size_t valueSize, size_t sizeBytes, amd::HostQueue* queue) { + size_t valueSize, size_t sizeBytes, hip::Stream* stream) { hipError_t hip_error = hipSuccess; auto aligned_dst = amd::alignUp(reinterpret_cast
(dst), sizeof(uint64_t)); size_t offset = 0; @@ -2802,7 +2815,7 @@ hipError_t ihipMemsetCommand(std::vector<amd::Command*>& commands, void* dst, in amd::Command* command; hip_error = packFillMemoryCommand(command, memory, offset, value, valueSize, sizeBytes, - queue); + stream); commands.push_back(command); return hip_error; @@ -2836,8 +2849,8 @@ hipError_t ihipMemset(void* dst, int64_t value, size_t valueSize, size_t sizeByt } } std::vector<amd::Command*> commands; - amd::HostQueue* queue = hip::getQueue(stream); - hip_error = ihipMemsetCommand(commands, dst, value, valueSize, sizeBytes, queue); + hip::Stream* hip_stream = hip::getStream(stream); + hip_error = ihipMemsetCommand(commands, dst, value, valueSize, sizeBytes, hip_stream); if (hip_error != hipSuccess) { break; } @@ -2954,13 +2967,13 @@ hipError_t ihipMemset3D_validate(hipPitchedPtr pitchedDevPtr, int value, hipExte } hipError_t ihipMemset3DCommand(std::vector<amd::Command*> &commands, hipPitchedPtr pitchedDevPtr, - int value, hipExtent extent, amd::HostQueue* queue, size_t elementSize = 1) { + int value, hipExtent extent, hip::Stream* stream, size_t elementSize = 1) { size_t offset = 0; auto sizeBytes = extent.width * extent.height * extent.depth; amd::Memory* memory = getMemoryObject(pitchedDevPtr.ptr, offset); if (pitchedDevPtr.pitch == extent.width) { return ihipMemsetCommand(commands, pitchedDevPtr.ptr, value, elementSize, - static_cast<size_t>(sizeBytes), queue); + static_cast<size_t>(sizeBytes), stream); } // Workaround for cases when pitch > row until fill kernel will be updated to support pitch. // Fall back to filling one row at a time. @@ -2976,7 +2989,7 @@ hipError_t ihipMemset3DCommand(std::vector<amd::Command*> &commands, hipPitchedP } amd::FillMemoryCommand* command; command = new amd::FillMemoryCommand( - *queue, CL_COMMAND_FILL_BUFFER, amd::Command::EventWaitList{}, *memory->asBuffer(), + *stream, CL_COMMAND_FILL_BUFFER, amd::Command::EventWaitList{}, *memory->asBuffer(), &value, elementSize, origin, region, surface); commands.push_back(command); return hipSuccess; @@ -3007,9 +3020,9 @@ hipError_t ihipMemset3D(hipPitchedPtr pitchedDevPtr, int value, hipExtent extent isAsync = true; } } - amd::HostQueue* queue = hip::getQueue(stream); + hip::Stream* hip_stream = hip::getStream(stream); std::vector<amd::Command*> commands; - status = ihipMemset3DCommand(commands, pitchedDevPtr, value, extent, queue); + status = ihipMemset3DCommand(commands, pitchedDevPtr, value, extent, hip_stream); if (status != hipSuccess) { return status; } @@ -3524,22 +3537,105 @@ hipError_t hipArrayDestroy(hipArray* array) { HIP_RETURN(ihipArrayDestroy(array)); } -hipError_t hipArray3DGetDescriptor(HIP_ARRAY3D_DESCRIPTOR* pArrayDescriptor, - hipArray* array) { - HIP_INIT_API(hipArray3DGetDescriptor, pArrayDescriptor, array); +hipError_t ihipArray3DGetDescriptor(HIP_ARRAY3D_DESCRIPTOR* desc, + hipArray* array) { + { + amd::ScopedLock lock(hip::hipArraySetLock); + if (hip::hipArraySet.find(array) == hip::hipArraySet.end()) { + return hipErrorInvalidHandle; + } + } - assert(false && "Unimplemented"); + desc->Width = array->width; + desc->Height = array->height; + desc->Depth = array->depth; + desc->Format = array->Format; + desc->NumChannels = array->NumChannels; + desc->Flags = array->flags; - HIP_RETURN(hipSuccess); + return hipSuccess; +} + +hipError_t hipArrayGetInfo(hipChannelFormatDesc* desc, + hipExtent* extent, + unsigned int* flags, + hipArray* array) { + HIP_INIT_API(hipArrayGetInfo, desc, extent, flags, array); + CHECK_STREAM_CAPTURE_SUPPORTED(); + + if (array == nullptr) { + HIP_RETURN(hipErrorInvalidHandle); + } + + // If 
all output parameters are nullptr, then no need to proceed further + if ((desc == nullptr) && (extent == nullptr) && (flags == nullptr)) { + HIP_RETURN(hipSuccess); + } + + HIP_ARRAY3D_DESCRIPTOR array3DDescriptor; + hipError_t status = ihipArray3DGetDescriptor(&array3DDescriptor, array); + + // Fill each output parameter + if (status == hipSuccess) { + if (desc != nullptr) { + *desc = hip::getChannelFormatDesc(array3DDescriptor.NumChannels, array3DDescriptor.Format); + } + + if (extent != nullptr) { + extent->width = array3DDescriptor.Width; + extent->height = array3DDescriptor.Height; + extent->depth = array3DDescriptor.Depth; + } + + if (flags != nullptr) { + *flags = array3DDescriptor.Flags; + } + } + + HIP_RETURN(status); } hipError_t hipArrayGetDescriptor(HIP_ARRAY_DESCRIPTOR* pArrayDescriptor, hipArray* array) { HIP_INIT_API(hipArrayGetDescriptor, pArrayDescriptor, array); + CHECK_STREAM_CAPTURE_SUPPORTED(); + + if (array == nullptr) { + HIP_RETURN(hipErrorInvalidHandle); + } - assert(false && "Unimplemented"); + if (pArrayDescriptor == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } - HIP_RETURN(hipSuccess); + HIP_ARRAY3D_DESCRIPTOR array3DDescriptor; + hipError_t status = ihipArray3DGetDescriptor(&array3DDescriptor, array); + + // Fill each output parameter + if (status == hipSuccess) { + pArrayDescriptor->Width = array3DDescriptor.Width; + pArrayDescriptor->Height = array3DDescriptor.Height; + pArrayDescriptor->Format = array3DDescriptor.Format; + pArrayDescriptor->NumChannels = array3DDescriptor.NumChannels; + } + + HIP_RETURN(status); +} + +hipError_t hipArray3DGetDescriptor(HIP_ARRAY3D_DESCRIPTOR* pArrayDescriptor, + hipArray* array) { + HIP_INIT_API(hipArray3DGetDescriptor, pArrayDescriptor, array); + CHECK_STREAM_CAPTURE_SUPPORTED(); + + if (array == nullptr) { + HIP_RETURN(hipErrorInvalidHandle); + } + + if (pArrayDescriptor == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + HIP_RETURN(ihipArray3DGetDescriptor(pArrayDescriptor, array)); } hipError_t hipMemcpyParam2DAsync(const hip_Memcpy2D* pCopy, @@ -3845,9 +3941,9 @@ hipError_t ihipMipmappedArrayDestroy(hipMipmappedArray_t mipmapped_array_ptr) { } for (auto& dev : g_devices) { - amd::HostQueue* queue = dev->NullStream(true); - if (queue != nullptr) { - queue->finish(); + hip::Stream* stream = dev->NullStream(true); + if (stream != nullptr) { + stream->finish(); } } diff --git a/src/hip_mempool.cpp b/src/hip_mempool.cpp index eea254f5..f798f8c8 100644 --- a/src/hip_mempool.cpp +++ b/src/hip_mempool.cpp @@ -70,7 +70,7 @@ hipError_t hipMallocAsync(void** dev_ptr, size_t size, hipStream_t stream) { if ((dev_ptr == nullptr) || (size == 0) || (!hip::isValid(stream))) { HIP_RETURN(hipErrorInvalidValue); } - auto hip_stream = (stream == nullptr) ? hip::getCurrentDevice()->GetNullStream() : + auto hip_stream = (stream == nullptr) ? hip::getCurrentDevice()->NullStream() : reinterpret_cast<hip::Stream*>(stream); auto device = hip_stream->GetDevice(); auto mem_pool = device->GetCurrentMemoryPool(); @@ -92,7 +92,7 @@ hipError_t hipFreeAsync(void* dev_ptr, hipStream_t stream) { auto memory = getMemoryObject(dev_ptr, offset); if (memory != nullptr) { auto id = memory->getUserData().deviceId; - auto hip_stream = (stream == nullptr) ? hip::getCurrentDevice()->GetNullStream() : + auto hip_stream = (stream == nullptr) ? hip::getCurrentDevice()->NullStream() : reinterpret_cast<hip::Stream*>(stream); if (!g_devices[id]->FreeMemory(memory, hip_stream)) { //! @todo It's not the most optimal logic. 
The current implementation has unconditional waits @@ -241,7 +241,7 @@ hipError_t hipMallocFromPoolAsync( STREAM_CAPTURE(hipMallocAsync, stream, mem_pool, size, dev_ptr); auto mpool = reinterpret_cast<hip::MemoryPool*>(mem_pool); - auto hip_stream = (stream == nullptr) ? hip::getCurrentDevice()->GetNullStream() : + auto hip_stream = (stream == nullptr) ? hip::getCurrentDevice()->NullStream() : reinterpret_cast<hip::Stream*>(stream); *dev_ptr = mpool->AllocateMemory(size, hip_stream); HIP_RETURN(hipSuccess); diff --git a/src/hip_mempool_impl.cpp b/src/hip_mempool_impl.cpp index ddec8f49..2606688f 100644 --- a/src/hip_mempool_impl.cpp +++ b/src/hip_mempool_impl.cpp @@ -397,4 +397,10 @@ void MemoryPool::GetAccess(hip::Device* device, hipMemAccessFlags* flags) { } } +void MemoryPool::FreeAllMemory(hip::Stream* stream) { + while (!busy_heap_.Allocations().empty()) { + FreeMemory(busy_heap_.Allocations().begin()->first, stream); + } +} + } diff --git a/src/hip_mempool_impl.hpp b/src/hip_mempool_impl.hpp index e42bc7eb..5e18cb35 100644 --- a/src/hip_mempool_impl.hpp +++ b/src/hip_mempool_impl.hpp @@ -136,7 +136,7 @@ class Heap : public amd::EmbeddedObject { bool IsActiveMemory(amd::Memory* memory) const { return (allocations_.find(memory) != allocations_.end()); } - + const auto& Allocations() { return allocations_; } private: Heap() = delete; Heap(const Heap&) = delete; @@ -213,6 +213,9 @@ class MemoryPool : public amd::ReferenceCountedObject { /// Set memory pool access by different devices void GetAccess(hip::Device* device, hipMemAccessFlags* flags); + /// Frees all busy memory + void FreeAllMemory(hip::Stream* stream = nullptr); + /// Accessors for the pool state bool EventDependencies() const { return (state_.event_dependencies_) ? true : false; } bool Opportunistic() const { return (state_.opportunistic_) ? true : false; }
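The new MemoryPool::FreeAllMemory() above drains whatever remains in the busy heap, which device teardown needs now that streams own their queues. From the application side, the pool machinery these mempool hunks refactor is driven by the public stream-ordered allocation API; a minimal usage sketch (hypothetical buffer size, error checks omitted):

hipStream_t stream;
hipStreamCreate(&stream);
void* buf = nullptr;
// Allocation is served from the device's current memory pool, in stream order.
hipMallocAsync(&buf, 1 << 20, stream);
// ... launch kernels that use buf on the same stream ...
// Returns the block to the pool once prior work on the stream completes.
hipFreeAsync(buf, stream);
hipStreamSynchronize(stream);
hipStreamDestroy(stream);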
@@ -223,7 +226,6 @@ class MemoryPool : public amd::ReferenceCountedObject { MemoryPool(const MemoryPool&) = delete; MemoryPool& operator=(const MemoryPool&) = delete; - Heap busy_heap_; //!< Heap of busy allocations Heap free_heap_; //!< Heap of freed allocations struct { diff --git a/src/hip_module.cpp b/src/hip_module.cpp index f3ae2611..a3fa4919 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -305,7 +305,7 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f, uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ, uint32_t sharedMemBytes, - amd::HostQueue* queue, void** kernelParams, void** extra, + hip::Stream* stream, void** kernelParams, void** extra, hipEvent_t startEvent = nullptr, hipEvent_t stopEvent = nullptr, uint32_t flags = 0, uint32_t params = 0, uint32_t gridId = 0, uint32_t numGrids = 0, uint64_t prevGridSum = 0, @@ -328,7 +328,7 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f, } amd::NDRangeKernelCommand* kernelCommand = new amd::NDRangeKernelCommand( - *queue, waitList, *kernel, ndrange, sharedMemBytes, params, gridId, numGrids, prevGridSum, + *stream, waitList, *kernel, ndrange, sharedMemBytes, params, gridId, numGrids, prevGridSum, allGridSum, firstDevice, profileNDRange); if (!kernelCommand) { return hipErrorOutOfMemory; @@ -336,10 +336,12 @@ hipError_t ihipLaunchKernelCommand(amd::Command*& command, hipFunction_t f, // Capture the kernel arguments if (CL_SUCCESS != kernelCommand->captureAndValidate()) { - delete kernelCommand; + kernelCommand->release(); return hipErrorOutOfMemory; } + command = kernelCommand; + return hipSuccess; } @@ -369,9 +371,9 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, return status; } amd::Command* command = nullptr; - amd::HostQueue* queue = hip::getQueue(hStream); + hip::Stream* hip_stream = hip::getStream(hStream); status = ihipLaunchKernelCommand(command, f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, - blockDimX, blockDimY, blockDimZ, sharedMemBytes, queue, + blockDimX, blockDimY, blockDimZ, sharedMemBytes, hip_stream, kernelParams, extra, startEvent, stopEvent, flags, params, gridId, numGrids, prevGridSum, allGridSum, firstDevice); if (status != hipSuccess) { @@ -386,12 +388,25 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, } } - command->enqueue(); - if (stopEvent != nullptr) { hip::Event* eStop = reinterpret_cast<hip::Event*>(stopEvent); + if (eStop->flags & hipEventDisableSystemFence) { + command->setEventScope(amd::Device::kCacheStateIgnore); + } else { + command->setEventScope(amd::Device::kCacheStateSystem); + } + // Enqueue Dispatch and bind the stop event + command->enqueue(); eStop->BindCommand(*command, false); + } else { + command->enqueue(); + } + + if (command->status() == CL_INVALID_OPERATION) { + command->release(); + return hipErrorIllegalState; } + command->release(); return hipSuccess; @@ -516,7 +531,8 @@ hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams* return hipErrorInvalidValue; } - if ((flags < 0) || (flags > hipCooperativeLaunchMultiDeviceNoPostSync)) { + if (flags > (hipCooperativeLaunchMultiDeviceNoPostSync + + hipCooperativeLaunchMultiDeviceNoPreSync)) { return hipErrorInvalidValue; }
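With the relaxed check above, the two multi-device sync-skipping flags can now be combined; the old code rejected anything beyond hipCooperativeLaunchMultiDeviceNoPostSync alone. A hedged sketch, assuming launchParamsList and numDevices have already been populated with one entry (and one stream) per participating device:

// Skip both the pre-launch and post-launch synchronization across devices.
unsigned int flags = hipCooperativeLaunchMultiDeviceNoPreSync |
                     hipCooperativeLaunchMultiDeviceNoPostSync;
hipError_t err =
    hipLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags);
// Any bits beyond the sum of the two flags now yield hipErrorInvalidValue.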
@@ -535,8 +551,8 @@ hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams* } if (launch.hStream != nullptr) { // Validate devices to make sure it doesn't have duplicates - amd::HostQueue* queue = reinterpret_cast<hip::Stream*>(launch.hStream)->asHostQueue(); - auto device = &queue->vdev()->device(); + hip::Stream* hip_stream = reinterpret_cast<hip::Stream*>(launch.hStream); + auto device = &hip_stream->vdev()->device(); for (int j = 0; j < numDevices; ++j) { if (mgpu_list[j] == device) { return hipErrorInvalidDevice; } @@ -553,23 +569,23 @@ hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams* // Sync the execution streams on all devices if ((flags & hipCooperativeLaunchMultiDeviceNoPreSync) == 0) { for (int i = 0; i < numDevices; ++i) { - amd::HostQueue* queue = - reinterpret_cast<hip::Stream*>(launchParamsList[i].hStream)->asHostQueue(); - queue->finish(); + hip::Stream* hip_stream = + reinterpret_cast<hip::Stream*>(launchParamsList[i].hStream); + hip_stream->finish(); } } for (int i = 0; i < numDevices; ++i) { const hipFunctionLaunchParams& launch = launchParamsList[i]; - amd::HostQueue* queue = reinterpret_cast<hip::Stream*>(launch.hStream)->asHostQueue(); + hip::Stream* hip_stream = reinterpret_cast<hip::Stream*>(launch.hStream); if (i == 0) { // The order of devices in the launch may not match the order in the global array for (size_t dev = 0; dev < g_devices.size(); ++dev) { // Find the matching device - if (&queue->vdev()->device() == g_devices[dev]->devices()[0]) { // Save ROCclr index of the first device in the launch - firstDevice = queue->vdev()->device().index(); + if (&hip_stream->vdev()->device() == g_devices[dev]->devices()[0]) { + firstDevice = hip_stream->vdev()->device().index(); break; } } @@ -599,9 +615,9 @@ hipError_t ihipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams* // Sync the execution streams on all devices if ((flags & hipCooperativeLaunchMultiDeviceNoPostSync) == 0) { for (int i = 0; i < numDevices; ++i) { - amd::HostQueue* queue = - reinterpret_cast<hip::Stream*>(launchParamsList[i].hStream)->asHostQueue(); - queue->finish(); + hip::Stream* hip_stream = + reinterpret_cast<hip::Stream*>(launchParamsList[i].hStream); + hip_stream->finish(); } } @@ -730,12 +746,12 @@ hipError_t ihipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsL return hipErrorInvalidValue; } - amd::HostQueue* queue = reinterpret_cast<hip::Stream*>(launch.stream)->asHostQueue(); + hip::Stream* hip_stream = hip::getStream(launch.stream); hipFunction_t func = nullptr; // The order of devices in the launch may not match the order in the global array for (size_t dev = 0; dev < g_devices.size(); ++dev) { // Find the matching device and request the kernel function - if (&queue->vdev()->device() == g_devices[dev]->devices()[0]) { + if (&hip_stream->vdev()->device() == g_devices[dev]->devices()[0]) { IHIP_RETURN_ONFAIL(PlatformState::instance().getStatFunc(&func, launch.func, dev)); break; } } diff --git a/src/hip_peer.cpp b/src/hip_peer.cpp index fb6b3329..17dc65da 100644 --- a/src/hip_peer.cpp +++ b/src/hip_peer.cpp @@ -220,7 +220,8 @@ hipError_t hipMemcpyPeer(void* dst, int dstDevice, const void* src, int srcDevic HIP_RETURN(hipErrorInvalidDevice); } - HIP_RETURN(hipMemcpy(dst, src, sizeBytes, hipMemcpyDeviceToDevice)); + HIP_RETURN(ihipMemcpy(dst, src, sizeBytes, hipMemcpyDeviceToDevice, *hip::getNullStream(), + true, false)); } hipError_t hipMemcpyPeerAsync(void* dst, int dstDevice, const void* src, int srcDevice, @@ -232,8 +233,14 @@ hipError_t hipMemcpyPeerAsync(void* dst, int dstDevice, const void* src, int src srcDevice < 0 || dstDevice < 0) { HIP_RETURN(hipErrorInvalidDevice); } - - HIP_RETURN(hipMemcpyAsync(dst, src, sizeBytes, hipMemcpyDeviceToDevice, stream)); + if (!hip::isValid(stream)) { + return 
hipErrorContextIsDestroyed; + } + hip::Stream* hip_stream = hip::getStream(stream); + if (hip_stream == nullptr) { + return hipErrorInvalidValue; + } + HIP_RETURN(ihipMemcpy(dst, src, sizeBytes, hipMemcpyDeviceToDevice, *hip_stream, true, true)); } hipError_t hipCtxEnablePeerAccess(hipCtx_t peerCtx, unsigned int flags) { diff --git a/src/hip_platform.cpp b/src/hip_platform.cpp index aebd7f99..10b8d3f8 100644 --- a/src/hip_platform.cpp +++ b/src/hip_platform.cpp @@ -33,8 +33,6 @@ PlatformState* PlatformState::platform_; // Initialized as nullptr by default // forward declaration of methods required for __hipRegisterManagedVar hipError_t ihipMallocManaged(void** ptr, size_t size, unsigned int align = 0); -hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, - amd::HostQueue& queue, bool isAsync = false); struct __CudaFatBinaryWrapper { unsigned int magic; @@ -146,9 +144,9 @@ extern "C" void __hipRegisterManagedVar( HIP_INIT_VOID(); hipError_t status = ihipMallocManaged(pointer, size, align); if (status == hipSuccess) { - amd::HostQueue* queue = hip::getNullStream(); - if (queue != nullptr) { - status = ihipMemcpy(*pointer, init_value, size, hipMemcpyHostToDevice, *queue); + hip::Stream* stream = hip::getNullStream(); + if (stream != nullptr) { + status = ihipMemcpy(*pointer, init_value, size, hipMemcpyHostToDevice, *stream); guarantee((status == hipSuccess), "Error during memcpy to managed memory!"); } else { ClPrint(amd::LOG_ERROR, amd::LOG_API, "Host Queue is NULL"); @@ -662,7 +660,10 @@ void PlatformState::init() { initialized_ = true; for (auto& it : statCO_.modules_) { hipError_t err = digestFatBinary(it.first, it.second); - assert(err == hipSuccess); + if (err != hipSuccess) { + HIP_ERROR_PRINT(err); + return; + } } for (auto& it : statCO_.vars_) { it.second->resize_dVar(g_devices.size()); } @@ -673,8 +674,6 @@ } hipError_t PlatformState::loadModule(hipModule_t* module, const char* fname, const void* image) { - amd::ScopedLock lock(lock_); - if (module == nullptr) { return hipErrorInvalidValue; } @@ -689,6 +688,7 @@ *module = dynCo->module(); assert(*module != nullptr); + amd::ScopedLock lock(lock_); if (dynCO_map_.find(*module) != dynCO_map_.end()) { delete dynCo; return hipErrorAlreadyMapped; diff --git a/src/hip_runtime.cpp b/src/hip_runtime.cpp index 4c877806..78eb4aca 100644 --- a/src/hip_runtime.cpp +++ b/src/hip_runtime.cpp @@ -46,9 +46,14 @@ extern "C" BOOL WINAPI DllMain(HINSTANCE hinst, DWORD reason, LPVOID reserved) { } #endif // DEBUG break; - case DLL_PROCESS_DETACH: + case DLL_PROCESS_DETACH: { + amd::Thread* thread = amd::Thread::current(); + if (!(thread != nullptr || + ((thread = new amd::HostThread()) != nullptr && thread == amd::Thread::current()))) { + return true; + } ihipDestroyDevice(); - break; + } break; case DLL_THREAD_DETACH: { amd::Thread* thread = amd::Thread::current(); delete thread; diff --git a/src/hip_stream.cpp b/src/hip_stream.cpp index 3d1e9168..6d085fe7 100644 --- a/src/hip_stream.cpp +++ b/src/hip_stream.cpp @@ -31,7 +31,8 @@ namespace hip { // ================================================================================================ Stream::Stream(hip::Device* dev, Priority p, unsigned int f, bool null_stream, const std::vector<uint32_t>& cuMask, hipStreamCaptureStatus captureStatus) - : queue_(nullptr), + : amd::HostQueue(*dev->asContext(), *dev->devices()[0], 0, amd::CommandQueue::RealTimeDisabled, +
convertToQueuePriority(p), cuMask), lock_("Stream Callback lock"), device_(dev), priority_(p), @@ -40,22 +41,11 @@ Stream::Stream(hip::Device* dev, Priority p, unsigned int f, bool null_stream, cuMask_(cuMask), captureStatus_(captureStatus), originStream_(false), - captureID_(0) {} - -// ================================================================================================ -Stream::~Stream() { - if (queue_ != nullptr) { - amd::ScopedLock lock(streamSetLock); - streamSet.erase(this); - - // Skip queue destruction for null stream in MT. Queue worker thread can be destroyed on - // the app exit, during the stream destruction, causing a race condition. - if (!null_ || AMD_DIRECT_DISPATCH) { - queue_->release(); - queue_ = nullptr; - } - } -} + captureID_(0) + { + amd::ScopedLock lock(streamSetLock); + streamSet.insert(this); + } // ================================================================================================ hipError_t Stream::EndCapture() { @@ -81,38 +71,16 @@ hipError_t Stream::EndCapture() { // ================================================================================================ bool Stream::Create() { - amd::CommandQueue::Priority p; - switch (priority_) { - case Priority::High: - p = amd::CommandQueue::Priority::High; - break; - case Priority::Low: - p = amd::CommandQueue::Priority::Low; - break; - case Priority::Normal: - default: - p = amd::CommandQueue::Priority::Normal; - break; - } - amd::HostQueue* queue = new amd::HostQueue(*device_->asContext(), *device_->devices()[0], - 0, amd::CommandQueue::RealTimeDisabled, - p, cuMask_); - - // Create a host queue - bool result = (queue != nullptr) ? queue->create() : false; - // Insert just created stream into the list of the blocking queues - if (result) { + return create(); +} + +// ================================================================================================ +bool Stream::terminate() { + { amd::ScopedLock lock(streamSetLock); - streamSet.insert(this); - queue_ = queue; - device_->SaveQueue(queue); - } else if (queue != nullptr) { - // Queue creation has failed, and virtual device associated with the queue may not be created. - // Just need to delete the queue instance. 
- delete queue; } - - return result; + return HostQueue::terminate(); } // ================================================================================================ @@ -134,29 +102,6 @@ bool isValid(hipStream_t& stream) { return true; } -// ================================================================================================ -amd::HostQueue* Stream::asHostQueue(bool skip_alloc) { - if (queue_ != nullptr) { - return queue_; - } - // Access to the stream object is lock protected, because possible allocation - amd::ScopedLock l(Lock()); - if (queue_ == nullptr) { - // Create the host queue for the first time - if (!skip_alloc) { - Create(); - } - } - return queue_; -} - -// ================================================================================================ -void Stream::Finish() const { - if (queue_ != nullptr) { - queue_->finish(); - } -} - // ================================================================================================ int Stream::DeviceId() const { return device_->deviceId(); @@ -180,7 +125,7 @@ void Stream::syncNonBlockingStreams(int deviceId) { for (auto& it : streamSet) { if (it->Flags() & hipStreamNonBlocking) { if (it->DeviceId() == deviceId) { - it->asHostQueue()->finish(); + it->finish(); } } } @@ -207,7 +152,7 @@ void Stream::destroyAllStreams(int deviceId) { } } for (auto& it : toBeDeleted) { - delete it; + it->release(); } } @@ -215,36 +160,48 @@ bool Stream::StreamCaptureOngoing(void) { return (g_allCapturingStreams.empty() == true) ? false : true; } +bool Stream::existsActiveStreamForDevice(hip::Device* device) { + + amd::ScopedLock lock(streamSetLock); + + for (const auto& active_stream : streamSet) { + if ((active_stream->GetDevice() == device) && + active_stream->GetQueueStatus()) { + return true; + } + } + return false; +} + };// hip namespace // ================================================================================================ -void iHipWaitActiveStreams(amd::HostQueue* blocking_queue, bool wait_null_stream) { +void iHipWaitActiveStreams(hip::Stream* blocking_stream, bool wait_null_stream) { amd::Command::EventWaitList eventWaitList(0); bool submitMarker = 0; { amd::ScopedLock lock(streamSetLock); - for (const auto& stream : streamSet) { - amd::HostQueue* active_queue = stream->asHostQueue(); + for (const auto& active_stream : streamSet) { // If it's the current device - if ((&active_queue->device() == &blocking_queue->device()) && + if ((&active_stream->device() == &blocking_stream->device()) && // Make sure it's a default stream - ((stream->Flags() & hipStreamNonBlocking) == 0) && + ((active_stream->Flags() & hipStreamNonBlocking) == 0) && // and it's not the current stream - (active_queue != blocking_queue) && + (active_stream != blocking_stream) && // check for a wait on the null stream - (stream->Null() == wait_null_stream)) { + (active_stream->Null() == wait_null_stream)) { // Get the last valid command - amd::Command* command = active_queue->getLastQueuedCommand(true); + amd::Command* command = active_stream->getLastQueuedCommand(true); if (command != nullptr) { amd::Event& event = command->event(); // Check HW status of the ROCclr event. 
// Note: not all ROCclr modes support HW status - bool ready = active_queue->device().IsHwEventReady(event); + bool ready = active_stream->device().IsHwEventReady(event); if (!ready) { ready = (command->status() == CL_COMPLETE); } - submitMarker |= active_queue->vdev()->isFenceDirty(); + submitMarker |= active_stream->vdev()->isFenceDirty(); // Check the current active status if (!ready) { command->notifyCmdQueue(); @@ -263,11 +220,19 @@ void iHipWaitActiveStreams(amd::HostQueue* blocking_queue, bool wait_null_stream // Check if we have to wait anything if (eventWaitList.size() > 0 || submitMarker) { - amd::Command* command = new amd::Marker(*blocking_queue, kMarkerDisableFlush, eventWaitList); + amd::Command* command = new amd::Marker(*blocking_stream, kMarkerDisableFlush, eventWaitList); if (command != nullptr) { command->enqueue(); command->release(); } + + //Reset the dirty flag for all streams now that the marker is submitted + for (const auto& stream : streamSet) { + amd::HostQueue* active_queue = stream->asHostQueue(); + if (active_queue->vdev()->isFenceDirty()) { + active_queue->vdev()->resetFenceDirty(); + } + } } // Release all active commands. It's safe after the marker was enqueued @@ -292,8 +257,11 @@ static hipError_t ihipStreamCreate(hipStream_t* stream, } hip::Stream* hStream = new hip::Stream(hip::getCurrentDevice(), priority, flags, false, cuMask); - if (hStream == nullptr || !hStream->Create()) { - delete hStream; + if (hStream == nullptr) { + return hipErrorOutOfMemory; + } + else if (!hStream->Create()) { + hStream->release(); return hipErrorOutOfMemory; } @@ -314,7 +282,7 @@ stream_per_thread::stream_per_thread() { stream_per_thread::~stream_per_thread() { for (auto &stream:m_streams) { if (stream != nullptr && hip::isValid(stream)) { - delete reinterpret_cast<hip::Stream*>(stream); + reinterpret_cast<hip::Stream*>(stream)->release(); stream = nullptr; } } @@ -453,7 +421,7 @@ hipError_t hipStreamSynchronize_common(hipStream_t stream) { } } // Wait for the current host queue - hip::getQueue(stream)->finish(); + hip::getStream(stream)->finish(); return hipSuccess; } @@ -502,7 +470,7 @@ hipError_t hipStreamDestroy(hipStream_t stream) { if (l_it != hip::tls.capture_streams_.end()) { hip::tls.capture_streams_.erase(l_it); } - delete s; + s->release(); HIP_RETURN(hipSuccess); } @@ -534,12 +502,6 @@ hipError_t hipStreamWaitEvent_common(hipStream_t stream, hipEvent_t event, unsig return hipErrorContextIsDestroyed; } - if (stream != nullptr) { - // If still capturing return error - if (hip::Stream::StreamCaptureOngoing() == true) { - HIP_RETURN(hipErrorStreamCaptureIsolation); - } - } hip::Event* e = reinterpret_cast<hip::Event*>(event); return e->streamWait(stream, flags); } @@ -568,9 +530,9 @@ hipError_t hipStreamQuery_common(hipStream_t stream) { HIP_RETURN(hipErrorStreamCaptureUnsupported); } } - amd::HostQueue* hostQueue = hip::getQueue(stream); + hip::Stream* hip_stream = hip::getStream(stream); - amd::Command* command = hostQueue->getLastQueuedCommand(true); + amd::Command* command = hip_stream->getLastQueuedCommand(true); if (command == nullptr) { // Nothing was submitted to the queue return hipSuccess; @@ -608,13 +570,13 @@ hipError_t streamCallback_common(hipStream_t stream, StreamCallback* cbo, void* return hipErrorContextIsDestroyed; } - amd::HostQueue* hostQueue = hip::getQueue(stream); - amd::Command* last_command = hostQueue->getLastQueuedCommand(true); + hip::Stream* hip_stream = hip::getStream(stream); + amd::Command* last_command = hip_stream->getLastQueuedCommand(true); amd::Command::EventWaitList 
eventWaitList; if (last_command != nullptr) { eventWaitList.push_back(last_command); } - amd::Command* command = new amd::Marker(*hostQueue, !kMarkerDisableFlush, eventWaitList); + amd::Command* command = new amd::Marker(*hip_stream, !kMarkerDisableFlush, eventWaitList); if (command == nullptr) { return hipErrorInvalidValue; } @@ -634,7 +596,7 @@ hipError_t streamCallback_common(hipStream_t stream, StreamCallback* cbo, void* // Add the new barrier to stall the stream, until the callback is done eventWaitList.clear(); eventWaitList.push_back(command); - amd::Command* block_command = new amd::Marker(*hostQueue, !kMarkerDisableFlush, eventWaitList); + amd::Command* block_command = new amd::Marker(*hip_stream, !kMarkerDisableFlush, eventWaitList); if (block_command == nullptr) { return hipErrorInvalidValue; } @@ -695,7 +657,7 @@ hipError_t hipLaunchHostFunc_spt(hipStream_t stream, hipHostFn_t fn, void* userD // ================================================================================================ hipError_t hipLaunchHostFunc(hipStream_t stream, hipHostFn_t fn, void* userData) { HIP_INIT_API(hipLaunchHostFunc, stream, fn, userData); - if (stream == nullptr) { + if (stream == nullptr && (hip::Stream::StreamCaptureOngoing() == true)) { HIP_RETURN(hipErrorStreamCaptureImplicit); } HIP_RETURN(hipLaunchHostFunc_common(stream, fn, userData)); @@ -833,3 +795,27 @@ hipError_t hipExtStreamGetCUMask(hipStream_t stream, uint32_t cuMaskSize, uint32 } HIP_RETURN(hipSuccess); } + +// ================================================================================================ +hipError_t hipStreamGetDevice(hipStream_t stream, hipDevice_t* device) { + HIP_INIT_API(hipStreamGetDevice, stream, device); + + if (device == nullptr) { + HIP_RETURN(hipErrorInvalidValue); + } + + if (!hip::isValid(stream)) { + return HIP_RETURN(hipErrorContextIsDestroyed); + } + + if (stream == nullptr) { // handle null stream + // null stream is associated with current device, return the device id associated with the + // current device + *device = hip::getCurrentDevice()->deviceId(); + } else { + getStreamPerThread(stream); + *device = reinterpret_cast<hip::Stream*>(stream)->DeviceId(); + } + + HIP_RETURN(hipSuccess); +}
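hipStreamGetDevice() above is the one new public entry point in this file; a short usage sketch (error checks omitted):

hipStream_t stream;
hipStreamCreate(&stream);
hipDevice_t device;
hipStreamGetDevice(stream, &device);   // device the stream was created on
hipStreamGetDevice(nullptr, &device);  // the null stream reports the current device
hipStreamDestroy(stream);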
diff --git a/src/hip_stream_ops.cpp b/src/hip_stream_ops.cpp index a3bed6cf..7032c4c6 100644 --- a/src/hip_stream_ops.cpp +++ b/src/hip_stream_ops.cpp @@ -69,11 +69,11 @@ hipError_t ihipStreamOperation(hipStream_t stream, cl_command_type cmdType, void return hipErrorInvalidValue; } - amd::HostQueue* queue = hip::getQueue(stream); + hip::Stream* hip_stream = hip::getStream(stream); amd::Command::EventWaitList waitList; amd::StreamOperationCommand* command = - new amd::StreamOperationCommand(*queue, cmdType, waitList, *memory->asBuffer(), + new amd::StreamOperationCommand(*hip_stream, cmdType, waitList, *memory->asBuffer(), value, mask, outFlags, offset, sizeBytes); if (command == nullptr) { diff --git a/src/hip_texture.cpp b/src/hip_texture.cpp index 610d93fe..8c443739 100644 --- a/src/hip_texture.cpp +++ b/src/hip_texture.cpp @@ -25,9 +25,6 @@ #include "hip_conversions.hpp" #include "platform/sampler.hpp" -hipError_t ihipMemcpy(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, - amd::HostQueue& queue, bool isAsync = false); - hipError_t ihipFree(void* ptr); struct __hip_texture { @@ -575,8 +572,8 @@ hipError_t hipBindTexture2D(size_t* offset, HIP_RETURN(err); } // Copy to device. - amd::HostQueue* queue = hip::getNullStream(); - HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *queue)); + hip::Stream* stream = hip::getNullStream(); + HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *stream)); } hipError_t ihipBindTextureToArray(const textureReference* texref, @@ -624,8 +621,8 @@ hipError_t hipBindTextureToArray(const textureReference* texref, HIP_RETURN(err); } // Copy to device. - amd::HostQueue* queue = hip::getNullStream(); - HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *queue)); + hip::Stream* stream = hip::getNullStream(); + HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *stream)); } hipError_t ihipBindTextureToMipmappedArray(const textureReference* texref, @@ -674,8 +671,8 @@ hipError_t hipBindTextureToMipmappedArray(const textureReference* texref, HIP_RETURN(err); } // Copy to device. - amd::HostQueue* queue = hip::getNullStream(); - HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *queue)); + hip::Stream* stream = hip::getNullStream(); + HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *stream)); } hipError_t hipUnbindTexture(const textureReference* texref) { @@ -701,8 +698,8 @@ hipError_t hipBindTexture(size_t* offset, HIP_RETURN(err); } // Copy to device. - amd::HostQueue* queue = hip::getNullStream(); - HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *queue)); + hip::Stream* stream = hip::getNullStream(); + HIP_RETURN(ihipMemcpy(refDevPtr, texref, refDevSize, hipMemcpyHostToDevice, *stream)); } hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, @@ -966,8 +963,8 @@ hipError_t hipTexRefSetArray(textureReference* texRef, HIP_RETURN(err); } // Copy to device. - amd::HostQueue* queue = hip::getNullStream(); - HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *queue)); + hip::Stream* stream = hip::getNullStream(); + HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *stream)); } hipError_t hipTexRefGetAddress(hipDeviceptr_t* dptr, @@ -1049,8 +1046,8 @@ hipError_t hipTexRefSetAddress(size_t* ByteOffset, HIP_RETURN(err); } // Copy to device. - amd::HostQueue* queue = hip::getNullStream(); - HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *queue)); + hip::Stream* stream = hip::getNullStream(); + HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *stream)); } hipError_t hipTexRefSetAddress2D(textureReference* texRef, @@ -1091,8 +1088,8 @@ hipError_t hipTexRefSetAddress2D(textureReference* texRef, HIP_RETURN(err); } // Copy to device. - amd::HostQueue* queue = hip::getNullStream(); - HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *queue)); + hip::Stream* stream = hip::getNullStream(); + HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *stream)); } hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, hipChannelFormatKind f) { @@ -1454,8 +1451,8 @@ hipError_t hipTexRefSetMipmappedArray(textureReference* texRef, HIP_RETURN(err); } // Copy to device. 
- amd::HostQueue* queue = hip::getNullStream(); - HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *queue)); + hip::Stream* stream = hip::getNullStream(); + HIP_RETURN(ihipMemcpy(refDevPtr, texRef, refDevSize, hipMemcpyHostToDevice, *stream)); } hipError_t hipTexObjectCreate(hipTextureObject_t* pTexObject, diff --git a/src/hiprtc/cmake/HIPRTC.cmake b/src/hiprtc/cmake/HIPRTC.cmake index 19b59f36..521b0234 100644 --- a/src/hiprtc/cmake/HIPRTC.cmake +++ b/src/hiprtc/cmake/HIPRTC.cmake @@ -1,4 +1,4 @@ -# Copyright (c) 2021 - 2022 Advanced Micro Devices, Inc. All Rights Reserved. +# Copyright (c) 2021 - 2023 Advanced Micro Devices, Inc. All Rights Reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -29,6 +29,7 @@ function(get_hiprtc_macros HIPRTC_DEFINES) set(${HIPRTC_DEFINES} "#pragma clang diagnostic push\n\ #pragma clang diagnostic ignored \"-Wreserved-id-macro\"\n\ +#pragma clang diagnostic ignored \"-Wc++98-compat-pedantic\"\n\ #define __device__ __attribute__((device))\n\ #define __host__ __attribute__((host))\n\ #define __global__ __attribute__((global))\n\ @@ -51,7 +52,10 @@ function(get_hiprtc_macros HIPRTC_DEFINES) #pragma clang diagnostic pop\n\ #define HIP_INCLUDE_HIP_HIP_RUNTIME_H\n\ #define HIP_INCLUDE_HIP_HIP_FP16_H\n\ +#pragma clang diagnostic push\n\ +#pragma clang diagnostic ignored \"-Wreserved-macro-identifier\"\n\ #define _HIP_BFLOAT16_H_\n\ +#pragma clang diagnostic pop\n\ #define HIP_INCLUDE_HIP_HIP_VECTOR_TYPES_H" PARENT_SCOPE) endfunction(get_hiprtc_macros) @@ -63,12 +67,15 @@ if(HIPRTC_ADD_MACROS) FILE(APPEND ${HIPRTC_PREPROCESSED_FILE} "${HIPRTC_DEFINES}") FILE(READ "${HIPRTC_WARP_HEADER_FILE}" HIPRTC_WARP_HEADER) FILE(APPEND ${HIPRTC_PREPROCESSED_FILE} "${HIPRTC_WARP_HEADER}") +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wreserved-macro-identifier" FILE(READ "${HIPRTC_COOP_HELPER_FILE}" HIPRTC_COOP_HELPER) FILE(APPEND ${HIPRTC_PREPROCESSED_FILE} "${HIPRTC_COOP_HELPER}") FILE(READ "${HIPRTC_COOP_HEADER_FILE}" HIPRTC_COOP_HEADER) FILE(APPEND ${HIPRTC_PREPROCESSED_FILE} "${HIPRTC_COOP_HEADER}") FILE(READ "${HIPRTC_UNSAFE_ATOMICS_FILE}" HIPRTC_UNSAFE_ATOMICS) FILE(APPEND ${HIPRTC_PREPROCESSED_FILE} "${HIPRTC_UNSAFE_ATOMICS}") +#pragma clang diagnostic pop endif() macro(generate_hiprtc_header HiprtcHeader) diff --git a/src/hiprtc/hiprtcComgrHelper.cpp b/src/hiprtc/hiprtcComgrHelper.cpp index b8dca86c..c3794c61 100644 --- a/src/hiprtc/hiprtcComgrHelper.cpp +++ b/src/hiprtc/hiprtcComgrHelper.cpp @@ -157,6 +157,11 @@ static bool getProcName(uint32_t EFlags, std::string& proc_name, bool& xnackSupp sramEccSupported = false; proc_name = "gfx90c"; break; + case EF_AMDGPU_MACH_AMDGCN_GFX940: + xnackSupported = true; + sramEccSupported = true; + proc_name = "gfx940"; + break; case EF_AMDGPU_MACH_AMDGCN_GFX1010: xnackSupported = true; sramEccSupported = false; @@ -172,6 +177,11 @@ static bool getProcName(uint32_t EFlags, std::string& proc_name, bool& xnackSupp sramEccSupported = false; proc_name = "gfx1012"; break; + case EF_AMDGPU_MACH_AMDGCN_GFX1013: + xnackSupported = true; + sramEccSupported = false; + proc_name = "gfx1013"; + break; case EF_AMDGPU_MACH_AMDGCN_GFX1030: xnackSupported = false; sramEccSupported = false; @@ -192,6 +202,41 @@ static bool getProcName(uint32_t EFlags, std::string& proc_name, bool& xnackSupp sramEccSupported = false; proc_name = "gfx1033"; break; + case EF_AMDGPU_MACH_AMDGCN_GFX1034: 
+ xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1034"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1035: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1035"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1036: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1036"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1100: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1100"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1101: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1101"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1102: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1102"; + break; + case EF_AMDGPU_MACH_AMDGCN_GFX1103: + xnackSupported = false; + sramEccSupported = false; + proc_name = "gfx1103"; + break; default: return false; } diff --git a/src/hiprtc/hiprtcInternal.cpp b/src/hiprtc/hiprtcInternal.cpp index dfff262c..188b42d8 100644 --- a/src/hiprtc/hiprtcInternal.cpp +++ b/src/hiprtc/hiprtcInternal.cpp @@ -1,5 +1,5 @@ /* -Copyright (c) 2022 - Present Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -117,11 +117,6 @@ RTCCompileProgram::RTCCompileProgram(std::string name_) : RTCProgram(name_), fgp compile_options_.reserve(20); // count of options below compile_options_.push_back("-O3"); -#ifdef HIPRTC_EARLY_INLINE - compile_options_.push_back("-mllvm"); - compile_options_.push_back("-amdgpu-early-inline-all"); -#endif - if (GPU_ENABLE_WGP_MODE) compile_options_.push_back("-mcumode"); if (!GPU_ENABLE_WAVE32_MODE) compile_options_.push_back("-mwavefrontsize64"); @@ -135,6 +130,8 @@ RTCCompileProgram::RTCCompileProgram(std::string name_) : RTCProgram(name_), fgp compile_options_.push_back("hiprtc_runtime.h"); compile_options_.push_back("-std=c++14"); compile_options_.push_back("-nogpuinc"); + compile_options_.push_back("-Wno-gnu-line-marker"); + compile_options_.push_back("-Wno-missing-prototypes"); #ifdef _WIN32 compile_options_.push_back("-target"); compile_options_.push_back("x86_64-pc-windows-msvc"); @@ -186,7 +183,23 @@ bool RTCCompileProgram::addBuiltinHeader() { return true; } -bool RTCCompileProgram::transformOptions() { +bool RTCCompileProgram::findLLVMOptions(const std::vector<std::string>& options, + std::vector<std::string>& llvm_options) { + for (size_t i = 0; i < options.size(); ++i) { + if (options[i] == "-mllvm") { + if (options.size() == (i+1)) { + LogInfo( + "-mllvm option passed by the app, it comes as a pair but there is no option after this"); + return false; + } + llvm_options.push_back(options[i]); + llvm_options.push_back(options[i + 1]); + } + } + return true; +} + +bool RTCCompileProgram::transformOptions(std::vector<std::string>& compile_options) { auto getValueOf = [](const std::string& option) { std::string res; auto f = std::find(option.begin(), option.end(), '='); @@ -194,7 +207,7 @@ return res; }; - for (auto& i : compile_options_) { + for (auto& i : compile_options) { if (i == "-hip-pch") { LogInfo( "-hip-pch is deprecated option, has no impact on execution of new hiprtc programs, it " ... } } if (auto res = std::find_if( - compile_options_.begin(), compile_options_.end(), + compile_options.begin(), compile_options.end(), [](const 
std::string& str) { return str.find("--offload-arch=") != std::string::npos; }); - res != compile_options_.end()) { + res != compile_options.end()) { auto isaName = getValueOf(*res); isa_ = "amdgcn-amd-amdhsa--" + isaName; settings_.offloadArchProvided = true; @@ -240,15 +253,21 @@ bool RTCCompileProgram::compile(const std::vector<std::string>& options, bool fg fgpu_rdc_ = fgpu_rdc; // Append compile options - compile_options_.reserve(compile_options_.size() + options.size()); - compile_options_.insert(compile_options_.end(), options.begin(), options.end()); + std::vector<std::string> compileOpts(compile_options_); + compileOpts.reserve(compile_options_.size() + options.size() + 2); + compileOpts.insert(compileOpts.end(), options.begin(), options.end()); + + if (!fgpu_rdc_) { + compileOpts.push_back("-Xclang"); + compileOpts.push_back("-disable-llvm-passes"); + } - if (!transformOptions()) { + if (!transformOptions(compileOpts)) { LogError("Error in hiprtc: unable to transform options"); return false; } - if (!compileToBitCode(compile_input_, isa_, compile_options_, build_log_, LLVMBitcode_)) { + if (!compileToBitCode(compile_input_, isa_, compileOpts, build_log_, LLVMBitcode_)) { LogError("Error in hiprtc: unable to compile source to bitcode"); return false; } @@ -285,14 +304,30 @@ bool RTCCompileProgram::compile(const std::vector<std::string>& options, bool fg return false; } + std::vector<std::string> llvmOptions; + // Find the -mllvm options passed by the app such as "-mllvm" "-amdgpu-early-inline-all=true" + if (!findLLVMOptions(options, llvmOptions)) { + LogError("Error in hiprtc: unable to match -mllvm options"); + return false; + } + + std::vector<std::string> exeOpts(exe_options_); + exeOpts.reserve(exeOpts.size() + llvmOptions.size() + 2); + // Add these options by default for optimizations during BC to Relocatable phase. + exeOpts.push_back("-mllvm"); + exeOpts.push_back("-amdgpu-internalize-symbols"); + // User provided -mllvm options are appended at the end since they can override the above + // default options if necessary + exeOpts.insert(exeOpts.end(), llvmOptions.begin(), llvmOptions.end()); + if (settings_.dumpISA) { - if (!dumpIsaFromBC(exec_input_, isa_, exe_options_, name_, build_log_)) { + if (!dumpIsaFromBC(exec_input_, isa_, exeOpts, name_, build_log_)) { LogError("Error in hiprtc: unable to dump isa code"); return false; } } - if (!createExecutable(exec_input_, isa_, exe_options_, build_log_, executable_)) { + if (!createExecutable(exec_input_, isa_, exeOpts, build_log_, executable_)) { LogError("Error in hiprtc: unable to create executable"); return false; } diff --git a/src/hiprtc/hiprtcInternal.hpp b/src/hiprtc/hiprtcInternal.hpp index 8b846e49..28fcc90f 100644 --- a/src/hiprtc/hiprtcInternal.hpp +++ b/src/hiprtc/hiprtcInternal.hpp @@ -146,7 +146,9 @@ class RTCCompileProgram : public RTCProgram { // Private Member functions bool addSource_impl(); bool addBuiltinHeader(); - bool transformOptions(); + bool transformOptions(std::vector<std::string>& compile_options); + bool findLLVMOptions(const std::vector<std::string>& options, + std::vector<std::string>& llvm_options); RTCCompileProgram() = delete; RTCCompileProgram(RTCCompileProgram&) = delete;
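Taken together, the hiprtc changes stop folding user options into the persistent compile_options_ vector, defer optimization to the link step for non-RDC compiles via -Xclang -disable-llvm-passes, and forward any "-mllvm" pairs supplied by the caller into the bitcode-to-relocatable phase after the default -amdgpu-internalize-symbols. The early inlining that used to be hardwired under HIPRTC_EARLY_INLINE can therefore be requested per compile; a sketch, assuming prog is an existing hiprtcProgram (error checks omitted):

const char* options[] = {"--offload-arch=gfx90a",                     // target ISA
                         "-mllvm", "-amdgpu-early-inline-all=true"};  // forwarded as a pair
hiprtcResult res = hiprtcCompileProgram(prog, 3, options);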