diff --git a/tests/amdgpu/amdgpu_waves.c b/tests/amdgpu/amdgpu_waves.c
new file mode 100644
index 000000000..0605000b0
--- /dev/null
+++ b/tests/amdgpu/amdgpu_waves.c
@@ -0,0 +1,515 @@
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "amdgpu.h"
+#include "amdgpu_drm.h"
+#include "amdgpu_waves.h"
+
+#define mmSQ_IND_INDEX 0x8de0
+#define mmSQ_IND_DATA 0x8de4
+
+#define AMDGPU_MAX_SE 4
+#define AMDGPU_SH_PER_SE 1
+#define AMDGPU_CU_PER_SH 16
+
+#define AMDGPU_WAVE_STATUS_INDEX 1
+#define AMDGPU_WAVE_PC_LOW_INDEX 2
+#define AMDGPU_WAVE_PC_HI_INDEX 3
+#define AMDGPU_WAVE_EXEC_LOW_INDEX 4
+#define AMDGPU_WAVE_EXEC_HI_INDEX 5
+#define AMDGPU_WAVE_HW_ID_INDEX 6
+#define AMDGPU_WAVE_GPR_ALLOC_INDEX 8
+#define AMDGPU_WAVE_LDS_ALLOC_INDEX 9
+#define AMDGPU_WAVE_TRAPSTS_INDEX 10
+#define AMDGPU_WAVE_IB_STS_INDEX 11
+
+#define AMDGPU_WAVE_STATUS_VALID_MASK (1 << 16)
+
+#define AMDGPU_WAVE_HW_ID_WAVE_ID_SHIFT 0
+#define AMDGPU_WAVE_HW_ID_SIMD_ID_SHIFT 4
+#define AMDGPU_WAVE_HW_ID_PIPE_ID_SHIFT 6
+#define AMDGPU_WAVE_HW_ID_CU_ID_SHIFT 8
+#define AMDGPU_WAVE_HW_ID_SH_ID_SHIFT 12
+#define AMDGPU_WAVE_HW_ID_SE_ID_SHIFT 13
+#define AMDGPU_WAVE_HW_ID_WAVE_ID_MASK 0xF
+#define AMDGPU_WAVE_HW_ID_SIMD_ID_MASK 0x3
+#define AMDGPU_WAVE_HW_ID_PIPE_ID_MASK 0x3
+#define AMDGPU_WAVE_HW_ID_CU_ID_MASK 0xF
+#define AMDGPU_WAVE_HW_ID_SH_ID_MASK 0x1
+#define AMDGPU_WAVE_HW_ID_SE_ID_MASK 0x3
+
+#define AMDGPU_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT 8
+#define AMDGPU_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT 24
+#define AMDGPU_WAVE_GPR_ALLOC_VGPR_SIZE_MASK 0x3F
+#define AMDGPU_WAVE_GPR_ALLOC_SGPR_SIZE_MASK 0xF
+
+#define AMDGPU_FAMILY_AI_DEV_ID_MASK 0x686
+#define AMDGPU_FAMILY_MATCH(x, y) (((x) >> 4) == (y))
+
+#define AMDGPU_MMIO_SE_OR_ME_SHIFT 24
+#define AMDGPU_MMIO_SH_OR_PIPE_SHIFT 34
+#define AMDGPU_MMIO_CU_OR_QUEUE_SHIFT 44
+#define AMDGPU_MMIO_USE_RING 61 /* ME_PIPE=1 */
+#define AMDGPU_MMIO_USE_BANK 62 /* SE_SH_CU=1 */
+
+#define AMDGPU_GPR_BANK_SEL_SHIFT 60 /* VGPR=0, SGPR=1 */
+#define AMDGPU_GPR_THREAD_SEL_SHIFT 52
+#define AMDGPU_GPR_SIMD_SEL_SHIFT 44
+#define AMDGPU_GPR_WAVE_ID_SEL_SHIFT 36
+#define AMDGPU_GPR_CU_SEL_SHIFT 28
+#define AMDGPU_GPR_SH_SEL_SHIFT 20
+#define AMDGPU_GPR_SE_SEL_SHIFT 12
+
+static inline uint32_t amdgpu_read_bits(uint32_t value, int shift,
+					uint32_t mask)
+{
+	return (value >> shift) & mask;
+}
+
+static int amdgpu_open_debugfs(const char *fmt, uint32_t instance)
+{
+	int fd;
+	char filepath[128] = { 0 };
+
+	if (snprintf(filepath, 128, fmt, instance) <= 0) {
+		printf("Failed to prepare debugfs path.\n");
+		return -1;
+	}
+
+	fd = open(filepath, O_RDWR);
+	if (fd < 0) {
+		printf("Failed to open %s: %s.\n", filepath, strerror(errno));
+		return -1;
+	}
+
+	return fd;
+}
+
+static int amdgpu_read_regu32(struct amdgpu_waves_handle *wh,
+			      const uint64_t addr, uint32_t *value)
+{
+	ssize_t sz;
+	off_t offset;
+
+	offset = lseek(wh->fd.mmio_reg, addr, SEEK_SET);
+	if (offset < 0)
+		return -1;
+
+	sz = read(wh->fd.mmio_reg, value, sizeof(uint32_t));
+	if (sz != sizeof(uint32_t))
+		return -1;
+
+	return 0;
+}
+
+static int amdgpu_write_regu32(struct amdgpu_waves_handle *wh,
+			       const uint64_t addr, const uint32_t value)
+{
+	ssize_t sz;
+	off_t offset;
+
+	offset = lseek(wh->fd.mmio_reg, addr, SEEK_SET);
+	if (offset < 0)
+		return -1;
+
+	sz = write(wh->fd.mmio_reg, &value, sizeof(uint32_t));
+	if (sz != sizeof(uint32_t))
+		return -1;
+
+	return 0;
+}
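+
+/*
+ * Wave state is not read from a flat MMIO range: a selector is written to
+ * mmSQ_IND_INDEX and the selected word is then read back through
+ * mmSQ_IND_DATA, with the SE/SH/CU bank selection OR-ed into the debugfs
+ * offset.  amdgpu_is_cu_active() below is the minimal example of the
+ * pattern:
+ *
+ *	amdgpu_write_regu32(wh, msbs | mmSQ_IND_INDEX, selector);
+ *	amdgpu_read_regu32(wh, msbs | mmSQ_IND_DATA, &value);
+ */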
+
+static int amdgpu_is_cu_active(struct amdgpu_waves_handle *wh, uint32_t se,
+			       uint32_t sh, uint32_t cu)
+{
+	int ret;
+	uint64_t addr_msbs;
+	uint32_t value = 0;
+
+	addr_msbs = ((uint64_t)cu << AMDGPU_MMIO_CU_OR_QUEUE_SHIFT) |
+		    ((uint64_t)sh << AMDGPU_MMIO_SH_OR_PIPE_SHIFT) |
+		    ((uint64_t)se << AMDGPU_MMIO_SE_OR_ME_SHIFT) |
+		    ((uint64_t)1 << AMDGPU_MMIO_USE_BANK);
+
+	ret = amdgpu_write_regu32(wh, addr_msbs | mmSQ_IND_INDEX, (1 << 19));
+	if (ret)
+		return ret;
+
+	ret = amdgpu_read_regu32(wh, addr_msbs | mmSQ_IND_DATA, &value);
+	if (ret)
+		return ret;
+
+	/* poison pattern: treat the register as unreadable */
+	if (value == 0xbebebeef)
+		return -EINVAL;
+
+	if (value & 1)
+		return 0;
+
+	return -EBADF;
+}
+
+static int amdgpu_waves_check_device(uint32_t instance)
+{
+	FILE *file;
+	char filepath[256] = { 0 };
+	char name[64] = { 0 };
+	char *dev;
+	uint32_t dev_id;
+
+	if (snprintf(filepath, 256, "/sys/kernel/debug/dri/%d/name",
+		     instance) <= 0) {
+		printf("Failed to prepare debugfs path.\n");
+		return -1;
+	}
+
+	file = fopen(filepath, "r");
+	if (!file) {
+		printf("Failed to open: %s: %s.\n", filepath, strerror(errno));
+		return -1;
+	}
+
+	if (fscanf(file, "%*s %63s", name) != 1) {
+		printf("Failed to read device name\n");
+		fclose(file);
+		return -1;
+	}
+
+	fclose(file);
+	dev = strstr(name, "dev=");
+	if (!dev)
+		return -1;
+
+	/* skip 'dev=' */
+	dev += 4;
+	if (snprintf(filepath, 256, "/sys/bus/pci/devices/%s/device", dev) <=
+	    0) {
+		printf("Failed to prepare pci device path.\n");
+		return -1;
+	}
+
+	file = fopen(filepath, "r");
+	if (!file) {
+		printf("Failed to open: %s: %s.\n", filepath, strerror(errno));
+		return -1;
+	}
+
+	if (fscanf(file, "%x", &dev_id) != 1) {
+		printf("Failed to read device id\n");
+		fclose(file);
+		return -1;
+	}
+
+	fclose(file);
+	if (AMDGPU_FAMILY_MATCH(dev_id, AMDGPU_FAMILY_AI_DEV_ID_MASK))
+		return 0;
+
+	return -1;
+}
+
+int amdgpu_waves_create(struct amdgpu_waves_handle *w_handle)
+{
+	uint32_t instance = 0;
+
+	if (!w_handle)
+		return -1;
+
+	if (amdgpu_waves_check_device(instance) < 0)
+		return -1;
+
+	w_handle->fd.mmio_reg = amdgpu_open_debugfs(
+		"/sys/kernel/debug/dri/%d/amdgpu_regs", instance);
+	if (w_handle->fd.mmio_reg < 0)
+		return w_handle->fd.mmio_reg;
+
+	w_handle->fd.waves = amdgpu_open_debugfs(
+		"/sys/kernel/debug/dri/%d/amdgpu_wave", instance);
+	if (w_handle->fd.waves < 0) {
+		close(w_handle->fd.mmio_reg);
+		return w_handle->fd.waves;
+	}
+
+	w_handle->fd.gpr = amdgpu_open_debugfs(
+		"/sys/kernel/debug/dri/%d/amdgpu_gpr", instance);
+	if (w_handle->fd.gpr < 0) {
+		close(w_handle->fd.mmio_reg);
+		close(w_handle->fd.waves);
+		return w_handle->fd.gpr;
+	}
+
+	return 0;
+}
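+
+/*
+ * The three debugfs files opened above split the work: amdgpu_regs gives
+ * raw MMIO access, amdgpu_wave returns the decoded wave state words
+ * indexed by the AMDGPU_WAVE_*_INDEX values, and amdgpu_gpr streams raw
+ * SGPR/VGPR contents.  All of them live under /sys/kernel/debug/dri/<n>/
+ * and therefore normally require root.
+ */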
printf("Thread[%02x]: %s\n", thread, + (exec & (1 << thread)) ? "Executing" : "Not Executing"); + for (i = 0; i < vgpr_count;) { + if (i == 0) { + printf(" "); + for (j = 0; j < 16 && (i + j) < vgpr_count; ++j) { + printf(" %08x", j); + } + printf("\n"); + } + printf("VGPR %3x:", i); + for (j = 0; j < 16 && (i + j) < vgpr_count; ++j) { + printf(" %08x", buffer[i + j]); + } + + i += j; + printf("\n"); + } + + printf("\n"); + + return 0; +} + +static int amdgpu_print_sgprs(struct amdgpu_waves_handle *w_handle, + uint32_t se_id, uint32_t sh_id, uint32_t cu_id, + uint32_t simd_id, uint32_t wave_id, + uint32_t sgpr_count) +{ + int ret; + uint32_t i; + uint32_t j; + ssize_t sz; + uint32_t buffer[256 * 4] = { 0 }; + off_t addr_msbs; + + addr_msbs = ((uint64_t)se_id << AMDGPU_GPR_SE_SEL_SHIFT) | + ((uint64_t)sh_id << AMDGPU_GPR_SH_SEL_SHIFT) | + ((uint64_t)cu_id << AMDGPU_GPR_CU_SEL_SHIFT) | + ((uint64_t)wave_id << AMDGPU_GPR_WAVE_ID_SEL_SHIFT) | + ((uint64_t)simd_id << AMDGPU_GPR_SIMD_SEL_SHIFT) | + ((uint64_t)1 << AMDGPU_GPR_BANK_SEL_SHIFT); + + addr_msbs = lseek(w_handle->fd.gpr, addr_msbs, SEEK_SET); + if (addr_msbs < 0) + return -1; + + sz = read(w_handle->fd.gpr, &buffer, sizeof(uint32_t) * sgpr_count); + if (sz < 0) + return -1; + + for (i = 0; i < sgpr_count;) { + if (i == 0) { + printf("\n "); + for (j = 0; j < 16 && (i + j) < sgpr_count; ++j) { + printf(" %08x", j); + } + printf("\n"); + } + printf("SGPR %3x:", i); + for (j = 0; j < 16 && (i + j) < sgpr_count; ++j) { + printf(" %08x", buffer[i + j]); + } + + i += j; + printf("\n"); + } + + printf("\n"); + + return 0; +} + +static int amdgpu_print_wavedata(struct amdgpu_waves_handle *w_handle, + uint32_t se, uint32_t sh, uint32_t cu, + uint32_t simd, uint32_t wave) +{ + int i; + int ret; + ssize_t sz; + uint32_t buffer[32] = { 0 }; + off_t addr_msbs; + uint32_t value = 0, se_id, sh_id, cu_id, wave_id, simd_id, sgpr_size, + vgpr_size; + + addr_msbs = ((uint64_t)se << 7) | ((uint64_t)sh << 15) | + ((uint64_t)cu << 23) | ((uint64_t)wave << 31) | + ((uint64_t)simd << 37); + + addr_msbs = lseek(w_handle->fd.waves, addr_msbs, SEEK_SET); + if (addr_msbs < 0) + return -1; + + sz = read(w_handle->fd.waves, &buffer, sizeof(uint32_t) * 32); + if (sz < 0) + return -1; + + if (buffer[0] != 1) + return -1; + + if ((buffer[AMDGPU_WAVE_STATUS_INDEX] & + AMDGPU_WAVE_STATUS_VALID_MASK) == 0) + return -1; + + printf("%2u %2u %2u %4u %4u " + "%08x %08x %08x %08x " + "%08x %08x %08x %08x" + " %08x %08x\n", + se, sh, cu, simd, wave, buffer[AMDGPU_WAVE_STATUS_INDEX], + buffer[AMDGPU_WAVE_PC_LOW_INDEX], + buffer[AMDGPU_WAVE_PC_HI_INDEX], + buffer[AMDGPU_WAVE_EXEC_LOW_INDEX], + buffer[AMDGPU_WAVE_EXEC_HI_INDEX], + buffer[AMDGPU_WAVE_HW_ID_INDEX], + buffer[AMDGPU_WAVE_GPR_ALLOC_INDEX], + buffer[AMDGPU_WAVE_LDS_ALLOC_INDEX], + buffer[AMDGPU_WAVE_TRAPSTS_INDEX], + buffer[AMDGPU_WAVE_IB_STS_INDEX]); + + se_id = amdgpu_read_bits(buffer[AMDGPU_WAVE_HW_ID_INDEX], + AMDGPU_WAVE_HW_ID_SE_ID_SHIFT, + AMDGPU_WAVE_HW_ID_SE_ID_MASK); + + sh_id = amdgpu_read_bits(buffer[AMDGPU_WAVE_HW_ID_INDEX], + AMDGPU_WAVE_HW_ID_SH_ID_SHIFT, + AMDGPU_WAVE_HW_ID_SH_ID_MASK); + + cu_id = amdgpu_read_bits(buffer[AMDGPU_WAVE_HW_ID_INDEX], + AMDGPU_WAVE_HW_ID_CU_ID_SHIFT, + AMDGPU_WAVE_HW_ID_CU_ID_MASK); + + wave_id = amdgpu_read_bits(buffer[AMDGPU_WAVE_HW_ID_INDEX], + AMDGPU_WAVE_HW_ID_WAVE_ID_SHIFT, + AMDGPU_WAVE_HW_ID_WAVE_ID_MASK); + + simd_id = amdgpu_read_bits(buffer[AMDGPU_WAVE_HW_ID_INDEX], + AMDGPU_WAVE_HW_ID_SIMD_ID_SHIFT, + AMDGPU_WAVE_HW_ID_SIMD_ID_MASK); + + sgpr_size = 
+
+static int amdgpu_print_wavedata(struct amdgpu_waves_handle *w_handle,
+				 uint32_t se, uint32_t sh, uint32_t cu,
+				 uint32_t simd, uint32_t wave)
+{
+	int i;
+	ssize_t sz;
+	uint32_t buffer[32] = { 0 };
+	off_t addr_msbs;
+	uint32_t se_id, sh_id, cu_id, wave_id, simd_id, sgpr_size, vgpr_size;
+
+	addr_msbs = ((uint64_t)se << 7) | ((uint64_t)sh << 15) |
+		    ((uint64_t)cu << 23) | ((uint64_t)wave << 31) |
+		    ((uint64_t)simd << 37);
+
+	addr_msbs = lseek(w_handle->fd.waves, addr_msbs, SEEK_SET);
+	if (addr_msbs < 0)
+		return -1;
+
+	sz = read(w_handle->fd.waves, buffer, sizeof(uint32_t) * 32);
+	if (sz < 0)
+		return -1;
+
+	if (buffer[0] != 1)
+		return -1;
+
+	if ((buffer[AMDGPU_WAVE_STATUS_INDEX] &
+	     AMDGPU_WAVE_STATUS_VALID_MASK) == 0)
+		return -1;
+
+	printf("%2u %2u %2u %4u %4u "
+	       "%08x %08x %08x %08x "
+	       "%08x %08x %08x %08x"
+	       " %08x %08x\n",
+	       se, sh, cu, simd, wave, buffer[AMDGPU_WAVE_STATUS_INDEX],
+	       buffer[AMDGPU_WAVE_PC_LOW_INDEX],
+	       buffer[AMDGPU_WAVE_PC_HI_INDEX],
+	       buffer[AMDGPU_WAVE_EXEC_LOW_INDEX],
+	       buffer[AMDGPU_WAVE_EXEC_HI_INDEX],
+	       buffer[AMDGPU_WAVE_HW_ID_INDEX],
+	       buffer[AMDGPU_WAVE_GPR_ALLOC_INDEX],
+	       buffer[AMDGPU_WAVE_LDS_ALLOC_INDEX],
+	       buffer[AMDGPU_WAVE_TRAPSTS_INDEX],
+	       buffer[AMDGPU_WAVE_IB_STS_INDEX]);
+
+	se_id = amdgpu_read_bits(buffer[AMDGPU_WAVE_HW_ID_INDEX],
+				 AMDGPU_WAVE_HW_ID_SE_ID_SHIFT,
+				 AMDGPU_WAVE_HW_ID_SE_ID_MASK);
+
+	sh_id = amdgpu_read_bits(buffer[AMDGPU_WAVE_HW_ID_INDEX],
+				 AMDGPU_WAVE_HW_ID_SH_ID_SHIFT,
+				 AMDGPU_WAVE_HW_ID_SH_ID_MASK);
+
+	cu_id = amdgpu_read_bits(buffer[AMDGPU_WAVE_HW_ID_INDEX],
+				 AMDGPU_WAVE_HW_ID_CU_ID_SHIFT,
+				 AMDGPU_WAVE_HW_ID_CU_ID_MASK);
+
+	wave_id = amdgpu_read_bits(buffer[AMDGPU_WAVE_HW_ID_INDEX],
+				   AMDGPU_WAVE_HW_ID_WAVE_ID_SHIFT,
+				   AMDGPU_WAVE_HW_ID_WAVE_ID_MASK);
+
+	simd_id = amdgpu_read_bits(buffer[AMDGPU_WAVE_HW_ID_INDEX],
+				   AMDGPU_WAVE_HW_ID_SIMD_ID_SHIFT,
+				   AMDGPU_WAVE_HW_ID_SIMD_ID_MASK);
+
+	sgpr_size = amdgpu_read_bits(buffer[AMDGPU_WAVE_GPR_ALLOC_INDEX],
+				     AMDGPU_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,
+				     AMDGPU_WAVE_GPR_ALLOC_SGPR_SIZE_MASK);
+
+	vgpr_size = amdgpu_read_bits(buffer[AMDGPU_WAVE_GPR_ALLOC_INDEX],
+				     AMDGPU_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,
+				     AMDGPU_WAVE_GPR_ALLOC_VGPR_SIZE_MASK);
+
+	/* AI family SGPR allocations are in blocks of 16 */
+	sgpr_size = (sgpr_size + 1) << 4;
+
+	/* AI family VGPR allocations are in blocks of 4 */
+	vgpr_size = (vgpr_size + 1) << 2;
+
+	amdgpu_print_sgprs(w_handle, se_id, sh_id, cu_id, simd_id, wave_id,
+			   sgpr_size);
+
+	/* AI family wavefronts are 64 threads wide */
+	for (i = 0; i < 64; ++i) {
+		amdgpu_print_vgprs(
+			w_handle, se_id, sh_id, cu_id, simd_id, wave_id, i,
+			vgpr_size,
+			(((uint64_t)buffer[AMDGPU_WAVE_EXEC_HI_INDEX] << 32) |
+			 buffer[AMDGPU_WAVE_EXEC_LOW_INDEX]));
+	}
+
+	return 0;
+}
+
+static int amdgpu_print_cu(struct amdgpu_waves_handle *w_handle, uint32_t se,
+			   uint32_t sh, uint32_t cu)
+{
+	int ret;
+	int active = 1;
+	uint32_t simd, wave;
+
+	/* AI family: 4 SIMDs per CU, up to 10 waves per SIMD */
+	for (simd = 0; simd < 4; ++simd) {
+		for (wave = 0; wave < 10; ++wave) {
+			ret = amdgpu_print_wavedata(w_handle, se, sh, cu, simd,
+						    wave);
+			if (!ret)
+				active = 0;
+		}
+	}
+
+	return active;
+}
+
+int amdgpu_waves_print(struct amdgpu_waves_handle *w_handle)
+{
+	int ret;
+	int active = -1;
+	uint32_t sh, se, cu;
+
+	if (!w_handle || w_handle->fd.mmio_reg < 0 || w_handle->fd.waves < 0 ||
+	    w_handle->fd.gpr < 0)
+		return -EINVAL;
+
+	for (se = 0; se < AMDGPU_MAX_SE; ++se) {
+		for (sh = 0; sh < AMDGPU_SH_PER_SE; ++sh) {
+			for (cu = 0; cu < AMDGPU_CU_PER_SH; ++cu) {
+				ret = amdgpu_is_cu_active(w_handle, se, sh, cu);
+				if (!ret) {
+					active = 0;
+					printf("SE SH CU SIMD WAVE WAVE_STATUS"
+					       " PC_LOW PC_HI EXEC_LO EXEC_HI"
+					       " WAVE_HW_ID GPR_ALLOC LDS_ALLOC"
+					       " WAVE_TRAPSTS WAVE_IB_STS\n");
+					ret = amdgpu_print_cu(w_handle, se, sh,
+							      cu);
+					if (ret)
+						return ret;
+				}
+			}
+		}
+	}
+
+	return active;
+}
+
+int amdgpu_waves_destroy(struct amdgpu_waves_handle *w_handle)
+{
+	if (!w_handle || w_handle->fd.mmio_reg < 0 || w_handle->fd.waves < 0 ||
+	    w_handle->fd.gpr < 0)
+		return -EINVAL;
+
+	close(w_handle->fd.mmio_reg);
+	close(w_handle->fd.waves);
+	close(w_handle->fd.gpr);
+	return 0;
+}
diff --git a/tests/amdgpu/amdgpu_waves.h b/tests/amdgpu/amdgpu_waves.h
new file mode 100644
index 000000000..6772a3190
--- /dev/null
+++ b/tests/amdgpu/amdgpu_waves.h
@@ -0,0 +1,16 @@
+#ifndef AMDGPU_WAVES_H
+#define AMDGPU_WAVES_H
+
+struct amdgpu_waves_handle {
+	struct {
+		int mmio_reg;
+		int waves;
+		int gpr;
+	} fd;
+};
+
+int amdgpu_waves_create(struct amdgpu_waves_handle *w_handle);
+int amdgpu_waves_print(struct amdgpu_waves_handle *w_handle);
+int amdgpu_waves_destroy(struct amdgpu_waves_handle *w_handle);
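+
+/*
+ * Typical flow, as used by the wavefront read test in basic_tests.c: open
+ * the debugfs interfaces once, dump any live waves together with their
+ * SGPR/VGPR contents, then release the handles:
+ *
+ *	struct amdgpu_waves_handle wh;
+ *
+ *	if (!amdgpu_waves_create(&wh)) {
+ *		amdgpu_waves_print(&wh);
+ *		amdgpu_waves_destroy(&wh);
+ *	}
+ */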
+
+#endif
diff --git a/tests/amdgpu/basic_tests.c b/tests/amdgpu/basic_tests.c
index 510ccdebb..f391a0bc1 100644
--- a/tests/amdgpu/basic_tests.c
+++ b/tests/amdgpu/basic_tests.c
@@ -28,6 +28,7 @@
 # include <alloca.h>
 #endif
 #include <sys/wait.h>
+#include <time.h>
 
 #include "CUnit/Basic.h"
 
@@ -35,6 +36,8 @@
 #include "amdgpu_drm.h"
 #include "util_math.h"
 
+#include "amdgpu_waves.h"
+
 static amdgpu_device_handle device_handle;
 static uint32_t major_version;
 static uint32_t minor_version;
@@ -50,8 +53,12 @@ static void amdgpu_semaphore_test(void);
 static void amdgpu_sync_dependency_test(void);
 static void amdgpu_bo_eviction_test(void);
 static void amdgpu_dispatch_test(void);
+static void amdgpu_wavefront_read_test(void);
+static void amdgpu_trap_registers_access_test(void);
 static void amdgpu_draw_test(void);
 static void amdgpu_direct_gma_test(void);
+static void amdgpu_command_submission_ram_to_vram(void);
+static void amdgpu_command_submission_vram_to_ram(void);
 
 static void amdgpu_command_submission_write_linear_helper(unsigned ip_type);
 static void amdgpu_command_submission_const_fill_helper(unsigned ip_type);
@@ -62,7 +69,17 @@ static void amdgpu_test_exec_cs_helper(amdgpu_context_handle context_handle,
 				       int res_cnt, amdgpu_bo_handle *resources,
 				       struct amdgpu_cs_ib_info *ib_info,
 				       struct amdgpu_cs_request *ibs_request);
-
+static int amdgpu_prepare_sdma_copy_packet(uint32_t *pm4, uint64_t src_mc,
+					   uint64_t dst_mc, uint32_t copy_size);
+static void amdgpu_command_sdma_copy_helper(unsigned src_heap, unsigned dst_heap);
+static bool amdgpu_verify_trap_reg_magic(uint32_t reg_value);
+static bool amdgpu_verify_trap_reg_vm_id(uint32_t reg_value, uint32_t vm_id);
+static bool amdgpu_verify_trap_reg_num(uint32_t reg_value, uint32_t num);
+static void amdgpu_trap_register_test_helper(
+		amdgpu_device_handle device_handle,
+		uint32_t ip_type,
+		uint32_t ring);
+
 CU_TestInfo basic_tests[] = {
 	{ "Query Info Test", amdgpu_query_info_test },
 	{ "Userptr Test", amdgpu_userptr_test },
@@ -71,11 +88,15 @@ CU_TestInfo basic_tests[] = {
 	{ "Command submission Test (Compute)", amdgpu_command_submission_compute },
 	{ "Command submission Test (Multi-Fence)", amdgpu_command_submission_multi_fence },
 	{ "Command submission Test (SDMA)", amdgpu_command_submission_sdma },
+	{ "Mem Copy System RAM to VRAM (SDMA)", amdgpu_command_submission_ram_to_vram },
+	{ "Mem Copy VRAM to System RAM (SDMA)", amdgpu_command_submission_vram_to_ram },
 	{ "SW semaphore Test", amdgpu_semaphore_test },
 	{ "Sync dependency Test", amdgpu_sync_dependency_test },
 	{ "Dispatch Test", amdgpu_dispatch_test },
 	{ "Draw Test", amdgpu_draw_test },
 	{ "Direct GMA", amdgpu_direct_gma_test },
+	{ "Wavefront read Test", amdgpu_wavefront_read_test },
+	{ "Trap registers access Test", amdgpu_trap_registers_access_test },
 	CU_TEST_INFO_NULL,
 };
 #define BUFFER_SIZE (8 * 1024)
@@ -293,6 +314,23 @@ static uint32_t shader_bin[] = {
 	SWAP_32(0x000070e0), SWAP_32(0x00000080), SWAP_32(0x000081bf)
 };
 
+/* Shader code in GCN ISA
+
+	.text
+	s_mov_b32 s6, 0x0
+checkagain:
+	s_mov_b32 s4, 0xFFFFFF
+	s_add_u32 s6, s6, 1
+	s_cmp_eq_u32 s6, s4
+	s_cbranch_scc0 checkagain
+	s_endpgm
+*/
+static uint32_t longloopshader_bin[] = {
+	0xBE860080, 0xBE8400FF, 0x00FFFFFF, 0x80068106,
+	0xBF060406, 0xBF84FFFB, 0xBF810000,
+};
+
 #define CODE_OFFSET 512
 #define DATA_OFFSET 1024
 
@@ -301,6 +339,36 @@ enum cs_type {
 	CS_BUFFERCOPY
 };
 
+#define AMDGPU_TRAP_REG_MAGIC 0x547652
+
+/* Shader code in GCN ISA
+
+	offset = 0
+	size = 32
+	tba_l = 16 | (offset << 6) | ((size - 1) << 11)
+	tba_h = 17 | (offset << 6) | ((size - 1) << 11)
+	tma_l = 18 | (offset << 6) | ((size - 1) << 11)
+	tma_h = 19 | (offset << 6) | ((size - 1) << 11)
+
+	.text
+	v_mov_b32_e32 v0, 0
+	s_getreg_b32 s4, tba_l
+	s_getreg_b32 s5, tba_h
+	s_getreg_b32 s6, tma_l
+	s_getreg_b32 s7, tma_h
+	v_mov_b32_e32 v1, s4
+	v_mov_b32_e32 v2, s5
+	v_mov_b32_e32 v3, s6
+	v_mov_b32_e32 v4, s7
+	buffer_store_format_xyzw v[1:4], v0, s[0:3], 0 idxen
+	s_endpgm
+*/
+static const uint32_t tba_tma_read_shader[] = {
+	0x7e000280, 0xb884f810, 0xb885f811, 0xb886f812,
+	0xb887f813, 0x7e020204, 0x7e040205, 0x7e060206,
+	0x7e080207, 0xe01c2000, 0x80000100, 0xbf810000
+};
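+
+/*
+ * Encoding check for the binary above: s_getreg_b32 takes
+ * simm16 = id | (offset << 6) | ((size - 1) << 11), so TBA_LO (id 16,
+ * offset 0, size 32) encodes as 16 | (31 << 11) = 0xf810, the low half of
+ * the 0xb884f810 dword, and TBA_HI/TMA_LO/TMA_HI follow as 0xf811..0xf813.
+ */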
+
 static const uint32_t bufferclear_cs_shader_gfx9[] = {
 	0xD1FD0000, 0x04010C08, 0x7E020204, 0x7E040205,
 	0x7E060206, 0x7E080207, 0xE01C2000, 0x80000100,
@@ -1701,6 +1769,169 @@ static void amdgpu_command_submission_copy_linear_helper(unsigned ip_type)
 	CU_ASSERT_EQUAL(r, 0);
 }
 
+#define MIN(X, Y) (((X) < (Y)) ? (X) : (Y))
+static int amdgpu_prepare_sdma_copy_packet(uint32_t *pm4, uint64_t src_mc,
+					   uint64_t dst_mc, uint32_t copy_size)
+{
+	int i = 0;
+
+	if (family_id == AMDGPU_FAMILY_SI) {
+		pm4[i++] = SDMA_PACKET_SI(SDMA_OPCODE_COPY_SI,
+					  0, 0, 0,
+					  copy_size);
+		pm4[i++] = 0xffffffff & dst_mc;
+		pm4[i++] = 0xffffffff & src_mc;
+		pm4[i++] = (0xffffffff00000000 & dst_mc) >> 32;
+		pm4[i++] = (0xffffffff00000000 & src_mc) >> 32;
+	} else {
+		pm4[i++] = SDMA_PACKET(SDMA_OPCODE_COPY,
+				       SDMA_COPY_SUB_OPCODE_LINEAR,
+				       0);
+		/* AI programs the byte count minus one */
+		if (family_id >= AMDGPU_FAMILY_AI)
+			pm4[i++] = copy_size - 1;
+		else
+			pm4[i++] = copy_size;
+		pm4[i++] = 0;
+		pm4[i++] = 0xffffffff & src_mc;
+		pm4[i++] = (0xffffffff00000000 & src_mc) >> 32;
+		pm4[i++] = 0xffffffff & dst_mc;
+		pm4[i++] = (0xffffffff00000000 & dst_mc) >> 32;
+	}
+
+	return i;
+}
+
+static void amdgpu_command_submission_ram_to_vram(void)
+{
+	amdgpu_command_sdma_copy_helper(AMDGPU_GEM_DOMAIN_GTT,
+					AMDGPU_GEM_DOMAIN_VRAM);
+}
+
+static void amdgpu_command_submission_vram_to_ram(void)
+{
+	amdgpu_command_sdma_copy_helper(AMDGPU_GEM_DOMAIN_VRAM,
+					AMDGPU_GEM_DOMAIN_GTT);
+}
+
+static void amdgpu_command_sdma_copy_helper(unsigned src_heap, unsigned dst_heap)
+{
+	const int sdma_write_length = 1024*1024*512;
+	unsigned int remaining_size = 0;
+
+	/* Packets for 512MiB fit here since each 4MiB chunk needs 28 bytes. */
+	const int pm4_dw = 1024;
+	amdgpu_context_handle context_handle;
+	amdgpu_bo_handle bo1, bo2;
+	amdgpu_bo_handle *resources;
+	uint32_t sz;
+	uint64_t src_addr, dst_addr;
+	uint32_t *pm4;
+	struct amdgpu_cs_ib_info *ib_info;
+	struct amdgpu_cs_request *ibs_request;
+	uint64_t bo1_mc, bo2_mc;
+	volatile unsigned char *bo1_cpu = NULL, *bo2_cpu = NULL;
+	int i, r;
+	amdgpu_va_handle bo1_va_handle, bo2_va_handle;
+	struct drm_amdgpu_info_hw_ip hw_ip_info;
+	struct timespec submit_time, comp_time;
+	unsigned long int usec;
+
+	/* the copy packets below are only validated for the AI family */
+	if (family_id != AMDGPU_FAMILY_AI) {
+		printf("Skipping SDMA copy test on non-AI family.\n");
+		return;
+	}
+
+	pm4 = calloc(pm4_dw, sizeof(*pm4));
+	CU_ASSERT_NOT_EQUAL(pm4, NULL);
+
+	ib_info = calloc(1, sizeof(*ib_info));
+	CU_ASSERT_NOT_EQUAL(ib_info, NULL);
+
+	ibs_request = calloc(1, sizeof(*ibs_request));
+	CU_ASSERT_NOT_EQUAL(ibs_request, NULL);
+
+	r = amdgpu_query_hw_ip_info(device_handle, AMDGPU_HW_IP_DMA, 0, &hw_ip_info);
+	CU_ASSERT_EQUAL(r, 0);
+
+	r = amdgpu_cs_ctx_create(device_handle, &context_handle);
+	CU_ASSERT_EQUAL(r, 0);
+
+	/* prepare resource */
+	resources = calloc(2, sizeof(amdgpu_bo_handle));
+	CU_ASSERT_NOT_EQUAL(resources, NULL);
+
+	CU_ASSERT_NOT_EQUAL(hw_ip_info.available_rings, 0);
+
+	/* allocate bo1 for sDMA use */
+	r = amdgpu_bo_alloc_and_map(device_handle,
+				    sdma_write_length, 4096,
+				    src_heap,
+				    0, &bo1,
+				    (void**)&bo1_cpu, &bo1_mc,
+				    &bo1_va_handle);
+	CU_ASSERT_EQUAL(r, 0);
+
+	/* allocate bo2 for sDMA use */
+	r = amdgpu_bo_alloc_and_map(device_handle,
+				    sdma_write_length, 4096,
+				    dst_heap,
+				    0, &bo2,
+				    (void**)&bo2_cpu, &bo2_mc,
+				    &bo2_va_handle);
+	CU_ASSERT_EQUAL(r, 0);
+
+	resources[0] = bo1;
+	resources[1] = bo2;
+
+	i = 0;
+	remaining_size = sdma_write_length;
+	src_addr = bo1_mc;
+	dst_addr = bo2_mc;
+	while (remaining_size != 0) {
+		/* the SDMA_OPCODE_COPY packet size field has only 22 bits */
+		sz = MIN(remaining_size, (1 << 22));
+		i += amdgpu_prepare_sdma_copy_packet(&pm4[i], src_addr, dst_addr, sz);
+		src_addr += sz;
+		dst_addr += sz;
+		remaining_size -= sz;
+	}
+
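+	/*
+	 * Budget check: 512 MiB split into 4 MiB chunks gives 128 copy
+	 * packets, and each AI SDMA copy packet built above is 7 dwords
+	 * (28 bytes), so 128 * 7 = 896 dwords fits in pm4_dw = 1024.
+	 */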
+	r = clock_gettime(CLOCK_MONOTONIC, &submit_time);
+	CU_ASSERT_EQUAL(r, 0);
+
+	amdgpu_test_exec_cs_helper(context_handle,
+				   AMDGPU_HW_IP_DMA, 0,
+				   i, pm4,
+				   2, resources,
+				   ib_info, ibs_request);
+	r = clock_gettime(CLOCK_MONOTONIC, &comp_time);
+	CU_ASSERT_EQUAL(r, 0);
+
+	usec = (comp_time.tv_sec - submit_time.tv_sec)*1000*1000;
+	if (comp_time.tv_nsec < submit_time.tv_nsec)
+		usec -= (submit_time.tv_nsec - comp_time.tv_nsec)/1000;
+	else
+		usec += (comp_time.tv_nsec - submit_time.tv_nsec)/1000;
+
+	printf("transfer speed: %.2fMiB/s\n",
+	       ((((sdma_write_length) / (usec * 1.0f)) / 1024 / 1024)*1000*1000));
+
+	r = amdgpu_bo_unmap_and_free(bo1, bo1_va_handle, bo1_mc,
+				     sdma_write_length);
+	CU_ASSERT_EQUAL(r, 0);
+	r = amdgpu_bo_unmap_and_free(bo2, bo2_va_handle, bo2_mc,
+				     sdma_write_length);
+	CU_ASSERT_EQUAL(r, 0);
+
+	/* clean resources */
+	free(resources);
+	free(ibs_request);
+	free(ib_info);
+	free(pm4);
+
+	/* end of test */
+	r = amdgpu_cs_ctx_free(context_handle);
+	CU_ASSERT_EQUAL(r, 0);
+}
+
 static void amdgpu_command_submission_sdma_copy_linear(void)
 {
 	amdgpu_command_submission_copy_linear_helper(AMDGPU_HW_IP_DMA);
@@ -2491,6 +2722,319 @@ static void amdgpu_memcpy_dispatch_test(amdgpu_device_handle device_handle,
 	r = amdgpu_cs_ctx_free(context_handle);
 	CU_ASSERT_EQUAL(r, 0);
 }
+
+static void amdgpu_wave_read_test(amdgpu_device_handle device_handle,
+				  uint32_t ip_type,
+				  uint32_t ring)
+{
+	amdgpu_context_handle context_handle;
+	amdgpu_bo_handle bo_shader, bo_cmd, resources[2];
+	void *ptr_shader;
+	uint32_t *ptr_cmd;
+	uint64_t mc_address_shader, mc_address_cmd;
+	amdgpu_va_handle va_shader, va_cmd;
+	int i, r;
+	int bo_shader_size = 4096;
+	int bo_cmd_size = 4096;
+	struct amdgpu_cs_request ibs_request = {0};
+	struct amdgpu_cs_ib_info ib_info = {0};
+	amdgpu_bo_list_handle bo_list;
+	struct amdgpu_cs_fence fence_status = {0};
+	uint32_t expired;
+	struct amdgpu_waves_handle waves_handle;
+
+	r = amdgpu_waves_create(&waves_handle);
+	CU_ASSERT_EQUAL(r, 0);
+	if (r)
+		return;
+
+	r = amdgpu_cs_ctx_create(device_handle, &context_handle);
+	CU_ASSERT_EQUAL(r, 0);
+
+	r = amdgpu_bo_alloc_and_map(device_handle, bo_cmd_size, 4096,
+				    AMDGPU_GEM_DOMAIN_GTT, 0,
+				    &bo_cmd, (void **)&ptr_cmd,
+				    &mc_address_cmd, &va_cmd);
+	CU_ASSERT_EQUAL(r, 0);
+	memset(ptr_cmd, 0, bo_cmd_size);
+
+	r = amdgpu_bo_alloc_and_map(device_handle, bo_shader_size, 4096,
+				    AMDGPU_GEM_DOMAIN_VRAM, 0,
+				    &bo_shader, &ptr_shader,
+				    &mc_address_shader, &va_shader);
+	CU_ASSERT_EQUAL(r, 0);
+
+	/* copy shader into shader buffer */
+	memcpy(ptr_shader, longloopshader_bin, sizeof(longloopshader_bin));
+
+	i = 0;
+	i += amdgpu_dispatch_init(ptr_cmd + i, ip_type);
+	i += amdgpu_dispatch_write_cumask(ptr_cmd + i);
+	i += amdgpu_dispatch_write2hw(ptr_cmd + i, mc_address_shader);
+
+	/* launch 8 thread groups running the long loop */
+	ptr_cmd[i++] = PACKET3_COMPUTE(PACKET3_DISPATCH_DIRECT, 3);
+	ptr_cmd[i++] = 8;
+	ptr_cmd[i++] = 1;
+	ptr_cmd[i++] = 1;
+	ptr_cmd[i++] = 1;
+
+	while (i & 7)
+		ptr_cmd[i++] = 0xffff1000; /* type3 nop packet */
+
+	resources[0] = bo_shader;
+	resources[1] = bo_cmd;
+	r = amdgpu_bo_list_create(device_handle, 2, resources, NULL, &bo_list);
+	CU_ASSERT_EQUAL(r, 0);
+
+	ib_info.ib_mc_address = mc_address_cmd;
+	ib_info.size = i;
+	ibs_request.ip_type = ip_type;
+	ibs_request.ring = ring;
+	ibs_request.resources = bo_list;
+	ibs_request.number_of_ibs = 1;
+	ibs_request.ibs = &ib_info;
+	ibs_request.fence_info.handle = NULL;
+
+	/* submit CS */
+	r = amdgpu_cs_submit(context_handle, 0, &ibs_request, 1);
+	CU_ASSERT_EQUAL(r, 0);
+
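+	/*
+	 * The wave dump below is done before waiting on the fence on
+	 * purpose: the long-loop shader is still resident on the CUs at
+	 * this point, so amdgpu_waves_print() has live waves to report.
+	 */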
+	r = amdgpu_bo_list_destroy(bo_list);
+	CU_ASSERT_EQUAL(r, 0);
+
+	fence_status.ip_type = ip_type;
+	fence_status.ip_instance = 0;
+	fence_status.ring = ring;
+	fence_status.context = context_handle;
+	fence_status.fence = ibs_request.seq_no;
+
+	printf("\n");
+	r = amdgpu_waves_print(&waves_handle);
+	CU_ASSERT_EQUAL(r, 0);
+
+	/* wait for IB accomplished */
+	r = amdgpu_cs_query_fence_status(&fence_status,
+					 AMDGPU_TIMEOUT_INFINITE,
+					 0, &expired);
+	if (r)
+		fprintf(stderr, "W: Error waiting for execution to finish.\n");
+
+	r = amdgpu_bo_unmap_and_free(bo_shader, va_shader, mc_address_shader, bo_shader_size);
+	CU_ASSERT_EQUAL(r, 0);
+
+	r = amdgpu_bo_unmap_and_free(bo_cmd, va_cmd, mc_address_cmd, bo_cmd_size);
+	CU_ASSERT_EQUAL(r, 0);
+
+	r = amdgpu_cs_ctx_free(context_handle);
+	CU_ASSERT_EQUAL(r, 0);
+
+	amdgpu_waves_destroy(&waves_handle);
+}
+
+static void amdgpu_wavefront_read_test(void)
+{
+	int r;
+	struct drm_amdgpu_info_hw_ip info;
+	uint32_t ring_id;
+
+	r = amdgpu_query_hw_ip_info(device_handle, AMDGPU_HW_IP_GFX, 0, &info);
+	CU_ASSERT_EQUAL(r, 0);
+
+	for (ring_id = 0; (1 << ring_id) & info.available_rings; ring_id++) {
+		amdgpu_wave_read_test(device_handle, AMDGPU_HW_IP_GFX, ring_id);
+	}
+}
+
+static bool amdgpu_verify_trap_reg_magic(uint32_t reg_value)
+{
+	return (reg_value >> 8) == AMDGPU_TRAP_REG_MAGIC;
+}
+
+static bool amdgpu_verify_trap_reg_vm_id(uint32_t reg_value, uint32_t vm_id)
+{
+	return (reg_value & 0xF) == vm_id;
+}
+
+static bool amdgpu_verify_trap_reg_num(uint32_t reg_value, uint32_t num)
+{
+	return ((reg_value >> 4) & 0xF) == num;
+}
+
+static void amdgpu_trap_register_test_helper(
+		amdgpu_device_handle device_handle,
+		uint32_t ip_type,
+		uint32_t ring)
+{
+	uint32_t vm_id;
+	amdgpu_context_handle context_handle;
+	amdgpu_bo_handle bo_dst, bo_shader, bo_cmd, resources[3];
+	volatile uint32_t *ptr_dst;
+	void *ptr_shader;
+	uint32_t *ptr_cmd;
+	uint64_t mc_address_dst, mc_address_shader, mc_address_cmd;
+	amdgpu_va_handle va_dst, va_shader, va_cmd;
+	int i, r;
+	int bo_dst_size = 16;
+	int bo_shader_size = 4096;
+	int bo_cmd_size = 4096;
+	struct amdgpu_cs_request ibs_request = {0};
+	struct amdgpu_cs_ib_info ib_info = {0};
+	amdgpu_bo_list_handle bo_list;
+	struct amdgpu_cs_fence fence_status = {0};
+	uint32_t expired;
+
+	r = amdgpu_cs_ctx_create(device_handle, &context_handle);
+	CU_ASSERT_EQUAL(r, 0);
+
+	r = amdgpu_bo_alloc_and_map(device_handle, bo_cmd_size, 4096,
+				    AMDGPU_GEM_DOMAIN_GTT, 0,
+				    &bo_cmd, (void **)&ptr_cmd,
+				    &mc_address_cmd, &va_cmd);
+	CU_ASSERT_EQUAL(r, 0);
+	memset(ptr_cmd, 0, bo_cmd_size);
+
+	r = amdgpu_bo_alloc_and_map(device_handle, bo_shader_size, 4096,
+				    AMDGPU_GEM_DOMAIN_VRAM, 0,
+				    &bo_shader, &ptr_shader,
+				    &mc_address_shader, &va_shader);
+	CU_ASSERT_EQUAL(r, 0);
+
+	memcpy(ptr_shader, tba_tma_read_shader, sizeof(tba_tma_read_shader));
+
+	r = amdgpu_bo_alloc_and_map(device_handle, bo_dst_size, 4096,
+				    AMDGPU_GEM_DOMAIN_VRAM, 0,
+				    &bo_dst, (void **)&ptr_dst,
+				    &mc_address_dst, &va_dst);
+	CU_ASSERT_EQUAL(r, 0);
+
+	i = 0;
+	i += amdgpu_dispatch_init(ptr_cmd + i, ip_type);
+
+	/* Issue commands to set cu mask used in current dispatch */
+	i += amdgpu_dispatch_write_cumask(ptr_cmd + i);
+
+	/* Writes shader state to HW */
+	i += amdgpu_dispatch_write2hw(ptr_cmd + i, mc_address_shader);
+
+	/* Write Resource data */
+	ptr_cmd[i++] = PACKET3_COMPUTE(PKT3_SET_SH_REG, 4);
+	ptr_cmd[i++] = 0x240;
+	ptr_cmd[i++] = mc_address_dst;
+	ptr_cmd[i++] = (mc_address_dst >> 32) | 0x100000;
+	ptr_cmd[i++] = 0x1;
+	ptr_cmd[i++] = 0x74fac;
+
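+	/*
+	 * The five dwords above load s[0:3] with a buffer descriptor for
+	 * bo_dst: the base address in the first two dwords, then what are
+	 * read as the record count (0x1) and format dword (0x74fac).  This
+	 * is the descriptor the shader's buffer_store_format_xyzw uses to
+	 * write the four TBA/TMA values out.
+	 */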
+	/* dispatch direct command */
+	ptr_cmd[i++] = PACKET3_COMPUTE(PACKET3_DISPATCH_DIRECT, 3);
+	ptr_cmd[i++] = 1;
+	ptr_cmd[i++] = 1;
+	ptr_cmd[i++] = 1;
+	ptr_cmd[i++] = 1;
+
+	while (i & 7)
+		ptr_cmd[i++] = 0xffff1000; /* type3 nop packet */
+
+	resources[0] = bo_dst;
+	resources[1] = bo_shader;
+	resources[2] = bo_cmd;
+	r = amdgpu_bo_list_create(device_handle, 3, resources, NULL, &bo_list);
+	CU_ASSERT_EQUAL(r, 0);
+
+	ib_info.ib_mc_address = mc_address_cmd;
+	ib_info.size = i;
+	ibs_request.ip_type = ip_type;
+	ibs_request.ring = ring;
+	ibs_request.resources = bo_list;
+	ibs_request.number_of_ibs = 1;
+	ibs_request.ibs = &ib_info;
+	ibs_request.fence_info.handle = NULL;
+
+	/* submit CS */
+	r = amdgpu_cs_submit(context_handle, 0, &ibs_request, 1);
+	CU_ASSERT_EQUAL(r, 0);
+
+	r = amdgpu_bo_list_destroy(bo_list);
+	CU_ASSERT_EQUAL(r, 0);
+
+	fence_status.ip_type = ip_type;
+	fence_status.ip_instance = 0;
+	fence_status.ring = ring;
+	fence_status.context = context_handle;
+	fence_status.fence = ibs_request.seq_no;
+
+	/* wait for IB accomplished */
+	r = amdgpu_cs_query_fence_status(&fence_status,
+					 AMDGPU_TIMEOUT_INFINITE,
+					 0, &expired);
+	CU_ASSERT_EQUAL(r, 0);
+	CU_ASSERT_EQUAL(expired, true);
+
+	/*
+	 * Verify that the trap registers were read.  Each register is
+	 * expected to hold value = 0x54765200 | (reg_num << 4) | VM_ID,
+	 * with the magic present only in the LO halves:
+	 *
+	 *	TBA_LO = 0x54765200 | (reg_num << 4) | VM_ID
+	 *	TBA_HI = 0x00000000 | (reg_num << 4) | VM_ID
+	 *	TMA_LO = 0x54765200 | (reg_num << 4) | VM_ID
+	 *	TMA_HI = 0x00000000 | (reg_num << 4) | VM_ID
+	 */
+
+	/* TBA_LO [31:0] */
+	i = 0;
+	CU_ASSERT_TRUE(amdgpu_verify_trap_reg_magic(ptr_dst[i]));
+	CU_ASSERT_TRUE(amdgpu_verify_trap_reg_num(ptr_dst[i], i));
+	vm_id = ptr_dst[i++] & 0xF;
+
+	/* TBA_HI [47:32] */
+	CU_ASSERT_TRUE(amdgpu_verify_trap_reg_num(ptr_dst[i], i));
+	CU_ASSERT_TRUE(amdgpu_verify_trap_reg_vm_id(ptr_dst[i++], vm_id));
+
+	/* TMA_LO [31:0] */
+	CU_ASSERT_TRUE(amdgpu_verify_trap_reg_magic(ptr_dst[i]));
+	CU_ASSERT_TRUE(amdgpu_verify_trap_reg_num(ptr_dst[i], i));
+	CU_ASSERT_TRUE(amdgpu_verify_trap_reg_vm_id(ptr_dst[i++], vm_id));
+
+	/* TMA_HI [47:32] */
+	CU_ASSERT_TRUE(amdgpu_verify_trap_reg_num(ptr_dst[i], i));
+	CU_ASSERT_TRUE(amdgpu_verify_trap_reg_vm_id(ptr_dst[i], vm_id));
+
+	printf("\n");
+	for (i = 0; i < 4; ++i)
+		printf("0x%08x ", ptr_dst[i]);
+	printf("\n");
+
+	r = amdgpu_bo_unmap_and_free(bo_dst, va_dst, mc_address_dst, bo_dst_size);
+	CU_ASSERT_EQUAL(r, 0);
+
+	r = amdgpu_bo_unmap_and_free(bo_shader, va_shader, mc_address_shader, bo_shader_size);
+	CU_ASSERT_EQUAL(r, 0);
+
+	r = amdgpu_bo_unmap_and_free(bo_cmd, va_cmd, mc_address_cmd, bo_cmd_size);
+	CU_ASSERT_EQUAL(r, 0);
+
+	r = amdgpu_cs_ctx_free(context_handle);
+	CU_ASSERT_EQUAL(r, 0);
+}
+
+static void amdgpu_trap_registers_access_test(void)
+{
+	int r;
+	struct drm_amdgpu_info_hw_ip info;
+	uint32_t ring_id;
+
+	r = amdgpu_query_hw_ip_info(device_handle, AMDGPU_HW_IP_GFX, 0, &info);
+	CU_ASSERT_EQUAL(r, 0);
+
+	for (ring_id = 0; (1 << ring_id) & info.available_rings; ring_id++) {
+		amdgpu_trap_register_test_helper(device_handle, AMDGPU_HW_IP_GFX, ring_id);
+	}
+}
+
 static void amdgpu_dispatch_test(void)
 {
 	int r;