diff --git a/kernels/avx/matmul_avx.h b/kernels/avx/matmul_avx.h new file mode 100644 index 00000000..8f663ad5 --- /dev/null +++ b/kernels/avx/matmul_avx.h @@ -0,0 +1,37 @@ +#ifndef MATMUL_OPERATOR_AVX_H +#define MATMUL_OPERATOR_AVX_H + +#include "matmul.h" +#include + +namespace matmul { + +class MatmulOperatorAVX : public MatmulOperator { + public: + void mat_mul_accelerator_transposed_fastover_column(const struct matmul_params* params) override; + void mat_mul_accelerator_transposed_fastover_column_bias(const struct matmul_params* params) override; + + // int8 operations + void mat_mul_accelerator_int8_fast_32unroll_over_column(const struct matmul_params* params) override; + void mat_mul_accelerator_int8_fast_2x2_32unroll(const struct matmul_params* params) override; + void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias(const struct matmul_params* params) override; + void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_batch(const struct matmul_params* params) override; + void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32(const struct matmul_params* params) override; + void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32_batch(const struct matmul_params* params) override; + void mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32(const struct matmul_params* params) override; + void mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32_over_column(const struct matmul_params* params) override; + + void mat_mul_accelerator_int8_int4_fast_no_offset(struct matmul_params* params) override; + + void mat_mul_accelerator_int4_fast(const struct matmul_params* params) override; + void mat_mul_accelerator_int4_fast_no_offset(const struct matmul_params* params) override; +}; + +inline MatmulOperator& CreateMatmulOperatorAVX() { + static MatmulOperatorAVX instance; + return instance; +} + +} // namespace matmul + +#endif diff --git a/kernels/avx/matmul_avx_fp32.cc b/kernels/avx/matmul_avx_fp32.cc index 2dbcfda4..179d1d8e 100644 --- a/kernels/avx/matmul_avx_fp32.cc +++ b/kernels/avx/matmul_avx_fp32.cc @@ -4,8 +4,9 @@ #include #include #include // intel SSE intrinsic +#include -#include "../matmul.h" +#include "matmul_avx.h" namespace matmul { @@ -60,7 +61,7 @@ void *mat_mul_transposed_fastover_column_func(void *args) { return NULL; } -void MatmulOperator::mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params) { +void MatmulOperatorAVX::mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params) { int i, j, k; int num_thread = params->opt_params.num_thread; @@ -112,7 +113,7 @@ void fp32_ref_matmul_bias(const struct matmul_params *params) { } } -void MatmulOperator::mat_mul_accelerator_transposed_fastover_column_bias(const struct matmul_params *params) { +void MatmulOperatorAVX::mat_mul_accelerator_transposed_fastover_column_bias(const struct matmul_params *params) { fp32_ref_matmul_bias(params); } diff --git a/kernels/avx/matmul_avx_int4.cc b/kernels/avx/matmul_avx_int4.cc index b5ee4c27..65011ebc 100644 --- a/kernels/avx/matmul_avx_int4.cc +++ b/kernels/avx/matmul_avx_int4.cc @@ -3,7 +3,7 @@ #include -#include "../matmul.h" +#include "matmul_avx.h" static inline __m256i bytes_from_nibbles_32(const uint8_t *rsi) { // Load 16 bytes from memory @@ -675,7 +675,7 @@ static void *fast_zp_no_offset_over_column_func_v5(void *args) { } namespace matmul { -void MatmulOperator::mat_mul_accelerator_int4_fast(const struct matmul_params *params) { +void MatmulOperatorAVX::mat_mul_accelerator_int4_fast(const struct matmul_params *params) { 
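Note: for readers new to the packed int4 layout these AVX kernels consume, below is a scalar sketch of the dequantization step that bytes_from_nibbles_32 vectorizes (it expands 16 bytes into 32 nibbles at a time). The sketch assumes the usual two-weights-per-byte packing with the low nibble first and a per-block scale and zero point; the function and parameter names are illustrative only, not part of this patch.

#include <cstddef>
#include <cstdint>

// Scalar sketch: expand n packed int4 weights (two per byte) into floats.
// The AVX2 kernels in this file perform the same expansion 32 nibbles at a time.
static void dequantize_int4_block(const uint8_t *packed, float scale, int zero_point,
                                  float *out, size_t n) {
    for (size_t i = 0; i + 1 < n; i += 2) {
        uint8_t byte = packed[i / 2];
        out[i] = scale * static_cast<float>((byte & 0x0F) - zero_point);    // low nibble
        out[i + 1] = scale * static_cast<float>((byte >> 4) - zero_point);  // high nibble
    }
}

The *_no_offset variants presumably drop the offset/zero-point term and apply only the scale at run time.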
const int num_thread = params->opt_params.num_thread; int i, j, k; pthread_t thread_pool[num_thread]; @@ -693,7 +693,7 @@ void MatmulOperator::mat_mul_accelerator_int4_fast(const struct matmul_params *p for (j = 0; j < num_thread; j++) pthread_join(thread_pool[j], NULL); }; -void MatmulOperator::mat_mul_accelerator_int4_fast_no_offset(const struct matmul_params *params) { +void MatmulOperatorAVX::mat_mul_accelerator_int4_fast_no_offset(const struct matmul_params *params) { const int num_thread = params->opt_params.num_thread; int i, j, k; pthread_t thread_pool[num_thread]; diff --git a/kernels/avx/matmul_avx_int8.cc b/kernels/avx/matmul_avx_int8.cc index f1327658..6e524064 100644 --- a/kernels/avx/matmul_avx_int8.cc +++ b/kernels/avx/matmul_avx_int8.cc @@ -5,7 +5,7 @@ #include #include -#include "../matmul.h" +#include "matmul_avx.h" inline void assign_8int32(int *ptr, int &acc) { acc = (ptr[0] + ptr[1] + ptr[2] + ptr[3] + ptr[4] + ptr[5] + ptr[6] + ptr[7]); @@ -381,7 +381,7 @@ void *mat_mul_accelerator_int8_thread_func_2x2_32unroll(void *args) { return NULL; } -void MatmulOperator::mat_mul_accelerator_int8_fast_2x2_32unroll(const struct matmul_params *params) { +void MatmulOperatorAVX::mat_mul_accelerator_int8_fast_2x2_32unroll(const struct matmul_params *params) { int j, num_thread = params->opt_params.num_thread; assert(params->A.column % 64 == 0); @@ -478,7 +478,7 @@ void *mat_mul_accelerator_int8_fast_32unroll_over_column_thread_func(void *args) return NULL; } -void MatmulOperator::mat_mul_accelerator_int8_fast_32unroll_over_column(const struct matmul_params *params) { +void MatmulOperatorAVX::mat_mul_accelerator_int8_fast_32unroll_over_column(const struct matmul_params *params) { int j, num_thread = params->opt_params.num_thread; if (num_thread > params->C.column) num_thread = params->C.column; @@ -610,7 +610,7 @@ void *mat_mul_accelerator_int8_thread_func_2x2_32unroll_nobias(void *args) { return NULL; } -void MatmulOperator::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias(const struct matmul_params *params) { +void MatmulOperatorAVX::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias(const struct matmul_params *params) { int j, num_thread = params->opt_params.num_thread; assert((params->C.column) % 2 == 0); @@ -681,7 +681,7 @@ void *mat_mul_accelerator_int8_thread_func_2x2_32unroll_nobias_batch(void *args) return NULL; } -void MatmulOperator::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_batch(const struct matmul_params *params) { +void MatmulOperatorAVX::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_batch(const struct matmul_params *params) { int j, num_thread = params->opt_params.num_thread; assert((params->C.column) % 2 == 0); @@ -791,7 +791,7 @@ void *mat_mul_accelerator_int8_thread_func_2x2_32unroll_nobias_ofp32(void *args) return NULL; } -void MatmulOperator::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32(const struct matmul_params *params) { +void MatmulOperatorAVX::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32(const struct matmul_params *params) { int j, num_thread = params->opt_params.num_thread; assert(params->A.column % 32 == 0); @@ -851,7 +851,7 @@ void *mat_mul_accelerator_int8_thread_func_2x2_32unroll_nobias_ofp32_batch(void return NULL; } -void MatmulOperator::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32_batch(const struct matmul_params *params) { +void MatmulOperatorAVX::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32_batch(const struct matmul_params *params) { int j, num_thread = params->opt_params.num_thread; 
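Note: every accelerator entry point in these AVX files follows the same fan-out shape: carve the output columns (or rows) into num_thread slices, launch one pthread per slice, and join them before returning. A condensed sketch of that pattern follows; the worker and its argument struct are hypothetical stand-ins for the repo's thread_args and are not part of the patch.

#include <pthread.h>

#include "matmul.h"

struct worker_args {               // hypothetical stand-in for the repo's thread_args
    const struct matmul_params *params;
    int start_col, end_col;        // half-open slice of C's columns owned by this worker
};

static void *worker(void *arg) {
    worker_args *a = static_cast<worker_args *>(arg);
    // compute C[:, a->start_col .. a->end_col) here
    (void)a;
    return nullptr;
}

static void launch_over_columns(const struct matmul_params *params, int num_thread, int num_cols) {
    pthread_t pool[num_thread];    // VLA, mirroring the existing kernels (GNU extension)
    worker_args args[num_thread];
    int per_thread = (num_cols + num_thread - 1) / num_thread;
    for (int t = 0; t < num_thread; ++t) {
        int start = t * per_thread;
        int end = start + per_thread < num_cols ? start + per_thread : num_cols;
        args[t] = {params, start, end};
        pthread_create(&pool[t], nullptr, worker, &args[t]);
    }
    for (int t = 0; t < num_thread; ++t) pthread_join(pool[t], nullptr);
}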
assert(params->A.column % 32 == 0); @@ -940,7 +940,7 @@ void *mat_mul_accelerator_int8_thread_func_2x2_32unroll_bfp32_ofp32(void *args) return NULL; } -void MatmulOperator::mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32(const struct matmul_params *params) { +void MatmulOperatorAVX::mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32(const struct matmul_params *params) { int j, num_thread = params->opt_params.num_thread; assert(params->A.column % 64 == 0); @@ -1211,8 +1211,9 @@ void *mat_mul_accelerator_int8_thread_func_2x2_32unroll_bfp32_ofp32_over_column( return NULL; } -void MatmulOperator::mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32_over_column( +void MatmulOperatorAVX::mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32_over_column( const struct matmul_params *params) { + int j, num_thread = params->opt_params.num_thread; if (num_thread > params->C.column) num_thread = params->C.column; @@ -1241,4 +1242,5 @@ void MatmulOperator::mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32_over } } -} // namespace matmul + +} diff --git a/kernels/avx/matmul_avx_int8_int4.cc b/kernels/avx/matmul_avx_int8_int4.cc index eceaecc4..2e7bdb5c 100644 --- a/kernels/avx/matmul_avx_int8_int4.cc +++ b/kernels/avx/matmul_avx_int8_int4.cc @@ -4,7 +4,7 @@ #include #include -#include "../matmul.h" +#include "matmul_avx.h" #include "pthread_pool.h" @@ -322,7 +322,7 @@ static void quantize_fp_to_int8_block_size32(float *x, int size, int8_t *qx, flo namespace matmul { -void MatmulOperator::mat_mul_accelerator_int8_int4_fast_no_offset(struct matmul_params *params) { +void MatmulOperatorAVX::mat_mul_accelerator_int8_int4_fast_no_offset(struct matmul_params *params) { // const int num_thread = 4; const int num_thread = params->opt_params.num_thread; int i, j, k; diff --git a/kernels/cuda/gemv_cuda.cu b/kernels/cuda/gemv_cuda.cu index 82a85bf5..04e2d0cb 100644 --- a/kernels/cuda/gemv_cuda.cu +++ b/kernels/cuda/gemv_cuda.cu @@ -17,7 +17,7 @@ #include #include -#include "../matmul.h" +#include "matmul_cuda.h" #include "ops/linear.h" // #include @@ -210,7 +210,7 @@ namespace matmul{ Returns: out_feats: tensor of shape [B, OC]; */ - void MatmulOperator::gemv_forward_cuda(const struct matmul_params *params) + void MatmulOperatorCUDA::gemv_forward_cuda(const struct matmul_params *params) { const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; @@ -259,11 +259,11 @@ namespace matmul{ PROFILE_END("gemv_forward_cuda"); } - void MatmulOperator::mat_mul_accelerator_int4_fast(const struct matmul_params *params) { + void MatmulOperatorCUDA::mat_mul_accelerator_int4_fast(const struct matmul_params *params) { // TODO: remove this }; - void MatmulOperator::mat_mul_accelerator_int4_fast_no_offset(const struct matmul_params *params) { + void MatmulOperatorCUDA::mat_mul_accelerator_int4_fast_no_offset(const struct matmul_params *params) { // TODO: remove this }; diff --git a/kernels/cuda/matmul_cuda.h b/kernels/cuda/matmul_cuda.h new file mode 100644 index 00000000..a1d02c0b --- /dev/null +++ b/kernels/cuda/matmul_cuda.h @@ -0,0 +1,38 @@ +#ifndef MATMUL_OPERATOR_CUDA_H +#define MATMUL_OPERATOR_CUDA_H + +#include "matmul.h" +#include + +namespace matmul { + +class MatmulOperatorCUDA : public MatmulOperator { + public: + void mat_mul_accelerator_transposed_fastover_column(const struct matmul_params* params) override; + + // int8 operations + void mat_mul_accelerator_int8_fast_32unroll_over_column(const struct matmul_params* params) override; + void mat_mul_accelerator_int8_fast_2x2_32unroll(const struct 
matmul_params* params) override; + void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias(const struct matmul_params* params) override; + void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_batch(const struct matmul_params* params) override; + void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32(const struct matmul_params* params) override; + void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32_batch(const struct matmul_params* params) override; + void mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32(const struct matmul_params* params) override; + void mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32_over_column(const struct matmul_params* params) override; + + void mat_mul_accelerator_int4_fast(const struct matmul_params* params) override; + void mat_mul_accelerator_int4_fast_no_offset(const struct matmul_params* params) override; + + void gemv_forward_cuda(const struct matmul_params* params) override; + void naive_mat_mul_fp16_int4(const struct matmul_params* params) override; +}; + +// Declaring as static to prevent linker errors due to both cc and cu files +static inline MatmulOperator& CreateMatmulOperatorCUDA() { + static MatmulOperatorCUDA instance; + return instance; +} + +} // namespace matmul + +#endif diff --git a/kernels/cuda/matmul_int4.cu b/kernels/cuda/matmul_int4.cu index 507b564d..87e71c9e 100644 --- a/kernels/cuda/matmul_int4.cu +++ b/kernels/cuda/matmul_int4.cu @@ -1,11 +1,11 @@ #include #include -#include "../matmul.h" +#include "matmul_cuda.h" namespace matmul { -void MatmulOperator::naive_mat_mul_fp16_int4(const struct matmul_params *params) { +void MatmulOperatorCUDA::naive_mat_mul_fp16_int4(const struct matmul_params *params) { const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; const int block_size = params->block_size; // CHECK_MATRICES_int4weight(A, B, C); diff --git a/kernels/cuda/matmul_ref_fp32.cc b/kernels/cuda/matmul_ref_fp32.cc index 548d9926..d92b8a3b 100644 --- a/kernels/cuda/matmul_ref_fp32.cc +++ b/kernels/cuda/matmul_ref_fp32.cc @@ -4,7 +4,7 @@ #include #include -#include "../matmul.h" +#include "matmul_cuda.h" namespace matmul { void fp32_ref_matmul(const struct matmul_params *params) { @@ -28,7 +28,8 @@ void fp32_ref_matmul(const struct matmul_params *params) { } } -void MatmulOperator::mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params) { +void MatmulOperatorCUDA::mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params) { + std::cout<<"mat_mul_accelerator_transposed_fastover_column, fp32"< #include -#include "../matmul.h" +#include "matmul_cuda.h" namespace matmul { void int8_ref_matmul(const struct matmul_params *params) { @@ -157,35 +157,35 @@ void int8_ref_matmul_nobias_ofp32_batch(const struct matmul_params *params) { } } -void MatmulOperator::mat_mul_accelerator_int8_fast_2x2_32unroll(const struct matmul_params *params) { +void MatmulOperatorCUDA::mat_mul_accelerator_int8_fast_2x2_32unroll(const struct matmul_params *params) { int8_ref_matmul(params); } -void MatmulOperator::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias(const struct matmul_params *params) { +void MatmulOperatorCUDA::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias(const struct matmul_params *params) { int8_ref_matmul_nobias(params); } -void MatmulOperator::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_batch(const struct matmul_params *params) { +void MatmulOperatorCUDA::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_batch(const struct matmul_params *params) { 
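Note on the Create* helpers declared in these backend headers: each one returns a reference to a function-local static, i.e. a Meyers singleton, so the backend object is constructed once on first use and every caller sees the same instance; since C++11 that first construction is also thread-safe. A minimal sketch of the guarantee, with a stand-in Backend type:

#include <cassert>

struct Backend { int value = 42; };   // stand-in for a MatmulOperator subclass

inline Backend &CreateBackend() {
    static Backend instance;          // constructed on the first call only
    return instance;
}

int main() {
    Backend &a = CreateBackend();
    Backend &b = CreateBackend();
    assert(&a == &b);                 // both calls yield the same object
    return 0;
}

The CUDA helper is additionally marked static, which gives it internal linkage: every translation unit that includes matmul_cuda.h (whether compiled as .cc or .cu) gets its own copy of the function, and therefore its own instance. That sidesteps the multiple-definition problem the comment mentions and is harmless as long as the operator classes carry no state, which the headers here suggest.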
int8_ref_matmul_nobias_batch(params); } -void MatmulOperator::mat_mul_accelerator_int8_fast_32unroll_over_column(const struct matmul_params *params) { +void MatmulOperatorCUDA::mat_mul_accelerator_int8_fast_32unroll_over_column(const struct matmul_params *params) { int8_ref_matmul(params); } -void MatmulOperator::mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32(const struct matmul_params *params) { +void MatmulOperatorCUDA::mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32(const struct matmul_params *params) { int8_ref_matmul_bfp32_ofp32(params); } -void MatmulOperator::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32(const struct matmul_params *params) { +void MatmulOperatorCUDA::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32(const struct matmul_params *params) { int8_ref_matmul_nobias_ofp32(params); } -void MatmulOperator::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32_batch(const struct matmul_params *params) { +void MatmulOperatorCUDA::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32_batch(const struct matmul_params *params) { int8_ref_matmul_nobias_ofp32_batch(params); } -void MatmulOperator::mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32_over_column( +void MatmulOperatorCUDA::mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32_over_column( const struct matmul_params *params) { int8_ref_matmul_bfp32_ofp32(params); } diff --git a/kernels/matmul.h b/kernels/matmul.h index 0424edee..8351c189 100644 --- a/kernels/matmul.h +++ b/kernels/matmul.h @@ -109,48 +109,61 @@ struct thread_args { namespace matmul { class MatmulOperator { public: - void mat_mul_transposed(const struct matmul_params *params); - void mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params); - void mat_mul_accelerator_transposed_fastover_column_bias(const struct matmul_params *params); - void mat_mul_accelerator_untransposed_fastover_column(const struct matmul_params *params); - // int8 - void naive_mat_mul_int8(const struct matmul_params *params); - void mat_mul_accelerator_int8_fast_32unroll_over_column(const struct matmul_params *params); - void mat_mul_accelerator_int8_fast_2x2_32unroll(const struct matmul_params *params); - void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias(const struct matmul_params *params); - void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_batch(const struct matmul_params *params); - void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32(const struct matmul_params *params); - void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32_batch(const struct matmul_params *params); - void mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32(const struct matmul_params *params); - void mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32_over_column(const struct matmul_params *params); - // void mat_mul_accelerator_int8_fast_2x2_omp(const struct matmul_params *params); - // int4 - void mat_mul_accelerator_int4_fast(const struct matmul_params *params); - void mat_mul_accelerator_int4_fast_no_offset(const struct matmul_params *params); - void mat_mul_accelerator_int8_int4_fast_no_offset(struct matmul_params *params); - void gemv_accelerator_int8_int4_fast_no_offset(struct matmul_params *params); - void gemm_accelerator_int8_int4_fast_no_offset(struct matmul_params *params); - void gemm_accelerator_int8_int4_fast_no_offset_v2(struct matmul_params *params); - void cblas_gemm_accelerator_no_offset(struct matmul_params *params); - void naive_mat_mul_int4(const struct matmul_params *params); - void 
naive_mat_mul_int4_with_offset(const struct matmul_params *params); - // cuda - void naive_mat_mul_fp16_int4(const struct matmul_params *params); - // void naive_mat_mul_fp16_int4_gemv(const struct matmul_params *params); - void mat_mul_cuda(const struct matmul_params *params); - //// GEMM - void gemm_forward_cuda(const struct matmul_params *params, int split_k_iters); - void gemm_forward_cuda_8splits(const struct matmul_params *params, float16_t *split_8_buffer); - void gemm_forward_cuda_half(const struct matmul_params *params, int split_k_iters); - void gemm_forward_cuda_half_test(const struct matmul_params *params, int split_k_iters); - //// GEMV - void gemv_forward_cuda(const struct matmul_params *params); + virtual ~MatmulOperator() = default; + + // Virtual methods for various matrix multiplication operations + virtual void mat_mul_transposed(const struct matmul_params* params); + virtual void mat_mul_accelerator_transposed_fastover_column(const struct matmul_params* params) {} + virtual void mat_mul_accelerator_transposed_fastover_column_bias(const struct matmul_params* params) {} + virtual void mat_mul_accelerator_untransposed_fastover_column(const struct matmul_params* params) {} + + // int8 operations + virtual void naive_mat_mul_int8(const struct matmul_params* params); + virtual void mat_mul_accelerator_int8_fast_32unroll_over_column(const struct matmul_params* params) {} + virtual void mat_mul_accelerator_int8_fast_2x2_32unroll(const struct matmul_params* params) {} + virtual void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias(const struct matmul_params* params) {} + virtual void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_batch(const struct matmul_params* params) {} + virtual void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32(const struct matmul_params* params) {} + virtual void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32_batch(const struct matmul_params* params) {} + virtual void mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32(const struct matmul_params* params) {} + virtual void mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32_over_column(const struct matmul_params* params) {} + + // int4 operations + virtual void mat_mul_accelerator_int4_fast(const struct matmul_params* params) {} + virtual void mat_mul_accelerator_int4_fast_no_offset(const struct matmul_params* params) {} + virtual void mat_mul_accelerator_int8_int4_fast_no_offset(struct matmul_params* params) {} + virtual void gemv_accelerator_int8_int4_fast_no_offset(struct matmul_params* params) {} + virtual void gemm_accelerator_int8_int4_fast_no_offset(struct matmul_params* params) {} + virtual void gemm_accelerator_int8_int4_fast_no_offset_v2(struct matmul_params* params) {} + virtual void cblas_gemm_accelerator_no_offset(struct matmul_params* params) {} + virtual void naive_mat_mul_int4(const struct matmul_params* params); + virtual void naive_mat_mul_int4_with_offset(const struct matmul_params* params); + + // CUDA-specific operations + virtual void naive_mat_mul_fp16_int4(const struct matmul_params* params) {} + virtual void mat_mul_cuda(const struct matmul_params* params) {} + virtual void gemm_forward_cuda(const struct matmul_params* params, int split_k_iters) {} + virtual void gemm_forward_cuda_8splits(const struct matmul_params* params, float16_t* split_8_buffer) {} + virtual void gemm_forward_cuda_half(const struct matmul_params* params, int split_k_iters) {} + virtual void gemm_forward_cuda_half_test(const struct matmul_params* params, int split_k_iters) {} + 
virtual void gemv_forward_cuda(const struct matmul_params* params) {}
+
+   protected:
+    // Protected constructor to prevent direct instantiation
+    // Concrete backends are obtained through CreateMatmulOperator() instead
+    MatmulOperator() = default;
    private:
+    // Delete the copy-assignment operator for safety
+    MatmulOperator& operator=(const MatmulOperator&) = delete;
+
     float interval_to_us(struct timeval *start, struct timeval *end);
     void CHECK_MATRICES(const struct matrix *A, const struct matrix *B, const struct matrix *C);
     void CHECK_MATRICES_int4weight(const struct matrix *A, const struct matrix *B, const struct matrix *C);
 };
+
+MatmulOperator& CreateMatmulOperator();
+
 } // namespace matmul
 #endif
diff --git a/kernels/matmul_factory.cc b/kernels/matmul_factory.cc
new file mode 100644
index 00000000..228456ca
--- /dev/null
+++ b/kernels/matmul_factory.cc
@@ -0,0 +1,31 @@
+#include "matmul.h"
+#include "avx/matmul_avx.h"
+#include "mkl/matmul_mkl.h"
+#include "cuda/matmul_cuda.h"
+#include "neon/matmul_neon.h"
+#include "ref/matmul_ref.h"
+
+namespace matmul {
+
+// Declare external factory functions for each implementation
+MatmulOperator& CreateMatmulOperatorMKL();
+MatmulOperator& CreateMatmulOperatorAVX();
+MatmulOperator& CreateMatmulOperatorCUDA();
+MatmulOperator& CreateMatmulOperatorNeon();
+MatmulOperator& CreateMatmulOperatorRef();
+
+MatmulOperator& CreateMatmulOperator() {
+#ifdef QM_CUDA
+    return CreateMatmulOperatorCUDA();
+#elif defined(QM_MKL)
+    return CreateMatmulOperatorMKL();
+#elif defined(QM_ARM)
+    return CreateMatmulOperatorNeon();
+#elif defined(QM_x86)
+    return CreateMatmulOperatorAVX(); // Default to AVX
+#else
+    return CreateMatmulOperatorRef();
+#endif
+}
+
+}
\ No newline at end of file
diff --git a/kernels/matmul_int8.cc b/kernels/matmul_int8.cc
index 3d5f2ed3..03e50dc3 100644
--- a/kernels/matmul_int8.cc
+++ b/kernels/matmul_int8.cc
@@ -13,7 +13,7 @@ void MatmulOperator::naive_mat_mul_int8(const struct matmul_params *params) {
     float effective_scale = A_sc * B_sc / C_sc;
     int8_t *data_A = A->int8_data_ptr, *data_B = B->int8_data_ptr, *data_C = C->int8_data_ptr;
     const int8_t q_min = C->qparams.q_min, q_max = C->qparams.q_max;
-    CHECK_MATRICES(A, B, C);
+    // CHECK_MATRICES(A, B, C);
 
     for (i = 0; i < C->row; i++)
         for (j = 0; j < C->column; j++) {
diff --git a/kernels/mkl/matmul_mkl.h b/kernels/mkl/matmul_mkl.h
new file mode 100644
index 00000000..903b0939
--- /dev/null
+++ b/kernels/mkl/matmul_mkl.h
@@ -0,0 +1,37 @@
+#ifndef MATMUL_OPERATOR_MKL_H
+#define MATMUL_OPERATOR_MKL_H
+
+#include "matmul.h"
+#include
+
+namespace matmul {
+
+class MatmulOperatorMKL : public MatmulOperator {
+   public:
+    void mat_mul_accelerator_transposed_fastover_column(const struct matmul_params* params) override;
+    void mat_mul_accelerator_transposed_fastover_column_bias(const struct matmul_params* params) override;
+
+    // int8 operations
+    void mat_mul_accelerator_int8_fast_32unroll_over_column(const struct matmul_params* params) override;
+    void mat_mul_accelerator_int8_fast_2x2_32unroll(const struct matmul_params* params) override;
+    void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias(const struct matmul_params* params) override;
+    void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_batch(const struct matmul_params* params) override;
+    void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32(const struct matmul_params* params) override;
+    void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32_batch(const struct matmul_params* params)
override; + void mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32(const struct matmul_params* params) override; + void mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32_over_column(const struct matmul_params* params) override; + + void mat_mul_accelerator_int8_int4_fast_no_offset(struct matmul_params* params) override; + + void mat_mul_accelerator_int4_fast(const struct matmul_params* params) override; + void mat_mul_accelerator_int4_fast_no_offset(const struct matmul_params* params) override; +}; + +inline MatmulOperator& CreateMatmulOperatorMKL() { + static MatmulOperatorMKL instance; + return instance; +} + +} // namespace matmul + +#endif diff --git a/kernels/mkl/matmul_mkl_int8.cc b/kernels/mkl/matmul_mkl_int8.cc new file mode 100644 index 00000000..145eff68 --- /dev/null +++ b/kernels/mkl/matmul_mkl_int8.cc @@ -0,0 +1,207 @@ +#include +#include +#include +#include +#include +#include + +#include "matmul_mkl.h" +#include "../avx/matmul_avx.h" + +namespace matmul{ +void mat_mul_mkl_int8_effective_scale(const matmul_params *params) { + const matrix &A = params->A; + const matrix &B = params->B; + matrix &C = const_cast(params->C); + + int M = A.row; + int K = A.column; + int N = B.column; + + const int8_t *data_A = A.int8_data_ptr; + int32_t *data_C = new int32_t[M * N]; // Temporary buffer for accumulation + + const int8_t *data_B = B.int8_data_ptr; + + // Shift A instead of B by adding 128: + // We shift A because the mkl interface expects A to be unsigned instead of B if row-major + uint8_t *data_A_shifted = new uint8_t[M * K]; + for (int i = 0; i < M * K; ++i) { + int16_t temp = static_cast(A.int8_data_ptr[i] + 128) ; + data_A_shifted[i] = static_cast(temp); + } + + MKL_INT lda = K; + MKL_INT ldb = K; + MKL_INT ldc = N; + + + const MKL_INT8 ao = -(A.qparams.zero_point+128); + assert(B.qparams.zero_point == 0); + const MKL_INT8 bo = -(B.qparams.zero_point); // Adjusted zero point for B + MKL_INT32 co = C.qparams.zero_point; + + float effective_scale = A.qparams.scale * B.qparams.scale / C.qparams.scale; + + float alpha = 1.0f; + float beta = 0.0f; + cblas_gemm_s8u8s32( + CblasRowMajor, CblasNoTrans, CblasTrans, CblasFixOffset, + M, N, K, + alpha, + data_A_shifted, lda, ao, + data_B, ldb, bo, + beta, + data_C, ldc, &co + ); + + // Post-process the result + int8_t *data_C_int8 = C.int8_data_ptr; + for (int i = 0; i < M * N; ++i) { + int32_t acc = data_C[i]; + acc = static_cast(std::round(acc * effective_scale)); + if (params->bias.int8_data_ptr) { + acc += static_cast(params->bias.int8_data_ptr[i % N]); + } + + acc = std::max(static_cast(C.qparams.q_min), acc); + acc = std::min(static_cast(C.qparams.q_max), acc); + + data_C_int8[i] = static_cast(acc); + } + + // Clean up + delete[] data_A_shifted; + delete[] data_C; +} +void mat_mul_mkl_int8_alpha(const matmul_params *params) { + const matrix &A = params->A; + const matrix &B = params->B; + matrix &C = const_cast(params->C); + + int M = A.row; + int K = A.column; + int N = B.column; + + const int8_t *data_A = A.int8_data_ptr; + int32_t *data_C = new int32_t[M * N]; // Temporary buffer for accumulation + + const int8_t *data_B = B.int8_data_ptr; + + // Shift A instead of B by adding 128: + // We shift A because the mkl interface expects A to be unsigned instead of B if row-major + uint8_t *data_A_shifted = new uint8_t[M * K]; + for (int i = 0; i < M * K; ++i) { + int16_t temp = static_cast(A.int8_data_ptr[i] + 128) ; + data_A_shifted[i] = static_cast(temp); + } + + MKL_INT lda = K; + MKL_INT ldb = K; + MKL_INT ldc = N; + + 
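Note on the A + 128 shift used by both MKL paths in this file: per oneMKL's definition, cblas_gemm_s8u8s32 computes C = alpha * (op(A) + ao) * (op(B) + bo) + beta * C (plus the co offset applied to C), and in this row-major configuration the A operand has to be unsigned 8-bit. Feeding the shifted matrix A' = A + 128 and compensating through the ao offset chosen below therefore recovers exactly the zero-point-corrected term the int8 kernels need:

    (A + 128) + ao = (A + 128) - (zero_point_A + 128) = A - zero_point_A

With B's zero point asserted to be zero, bo = 0 and B passes through unchanged. The +128 shift and the ao definition must always change together.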
const MKL_INT8 ao = -(A.qparams.zero_point+128); + assert(B.qparams.zero_point == 0); + const MKL_INT8 bo = -(B.qparams.zero_point); // Adjusted zero point for B + MKL_INT32 co = C.qparams.zero_point; + + float alpha = 1.0f; + float beta = 0.0f; + cblas_gemm_s8u8s32( + CblasRowMajor, CblasNoTrans, CblasTrans, CblasFixOffset, + M, N, K, + alpha, + data_A_shifted, lda, ao, + data_B, ldb, bo, + beta, + data_C, ldc, &co + ); + + // Post-process the result + int8_t *data_C_int8 = C.int8_data_ptr; + for (int i = 0; i < M * N; ++i) { + int32_t acc = data_C[i]; + acc = static_cast(std::round(((float)acc * params->alpha + + (float)(params->bias.int8_data_ptr[i % N]) * params->beta))); + + acc = std::max(static_cast(C.qparams.q_min), acc); + acc = std::min(static_cast(C.qparams.q_max), acc); + + data_C_int8[i] = static_cast(acc); + } + + // Clean up + delete[] data_A_shifted; + delete[] data_C; +} + +void MatmulOperatorMKL::mat_mul_accelerator_int8_fast_32unroll_over_column(const struct matmul_params *params) { + // std::cout << "Running mat_mul_mkl" << std::endl; + MatmulOperatorAVX fallback; + fallback.mat_mul_accelerator_int8_fast_32unroll_over_column(params); + // mat_mul_mkl_int8(params); +} + +void MatmulOperatorMKL::mat_mul_accelerator_int8_fast_2x2_32unroll(const struct matmul_params *params) { + mat_mul_mkl_int8_alpha(params); + // MatmulOperatorAVX fallback; + // fallback.mat_mul_accelerator_int8_fast_2x2_32unroll(params); +} + +// avx fallback operations +void MatmulOperatorMKL::mat_mul_accelerator_transposed_fastover_column(const struct matmul_params* params) { + MatmulOperatorAVX fallback; + fallback.mat_mul_accelerator_transposed_fastover_column(params); +} + +void MatmulOperatorMKL::mat_mul_accelerator_transposed_fastover_column_bias(const struct matmul_params* params) { + MatmulOperatorAVX fallback; + fallback.mat_mul_accelerator_transposed_fastover_column_bias(params); +} + +void MatmulOperatorMKL::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias(const struct matmul_params* params) { + mat_mul_mkl_int8_effective_scale(params); +} + +void MatmulOperatorMKL::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_batch(const struct matmul_params* params) { + MatmulOperatorAVX fallback; + fallback.mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_batch(params); +} + +void MatmulOperatorMKL::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32(const struct matmul_params* params) { + MatmulOperatorAVX fallback; + fallback.mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32(params); +} + +void MatmulOperatorMKL::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32_batch(const struct matmul_params* params) { + MatmulOperatorAVX fallback; + fallback.mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32_batch(params); +} + +void MatmulOperatorMKL::mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32(const struct matmul_params* params) { + MatmulOperatorAVX fallback; + fallback.mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32(params); +} + +void MatmulOperatorMKL::mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32_over_column(const struct matmul_params* params) { + MatmulOperatorAVX fallback; + fallback.mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32_over_column(params); +} + +void MatmulOperatorMKL::mat_mul_accelerator_int8_int4_fast_no_offset(struct matmul_params* params) { + MatmulOperatorAVX fallback; + fallback.mat_mul_accelerator_int8_int4_fast_no_offset(params); +} + +void MatmulOperatorMKL::mat_mul_accelerator_int4_fast(const struct matmul_params* params) { + 
MatmulOperatorAVX fallback; + fallback.mat_mul_accelerator_int4_fast(params); +} + +void MatmulOperatorMKL::mat_mul_accelerator_int4_fast_no_offset(const struct matmul_params* params) { + MatmulOperatorAVX fallback; + fallback.mat_mul_accelerator_int4_fast_no_offset(params); +} + +} // namespace matmul + diff --git a/kernels/neon/matmul_neon.h b/kernels/neon/matmul_neon.h new file mode 100644 index 00000000..4b17330d --- /dev/null +++ b/kernels/neon/matmul_neon.h @@ -0,0 +1,42 @@ +#ifndef MATMUL_OPERATOR_Neon_H +#define MATMUL_OPERATOR_Neon_H + +#include "matmul.h" +#include + +namespace matmul { + +class MatmulOperatorNeon : public MatmulOperator { + public: + void mat_mul_accelerator_transposed_fastover_column(const struct matmul_params* params) override; + void mat_mul_accelerator_transposed_fastover_column_bias(const struct matmul_params* params) override; + + // int8 operations + void mat_mul_accelerator_int8_fast_32unroll_over_column(const struct matmul_params* params) override; + void mat_mul_accelerator_int8_fast_2x2_32unroll(const struct matmul_params* params) override; + void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias(const struct matmul_params* params) override; + void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_batch(const struct matmul_params* params) override; + void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32(const struct matmul_params* params) override; + void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32_batch(const struct matmul_params* params) override; + void mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32(const struct matmul_params* params) override; + void mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32_over_column(const struct matmul_params* params) override; + + void mat_mul_accelerator_int8_int4_fast_no_offset(struct matmul_params* params) override; + + void mat_mul_accelerator_int4_fast(const struct matmul_params* params) override; + void mat_mul_accelerator_int4_fast_no_offset(const struct matmul_params* params) override; + void gemm_accelerator_int8_int4_fast_no_offset(struct matmul_params* params) override; + void gemv_accelerator_int8_int4_fast_no_offset(struct matmul_params* params) override; + void gemm_accelerator_int8_int4_fast_no_offset_v2(struct matmul_params* params) override; + void cblas_gemm_accelerator_no_offset(struct matmul_params* params) override; + void mat_mul_accelerator_untransposed_fastover_column(const struct matmul_params* params) override; +}; + +inline MatmulOperator& CreateMatmulOperatorNeon() { + static MatmulOperatorNeon instance; + return instance; +} + +} // namespace matmul + +#endif diff --git a/kernels/neon/matmul_neon_fp32.cc b/kernels/neon/matmul_neon_fp32.cc index e056b922..e37c1d05 100644 --- a/kernels/neon/matmul_neon_fp32.cc +++ b/kernels/neon/matmul_neon_fp32.cc @@ -11,7 +11,7 @@ #endif #include "common.h" -#include "../matmul.h" +#include "matmul_neon.h" #include "pthread_pool.h" struct fp32_thread_args { @@ -120,7 +120,7 @@ void fp32_matmul_bias_cblas_gemm(const struct matmul_params *params) { } #endif -void MatmulOperator::mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params) { +void MatmulOperatorNeon::mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params) { #ifdef USE_ACCELERATE fp32_matmul_transposed_cblas_gemm(params); #else @@ -128,7 +128,7 @@ void MatmulOperator::mat_mul_accelerator_transposed_fastover_column(const struct #endif } -void MatmulOperator::mat_mul_accelerator_untransposed_fastover_column(const 
struct matmul_params *params) { +void MatmulOperatorNeon::mat_mul_accelerator_untransposed_fastover_column(const struct matmul_params *params) { #ifdef USE_ACCELERATE fp32_matmul_untransposed_cblas_gemm(params); #endif @@ -260,7 +260,7 @@ inline static void* fp32_matmul_bias_optimized_gemm(void* args) { return NULL; } -void MatmulOperator::mat_mul_accelerator_transposed_fastover_column_bias(const struct matmul_params *params) { +void MatmulOperatorNeon::mat_mul_accelerator_transposed_fastover_column_bias(const struct matmul_params *params) { #ifdef USE_ACCELERATE fp32_matmul_bias_cblas_gemm(params); #else diff --git a/kernels/neon/matmul_neon_int4.cc b/kernels/neon/matmul_neon_int4.cc index d43453e3..f0af437a 100644 --- a/kernels/neon/matmul_neon_int4.cc +++ b/kernels/neon/matmul_neon_int4.cc @@ -6,7 +6,7 @@ #include #include -#include "../matmul.h" +#include "matmul_neon.h" static inline void dequantize_block_q4_unroll2_no_offset(const uint8_t *int4_w, float *y, float scale, const uint8_t *int4_w_2, float *y_2, float scale_2, @@ -398,7 +398,7 @@ static void *fast_zp_no_offset_over_column_func_v3(void *args) { namespace matmul { -void MatmulOperator::mat_mul_accelerator_int4_fast_no_offset(const struct matmul_params *params) { +void MatmulOperatorNeon::mat_mul_accelerator_int4_fast_no_offset(const struct matmul_params *params) { // const int num_thread = 32; const int num_thread = params->opt_params.num_thread; int i, j, k; diff --git a/kernels/neon/matmul_neon_int4_offset.cc b/kernels/neon/matmul_neon_int4_offset.cc index 5cac2a74..fc472afa 100644 --- a/kernels/neon/matmul_neon_int4_offset.cc +++ b/kernels/neon/matmul_neon_int4_offset.cc @@ -6,7 +6,7 @@ #include #include -#include "../matmul.h" +#include "matmul_neon.h" static void dequantize_block_q4(const uint8_t *int4_w, float *y, float scale, float offset, int block_size) { const float32x4_t vd = vdupq_n_f32(scale); @@ -279,7 +279,7 @@ static void *fast_over_column_func_v1(void *args) { namespace matmul { -void MatmulOperator::mat_mul_accelerator_int4_fast(const struct matmul_params *params) { +void MatmulOperatorNeon::mat_mul_accelerator_int4_fast(const struct matmul_params *params) { // const int num_thread = 16; const int num_thread = params->opt_params.num_thread; int i, j, k; diff --git a/kernels/neon/matmul_neon_int8_int4.cc b/kernels/neon/matmul_neon_int8_int4.cc index 8b5bdf42..94482792 100644 --- a/kernels/neon/matmul_neon_int8_int4.cc +++ b/kernels/neon/matmul_neon_int8_int4.cc @@ -10,7 +10,7 @@ #include #endif -#include "../matmul.h" +#include "matmul_neon.h" #include "common.h" #include "pthread_pool.h" @@ -1293,7 +1293,7 @@ inline static void* fp32_matmul_transposed_cblas_gemm(void* args) { #endif namespace matmul { -void MatmulOperator::mat_mul_accelerator_int8_int4_fast_no_offset(struct matmul_params* params) { +void MatmulOperatorNeon::mat_mul_accelerator_int8_int4_fast_no_offset(struct matmul_params* params) { int i, j, k; const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; const int block_size = params->block_size; @@ -1342,7 +1342,7 @@ void MatmulOperator::mat_mul_accelerator_int8_int4_fast_no_offset(struct matmul_ pool_wait(pool); }; -void MatmulOperator::gemv_accelerator_int8_int4_fast_no_offset(struct matmul_params* params) { +void MatmulOperatorNeon::gemv_accelerator_int8_int4_fast_no_offset(struct matmul_params* params) { int i, j, k; const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; const int block_size = params->block_size; @@ -1374,7 +1374,7 @@ void 
MatmulOperator::gemv_accelerator_int8_int4_fast_no_offset(struct matmul_par pool_wait(pool); }; -void MatmulOperator::gemm_accelerator_int8_int4_fast_no_offset(struct matmul_params* params) { +void MatmulOperatorNeon::gemm_accelerator_int8_int4_fast_no_offset(struct matmul_params* params) { int i, j, k; const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; const int block_size = params->block_size; @@ -1406,7 +1406,7 @@ void MatmulOperator::gemm_accelerator_int8_int4_fast_no_offset(struct matmul_par pool_wait(pool); }; -void MatmulOperator::gemm_accelerator_int8_int4_fast_no_offset_v2(struct matmul_params* params) { +void MatmulOperatorNeon::gemm_accelerator_int8_int4_fast_no_offset_v2(struct matmul_params* params) { int i, j, k; const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; const int block_size = params->block_size; @@ -1439,7 +1439,7 @@ void MatmulOperator::gemm_accelerator_int8_int4_fast_no_offset_v2(struct matmul_ }; #ifdef USE_ACCELERATE -void MatmulOperator::cblas_gemm_accelerator_no_offset(struct matmul_params* params) { +void MatmulOperatorNeon::cblas_gemm_accelerator_no_offset(struct matmul_params* params) { int i, j, k; const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; const int block_size = params->block_size; diff --git a/kernels/neon/matmul_ref_int8.cc b/kernels/neon/matmul_ref_int8.cc index ab7ae40c..fdc4eb9e 100644 --- a/kernels/neon/matmul_ref_int8.cc +++ b/kernels/neon/matmul_ref_int8.cc @@ -5,7 +5,7 @@ #include #include -#include "../matmul.h" +#include "matmul_neon.h" namespace matmul { void int8_ref_matmul(const struct matmul_params *params) { @@ -158,35 +158,35 @@ void int8_ref_matmul_nobias_ofp32_batch(const struct matmul_params *params) { } } -void MatmulOperator::mat_mul_accelerator_int8_fast_2x2_32unroll(const struct matmul_params *params) { +void MatmulOperatorNeon::mat_mul_accelerator_int8_fast_2x2_32unroll(const struct matmul_params *params) { int8_ref_matmul(params); } -void MatmulOperator::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias(const struct matmul_params *params) { +void MatmulOperatorNeon::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias(const struct matmul_params *params) { int8_ref_matmul_nobias(params); } -void MatmulOperator::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_batch(const struct matmul_params *params) { +void MatmulOperatorNeon::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_batch(const struct matmul_params *params) { int8_ref_matmul_nobias_batch(params); } -void MatmulOperator::mat_mul_accelerator_int8_fast_32unroll_over_column(const struct matmul_params *params) { +void MatmulOperatorNeon::mat_mul_accelerator_int8_fast_32unroll_over_column(const struct matmul_params *params) { int8_ref_matmul(params); } -void MatmulOperator::mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32(const struct matmul_params *params) { +void MatmulOperatorNeon::mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32(const struct matmul_params *params) { int8_ref_matmul_bfp32_ofp32(params); } -void MatmulOperator::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32(const struct matmul_params *params) { +void MatmulOperatorNeon::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32(const struct matmul_params *params) { int8_ref_matmul_nobias_ofp32(params); } -void MatmulOperator::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32_batch(const struct matmul_params *params) { +void MatmulOperatorNeon::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32_batch(const struct matmul_params *params) { 
int8_ref_matmul_nobias_ofp32_batch(params); } -void MatmulOperator::mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32_over_column( +void MatmulOperatorNeon::mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32_over_column( const struct matmul_params *params) { int8_ref_matmul_bfp32_ofp32(params); } diff --git a/kernels/ref/matmul_ref.h b/kernels/ref/matmul_ref.h new file mode 100644 index 00000000..1a666550 --- /dev/null +++ b/kernels/ref/matmul_ref.h @@ -0,0 +1,33 @@ +#ifndef MATMUL_OPERATOR_Ref_H +#define MATMUL_OPERATOR_Ref_H + +#include "matmul.h" +#include + +namespace matmul { + +class MatmulOperatorRef : public MatmulOperator { + public: + void mat_mul_accelerator_transposed_fastover_column(const struct matmul_params* params) override; + + // int8 operations + void mat_mul_accelerator_int8_fast_32unroll_over_column(const struct matmul_params* params) override; + void mat_mul_accelerator_int8_fast_2x2_32unroll(const struct matmul_params* params) override; + void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias(const struct matmul_params* params) override; + void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_batch(const struct matmul_params* params) override; + void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32(const struct matmul_params* params) override; + void mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32_batch(const struct matmul_params* params) override; + void mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32(const struct matmul_params* params) override; + void mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32_over_column(const struct matmul_params* params) override; + + void mat_mul_accelerator_int4_fast(const struct matmul_params* params) override; +}; + +inline MatmulOperator& CreateMatmulOperatorRef() { + static MatmulOperatorRef instance; + return instance; +} + +} // namespace matmul + +#endif diff --git a/kernels/ref/matmul_ref_fp32.cc b/kernels/ref/matmul_ref_fp32.cc index 258610a4..5913b7bf 100644 --- a/kernels/ref/matmul_ref_fp32.cc +++ b/kernels/ref/matmul_ref_fp32.cc @@ -5,7 +5,7 @@ #include #include -#include "../matmul.h" +#include "matmul_ref.h" namespace matmul { void fp32_ref_matmul(const struct matmul_params *params) { @@ -29,7 +29,7 @@ void fp32_ref_matmul(const struct matmul_params *params) { } } -void MatmulOperator::mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params) { +void MatmulOperatorRef::mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params) { fp32_ref_matmul(params); } diff --git a/kernels/ref/matmul_ref_int4.cc b/kernels/ref/matmul_ref_int4.cc index 0f456991..c597f179 100644 --- a/kernels/ref/matmul_ref_int4.cc +++ b/kernels/ref/matmul_ref_int4.cc @@ -5,10 +5,10 @@ #include #include -#include "../matmul.h" +#include "matmul_ref.h" namespace matmul { -void MatmulOperator::mat_mul_accelerator_int4_fast(const struct matmul_params *params) { +void MatmulOperatorRef::mat_mul_accelerator_int4_fast(const struct matmul_params *params) { int i, j, k; const struct matrix *A = ¶ms->A, *B = ¶ms->B, *C = ¶ms->C; const int block_size = params->block_size; diff --git a/kernels/ref/matmul_ref_int8.cc b/kernels/ref/matmul_ref_int8.cc index ab7ae40c..a0449942 100644 --- a/kernels/ref/matmul_ref_int8.cc +++ b/kernels/ref/matmul_ref_int8.cc @@ -5,7 +5,7 @@ #include #include -#include "../matmul.h" +#include "matmul_ref.h" namespace matmul { void int8_ref_matmul(const struct matmul_params *params) { @@ -158,35 +158,35 @@ void int8_ref_matmul_nobias_ofp32_batch(const 
struct matmul_params *params) { } } -void MatmulOperator::mat_mul_accelerator_int8_fast_2x2_32unroll(const struct matmul_params *params) { +void MatmulOperatorRef::mat_mul_accelerator_int8_fast_2x2_32unroll(const struct matmul_params *params) { int8_ref_matmul(params); } -void MatmulOperator::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias(const struct matmul_params *params) { +void MatmulOperatorRef::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias(const struct matmul_params *params) { int8_ref_matmul_nobias(params); } -void MatmulOperator::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_batch(const struct matmul_params *params) { +void MatmulOperatorRef::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_batch(const struct matmul_params *params) { int8_ref_matmul_nobias_batch(params); } -void MatmulOperator::mat_mul_accelerator_int8_fast_32unroll_over_column(const struct matmul_params *params) { +void MatmulOperatorRef::mat_mul_accelerator_int8_fast_32unroll_over_column(const struct matmul_params *params) { int8_ref_matmul(params); } -void MatmulOperator::mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32(const struct matmul_params *params) { +void MatmulOperatorRef::mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32(const struct matmul_params *params) { int8_ref_matmul_bfp32_ofp32(params); } -void MatmulOperator::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32(const struct matmul_params *params) { +void MatmulOperatorRef::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32(const struct matmul_params *params) { int8_ref_matmul_nobias_ofp32(params); } -void MatmulOperator::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32_batch(const struct matmul_params *params) { +void MatmulOperatorRef::mat_mul_accelerator_int8_fast_2x2_32unroll_nobias_ofp32_batch(const struct matmul_params *params) { int8_ref_matmul_nobias_ofp32_batch(params); } -void MatmulOperator::mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32_over_column( +void MatmulOperatorRef::mat_mul_accelerator_int8_fast_2x2_32unroll_bfp32_ofp32_over_column( const struct matmul_params *params) { int8_ref_matmul_bfp32_ofp32(params); } diff --git a/llm/Makefile b/llm/Makefile index 94c34bb9..6960c0cc 100644 --- a/llm/Makefile +++ b/llm/Makefile @@ -5,6 +5,7 @@ CXXFLAGS = -std=c++11 -pthread -Ofast # CUDA flag DISABLE_CUDA ?= 0 DEC_SHARED_MEM ?= 0 +USE_MKL ?= 0 # customize define DEFINE = @@ -21,7 +22,9 @@ TARGET = $(TEST_TARGET_GENERAL) $(TEST_TARGET_IF_CUDA) $(PROFILE_TARGET) $(CHAT_ BUILDDIR := build/transformer PROFILEDIR := build_profile/transformer LIB_DIR = ../kernels +FACTORY_SRC = $(LIB_DIR)/matmul_factory.cc # For dynamic dispatching of matmul kernels LIB_SRC = $(wildcard $(LIB_DIR)/*.cc) +LIB_SRC += $(FACTORY_SRC) INCLUDE_DIRS = -I$(LIB_DIR) -I./include -I./include/nn_modules -I./json/single_include/ -I./half-2.2.0/include/ LIB = LDFLAGS = @@ -55,8 +58,15 @@ $(info Detected CUDA_PATH: $(CUDA_HOME)) else CUDA_HOME = /usr/local/cuda CXX = $(CUDA_HOME)/bin/nvcc + GPU_ARCH := $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null | head -n 1 | sed 's/\.//') + + ifeq ($(GPU_ARCH),) # Please modify 'arch=compute_87,code=sm_87' according to your GPU architecture/compute capability (https://developer.nvidia.com/cuda-gpus) - CXXFLAGS = -std=c++17 -Xptxas -O3 -gencode arch=compute_87,code=sm_87 --forward-unknown-to-host-compiler -Xcompiler "-pthread" -DQM_CUDA -DENABLE_BF16 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_BFLOAT16_OPERATORS__ 
-U__CUDA_NO_BFLOAT16_CONVERSIONS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT162_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --threads=8 + GPU_ARCH := 87 + $(warning Unable to detect GPU compute capability. Using default compute capability: compute_$(GPU_ARCH), sm_$(GPU_ARCH)) + endif + + CXXFLAGS = -std=c++17 -Xptxas -O3 -gencode arch=compute_$(GPU_ARCH),code=sm_$(GPU_ARCH) --forward-unknown-to-host-compiler -Xcompiler "-pthread" -DQM_CUDA -DENABLE_BF16 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT162_CONVERSIONS__ --expt-relaxed-constexpr --expt-extended-lambda --use_fast_math --threads=8 endif # LIB_SRC_CUDA_CC = $(wildcard $(LIB_DIR)/cuda/*.cc) $(wildcard $(LIB_DIR)/cuda/attention/*.cc) # LIB_SRC_CUDA_CU = $(wildcard $(LIB_DIR)/cuda/*.cu) $(wildcard $(LIB_DIR)/cuda/attention/*.cu) $(wildcard src/*.cu) $(wildcard src/nn_modules/cuda/*.cu) $(wildcard src/ops/cuda/*.cu) @@ -82,10 +92,37 @@ ifeq ($(shell uname -m),x86_64) endif else # For x86_64 platforms with AVX2 - # For Intel machines with AVX - LIB_AVX_SRC = $(wildcard $(LIB_DIR)/avx/*.cc) - LIB_SRC += $(LIB_AVX_SRC) - CXXFLAGS += -mavx2 -mfma -ffast-math -DUSE_INT8_INT4_PRODUCT -fpermissive -DQM_x86 + ifeq ($(USE_MKL),1) + $(info Using MKL kernels instead of AVX kernels) + # Set MKLROOT (adjust this path if necessary) + MKLROOT ?= /home/elliotliu/miniconda + + # Add MKL compiler flags + CXXFLAGS += -DMKL_ILP64 -DUSE_MKL -m64 -mavx2 -mfma -ffast-math -DQM_MKL -DQM_x86 + + # Add MKL include directories + INCLUDE_DIRS += -I$(MKLROOT)/include + + # Add MKL libraries to LDFLAGS + LDFLAGS += -Wl,--start-group \ + $(MKLROOT)/lib/libmkl_intel_ilp64.so \ + $(MKLROOT)/lib/libmkl_gnu_thread.so \ + $(MKLROOT)/lib/libmkl_core.so \ + -Wl,--end-group -lgomp -lpthread -lm -ldl \ + -Wl,-rpath,$(MKLROOT)/lib + + # Include MKL kernels and AVX kernels + LIB_MKL_SRC = $(wildcard $(LIB_DIR)/mkl/*_int*.cc) + LIB_SRC += $(LIB_MKL_SRC) + + LIB_AVX_SRC = $(wildcard $(LIB_DIR)/avx/*.cc) + LIB_SRC += $(LIB_AVX_SRC) + else + # Use AVX kernels + LIB_AVX_SRC = $(wildcard $(LIB_DIR)/avx/*.cc) + LIB_SRC += $(LIB_AVX_SRC) + CXXFLAGS += -mavx2 -mfma -ffast-math -DUSE_INT8_INT4_PRODUCT -fpermissive -DQM_x86 + endif endif else ifeq ($(shell uname -m),aarch64) ifdef CUDA_AVAILABLE diff --git a/llm/src/ops/BMM_F32T.cc b/llm/src/ops/BMM_F32T.cc index bff8b7d4..653be693 100644 --- a/llm/src/ops/BMM_F32T.cc +++ b/llm/src/ops/BMM_F32T.cc @@ -31,7 +31,7 @@ void BMM_F32T::forward(const Matrix3D &a, const Matrix3D &weight, params.opt_params.num_thread = NUM_THREAD; params.alpha = alpha; - matmul::MatmulOperator op = matmul::MatmulOperator(); + matmul::MatmulOperator &op = matmul::CreateMatmulOperator(); for (int bz = 0; bz < a.m_dim_x; bz++) { // if (params.A.column % 8 == 0) // TODO: debug this @@ -82,7 +82,7 @@ void BMM_F32T::forward_weight_untransposed(const Matrix3D &a, const Matri params.opt_params.num_thread = NUM_THREAD; params.alpha = alpha; - matmul::MatmulOperator op = matmul::MatmulOperator(); + matmul::MatmulOperator &op = matmul::CreateMatmulOperator(); for (int i = 0; i < m * n * a.m_dim_x; i++) { params.C.data_ptr[i] = 0; diff --git a/llm/src/ops/BMM_S8T_S8N_F32T.cc b/llm/src/ops/BMM_S8T_S8N_F32T.cc index f5ef8701..ba2bd5f7 100644 --- a/llm/src/ops/BMM_S8T_S8N_F32T.cc +++ b/llm/src/ops/BMM_S8T_S8N_F32T.cc @@ -41,7 +41,7 @@ void BMM_S8T_S8N_F32T::forward(const Matrix3D &x, const Matrix3D 
params.C.qparams.q_min = -128; params.alpha = alpha; - matmul::MatmulOperator matmul_op = matmul::MatmulOperator(); + matmul::MatmulOperator &matmul_op = matmul::CreateMatmulOperator(); if (m == 1 && x.m_dim_x > 1) { // merge each batch params.A.row = x.m_dim_x; diff --git a/llm/src/ops/BMM_S8T_S8N_S8T.cc b/llm/src/ops/BMM_S8T_S8N_S8T.cc index 1dfb534e..1835366a 100644 --- a/llm/src/ops/BMM_S8T_S8N_S8T.cc +++ b/llm/src/ops/BMM_S8T_S8N_S8T.cc @@ -41,7 +41,7 @@ void BMM_S8T_S8N_S8T::forward(const Matrix3D &x, const Matrix3D params.C.qparams.q_min = -128; params.alpha = alpha; - matmul::MatmulOperator matmul_op = matmul::MatmulOperator(); + matmul::MatmulOperator &matmul_op = matmul::CreateMatmulOperator(); // process each batch if (m == 1 && x.m_dim_x > 1) { diff --git a/llm/src/ops/W8A8B8O8Linear.cc b/llm/src/ops/W8A8B8O8Linear.cc index 33e6098f..ade99e94 100644 --- a/llm/src/ops/W8A8B8O8Linear.cc +++ b/llm/src/ops/W8A8B8O8Linear.cc @@ -55,7 +55,7 @@ void W8A8B8O8Linear::forward(const Matrix3D &x, Matrix3D &output params.alpha = alpha; params.beta = beta; - matmul::MatmulOperator matmul_op = matmul::MatmulOperator(); + matmul::MatmulOperator &matmul_op = matmul::CreateMatmulOperator(); // printf("W8A8B8O8Linear-m,n,k: %d, %d, %d\n", m,n,k); if (m == 1) { diff --git a/llm/src/ops/W8A8B8O8LinearReLU.cc b/llm/src/ops/W8A8B8O8LinearReLU.cc index a965b007..89700556 100644 --- a/llm/src/ops/W8A8B8O8LinearReLU.cc +++ b/llm/src/ops/W8A8B8O8LinearReLU.cc @@ -57,7 +57,7 @@ void W8A8B8O8LinearReLU::forward(const Matrix3D &x, Matrix3D &ou params.alpha = alpha; params.beta = beta; - matmul::MatmulOperator matmul_op = matmul::MatmulOperator(); + matmul::MatmulOperator &matmul_op = matmul::CreateMatmulOperator(); if (m == 1) { // let's loop over the column dim instead of row diff --git a/llm/src/ops/W8A8BFP32OFP32Linear.cc b/llm/src/ops/W8A8BFP32OFP32Linear.cc index 0702e21f..dbbccd06 100644 --- a/llm/src/ops/W8A8BFP32OFP32Linear.cc +++ b/llm/src/ops/W8A8BFP32OFP32Linear.cc @@ -52,7 +52,7 @@ void W8A8BFP32OFP32Linear::forward(const Matrix3D &x, Matrix3D &o params.C.qparams.zero_point = 0; params.alpha = alpha; - matmul::MatmulOperator matmul_op = matmul::MatmulOperator(); + matmul::MatmulOperator &matmul_op = matmul::CreateMatmulOperator(); if (m == 1) { // let's loop over the column dim instead of row diff --git a/llm/src/ops/cuda/linear.cu b/llm/src/ops/cuda/linear.cu index 6e65c980..4868ec67 100644 --- a/llm/src/ops/cuda/linear.cu +++ b/llm/src/ops/cuda/linear.cu @@ -32,7 +32,7 @@ void Linear_half_int4::forward(const Matrix3D &x, Matrix3D params.int32_zero_point = this->zero_point.m_data; params.block_size = QK; - matmul::MatmulOperator op = matmul::MatmulOperator(); + matmul::MatmulOperator &op = matmul::CreateMatmulOperator(); op.gemv_forward_cuda(¶ms); PROFILE_END(profile_name); @@ -69,7 +69,7 @@ void Linear_FP16_int4_ref::forward_ref(const Matrix3D &a, Matri params.int32_zero_point = this->zero_point.m_data; params.block_size = QK; - matmul::MatmulOperator op = matmul::MatmulOperator(); + matmul::MatmulOperator &op = matmul::CreateMatmulOperator(); op.naive_mat_mul_fp16_int4((const struct matmul_params *)¶ms); PROFILE_END(profile_name); diff --git a/llm/src/ops/linear.cc b/llm/src/ops/linear.cc index 3be00536..63c2e6c2 100644 --- a/llm/src/ops/linear.cc +++ b/llm/src/ops/linear.cc @@ -62,7 +62,7 @@ void Linear_FP::forward(const Matrix3D &a, Matrix3D &c) { params.opt_params.blk_size = BLK_SIZE; params.opt_params.num_thread = NUM_THREAD; - matmul::MatmulOperator op = matmul::MatmulOperator(); + 
matmul::MatmulOperator &op = matmul::CreateMatmulOperator(); #ifndef QM_CUDA // not support yet if (this->has_bias) { params.bias.row = this->bias.m_dim_y; @@ -109,7 +109,7 @@ void Linear_FP_int4::forward_ref(const Matrix3D &a, Matrix3D &c) { params.zero_point = this->zero_point.m_data; params.block_size = QK; - matmul::MatmulOperator op = matmul::MatmulOperator(); + matmul::MatmulOperator &op = matmul::CreateMatmulOperator(); op.naive_mat_mul_int4((const struct matmul_params *)¶ms); PROFILE_END(profile_name); @@ -147,7 +147,7 @@ void Linear_FP_int4::forward_fast(const Matrix3D &x, Matrix3D &out params.offset = this->offset.m_data; params.block_size = QK; - matmul::MatmulOperator op = matmul::MatmulOperator(); + matmul::MatmulOperator &op = matmul::CreateMatmulOperator(); op.mat_mul_accelerator_int4_fast(¶ms); PROFILE_END(profile_name); @@ -201,7 +201,7 @@ void Linear_FP_int4::forward(const Matrix3D &x, Matrix3D &output) if (this->has_bias) params.bias.data_ptr = this->bias.m_data; - matmul::MatmulOperator op = matmul::MatmulOperator(); + matmul::MatmulOperator &op = matmul::CreateMatmulOperator(); #ifdef USE_INT8_INT4_PRODUCT if (!x_int8) this->initialize_memory(params.block_size); params.A.int8_data_ptr = x_int8;
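Closing note on the call-site change repeated throughout llm/src/ops: operators are now obtained by reference from matmul::CreateMatmulOperator() instead of being value-constructed, which the protected base-class constructor no longer allows. The concrete backend is still selected at compile time by the QM_* macros the Makefile defines (for example, building with USE_MKL=1 adds -DQM_MKL). As an illustration of how an additional backend would slot into this layout, here is a hedged sketch; the SYCL names and the QM_SYCL macro are hypothetical, not part of the patch.

// kernels/sycl/matmul_sycl.h (hypothetical, following the pattern of matmul_avx.h)
#ifndef MATMUL_OPERATOR_SYCL_H
#define MATMUL_OPERATOR_SYCL_H

#include "matmul.h"

namespace matmul {

class MatmulOperatorSYCL : public MatmulOperator {
   public:
    // Override only the kernels this backend provides; everything else falls back
    // to the base class's default (mostly empty) implementations.
    void mat_mul_accelerator_transposed_fastover_column(const struct matmul_params *params) override {
        // backend kernel would go here
    }
};

inline MatmulOperator &CreateMatmulOperatorSYCL() {
    static MatmulOperatorSYCL instance;  // one shared instance, created on first use
    return instance;
}

}  // namespace matmul

#endif

// kernels/matmul_factory.cc would then gain one more dispatch branch, e.g.:
//   #elif defined(QM_SYCL)
//       return CreateMatmulOperatorSYCL();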