From 2b536b7995d0a0d483632fa4b2c37afe9dabc206 Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Mon, 23 Dec 2024 05:50:04 +0100 Subject: [PATCH 1/7] [Snippets][CPU] Added external repacking via BrgemmCopyB --- .../snippets/include/snippets/utils/utils.hpp | 15 +- src/common/snippets/src/utils/utils.cpp | 17 +- .../snippets/cpu_runtime_configurator.cpp | 9 +- .../snippets/cpu_runtime_configurator.hpp | 42 +++- .../emitters/snippets/x64/cpu_generator.cpp | 2 +- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 200 ++++++++++++++---- src/plugins/intel_cpu/src/nodes/subgraph.h | 51 ++++- .../lowered/external_repacking_adjuster.cpp | 61 +++++- 8 files changed, 330 insertions(+), 67 deletions(-) diff --git a/src/common/snippets/include/snippets/utils/utils.hpp b/src/common/snippets/include/snippets/utils/utils.hpp index ff4646f24d03b7..0569a230e91f32 100644 --- a/src/common/snippets/include/snippets/utils/utils.hpp +++ b/src/common/snippets/include/snippets/utils/utils.hpp @@ -290,13 +290,26 @@ std::shared_ptr get_leaf_node_of_first_child_shape_infer_seq(const std std::shared_ptr get_leaf_node_of_first_parent_shape_infer_seq(const std::shared_ptr& start_node); /** - * * @param Get stride of input/output dimension * @param expr_port target port that contains shape and layout info * @param idx index of the target dimension starting from the shape's end (default = 1) */ int64_t get_dim_stride(const lowered::ExpressionPort& expr_port, size_t idx = 1); +/** + * @brief Get stride of input dimension + * @param shape target shape + * @param layout target layout + * @param idx index of the target dimension starting from the shape's end (default = 1) + */ +int64_t get_dim_in_stride(const VectorDims& shape, const VectorDims& layout, size_t idx = 1); +/** + * @brief Get stride of output dimension + * @param shape target shape + * @param layout target layout + * @param idx index of the target dimension starting from the shape's end (default = 1) + */ +int64_t get_dim_out_stride(const VectorDims& shape, const VectorDims& layout, size_t idx = 1); /** * @brief Traverses path starting from "expr", and calls "func" for each expression. 
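Both new helpers resolve a layout index (get_input_dim_idx vs get_output_dim_idx) and then defer to the same get_stride(dim_idx, shape). A minimal sketch of the assumed dense-stride semantics follows; dense_stride is a hypothetical stand-in for snippets' get_stride and is not part of this patch:

#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// Stride of dimension `dim_idx` in a dense row-major shape:
// the product of all dimensions to its right.
int64_t dense_stride(size_t dim_idx, const std::vector<size_t>& shape) {
    return std::accumulate(shape.begin() + dim_idx + 1, shape.end(),
                           int64_t(1), std::multiplies<int64_t>());
}

// For shape {2, 3, 64, 128}: dense_stride(2, shape) == 128 and
// dense_stride(1, shape) == 64 * 128; the layout only changes which
// dim_idx the idx-from-the-end lookup resolves to.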
diff --git a/src/common/snippets/src/utils/utils.cpp b/src/common/snippets/src/utils/utils.cpp index e7381fe6754758..249970b65baa5d 100644 --- a/src/common/snippets/src/utils/utils.cpp +++ b/src/common/snippets/src/utils/utils.cpp @@ -317,14 +317,21 @@ std::shared_ptr get_leaf_node_of_first_parent_shape_infer_seq(const st } int64_t get_dim_stride(const lowered::ExpressionPort& expr_port, size_t idx) { - size_t dim_idx = 0; + const auto& shape = expr_port.get_descriptor_ptr()->get_shape(); const auto& layout = expr_port.get_descriptor_ptr()->get_layout(); switch (expr_port.get_type()) { - case lowered::ExpressionPort::Input: dim_idx = utils::get_input_dim_idx(layout, idx); break; - case lowered::ExpressionPort::Output: dim_idx = utils::get_output_dim_idx(layout, idx); break; - default: OPENVINO_THROW("Unsupported expression port type!"); + case lowered::ExpressionPort::Input: return get_dim_in_stride(shape, layout, idx); + case lowered::ExpressionPort::Output: return get_dim_out_stride(shape, layout, idx); } - return get_stride(dim_idx, expr_port.get_descriptor_ptr()->get_shape()); + OPENVINO_THROW("Unsupported expression port type!"); +} + +int64_t get_dim_in_stride(const VectorDims& shape, const VectorDims& layout, size_t idx) { + return get_stride(utils::get_input_dim_idx(layout, idx), shape); +} + +int64_t get_dim_out_stride(const VectorDims& shape, const VectorDims& layout, size_t idx) { + return get_stride(utils::get_output_dim_idx(layout, idx), shape); } void visit_path(const lowered::ExpressionPtr& expr, diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index 65741d7031d289..3ad41d707bb96b 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -7,7 +7,7 @@ #include "snippets/lowered/loop_manager.hpp" #include "snippets/utils/utils.hpp" -#ifndef OPENVINO_ARCH_ARM64 +#ifdef OPENVINO_ARCH_X86_64 # include "transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.hpp" # include "transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp" #endif @@ -39,12 +39,13 @@ std::string CPURuntimeConfig::to_string() const { } #endif -CPURuntimeConfigurator::CPURuntimeConfigurator() - : ov::snippets::RuntimeConfigurator(std::make_shared()) {} +CPURuntimeConfigurator::CPURuntimeConfigurator(ov::intel_cpu::MultiCacheWeakPtr cache) + : ov::snippets::RuntimeConfigurator(std::make_shared()), + compiled_kernel_cache(std::move(cache)) {} void CPURuntimeConfigurator::initialization(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { RuntimeConfigurator::initialization(linear_ir); -#ifndef OPENVINO_ARCH_ARM64 +#ifdef OPENVINO_ARCH_X86_64 RuntimeOptimizer::register_if_applicable(m_intermediate_optimizers, linear_ir, this); RuntimeOptimizer::register_if_applicable(m_final_optimizers, linear_ir, this); #endif diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp index 1706670ce870d1..a8bab52eb61513 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp @@ -5,6 +5,12 @@ #pragma once #include "emitters/snippets/jit_snippets_call_args.hpp" + +#ifdef OPENVINO_ARCH_X86_64 +# include "emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp" 
+#endif + +#include "cache/multi_cache.h" #include "memory_desc/cpu_blocked_memory_desc.h" #include "snippets/lowered/port_descriptor.hpp" #include "snippets/runtime_configurator.hpp" @@ -21,13 +27,39 @@ class CPURuntimeConfig : public ov::snippets::RuntimeConfig { std::string to_string() const override; #endif +#ifdef OPENVINO_ARCH_X86_64 + struct RepackedInput { + RepackedInput() = default; + RepackedInput(CpuBlockedMemoryDescPtr desc_, + std::shared_ptr executor_, + VectorDims in_offsets_, + VectorDims out_offsets_) + : desc(std::move(desc_)), + executor(std::move(executor_)), + in_offsets(std::move(in_offsets_)), + out_offsets(std::move(out_offsets_)) {} + + CpuBlockedMemoryDescPtr desc{nullptr}; + std::shared_ptr executor{nullptr}; + VectorDims in_offsets{}; + VectorDims out_offsets{}; + }; + std::unordered_map repacked_inputs = {}; + + enum class RepackingImplType { + NONE, // no kernel-outside repacking + IN_PARALLEL, // should be executed in parallel_nt by each thread + SEPARATE, // should be separathy from kernel executed + }; + RepackingImplType repacking_impl_type = RepackingImplType::NONE; +#endif // OPENVINO_ARCH_X86_64 + std::vector loop_args = {}; - std::unordered_map m_in_requested_descs = {}; }; class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { public: - CPURuntimeConfigurator(); + CPURuntimeConfigurator(ov::intel_cpu::MultiCacheWeakPtr cache = {}); /** * @brief Calculate Loop parameters of Loop emitters and update these values in CPURuntimeConfig @@ -35,6 +67,10 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { */ void update_loop_args(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const; + const ov::intel_cpu::MultiCacheWeakPtr& get_cache() const { + return compiled_kernel_cache; + } + protected: void update(const ov::snippets::lowered::LinearIRCPtr& linear_ir) override; void update_tensor_rank(const ov::snippets::VectorDims& master_shape) const override; @@ -42,6 +78,8 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { void initialization(const ov::snippets::lowered::LinearIRCPtr& linear_ir) override; static const size_t rank6D; + + ov::intel_cpu::MultiCacheWeakPtr compiled_kernel_cache; }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp index 39e384837856a1..96da1fa30079a8 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp @@ -165,7 +165,7 @@ class jit_snippet : public dnnl::impl::cpu::x64::jit_generator { intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::intel_cpu::MultiCacheWeakPtr cache) - : TargetMachine(std::make_shared()), + : TargetMachine(std::make_shared(cache)), h(new jit_snippet()), isa(host_isa), compiled_kernel_cache(std::move(cache)) { diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 2b0c7b55fb043d..0f35c017ceded1 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -78,8 +78,15 @@ class SubgraphStaticExecutor : public Subgraph::SubgraphExecutor { const std::vector& start_offset_in, const std::vector& start_offset_out, const std::shared_ptr& snippet_config, - const BufferScratchpadAllocator& allocator) - : SubgraphExecutor(snippet_attrs, snippet, start_offset_in, start_offset_out, 
snippet_config, allocator) {} + const BufferScratchpadAllocator& allocator, + const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache) + : SubgraphExecutor(snippet_attrs, + snippet, + start_offset_in, + start_offset_out, + snippet_config, + allocator, + kernel_cache) {} void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) override { const auto& callable = m_schedule->get_callable(); @@ -87,7 +94,12 @@ class SubgraphStaticExecutor : public Subgraph::SubgraphExecutor { auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) { init_call_args(call_args, inMemPtrs, outMemPtrs, ithr); }; - auto caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes) { + + auto caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes, size_t ithr) { +#ifdef OPENVINO_ARCH_X86_64 + if (should_repacking_be_in_parallel()) + in_parallel_repack_inputs(inMemPtrs, indexes, ithr, call_args); +#endif // OPENVINO_ARCH_X86_64 callable(&call_args, indexes.data()); }; @@ -123,8 +135,15 @@ class SubgraphDynamicSpecializedExecutor : public Subgraph::SubgraphExecutor { const std::vector& start_offset_in, const std::vector& start_offset_out, const std::shared_ptr& snippet_config, - const BufferScratchpadAllocator& allocator) - : SubgraphExecutor(snippet_attrs, snippet, start_offset_in, start_offset_out, snippet_config, allocator) { + const BufferScratchpadAllocator& allocator, + const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache) + : SubgraphExecutor(snippet_attrs, + snippet, + start_offset_in, + start_offset_out, + snippet_config, + allocator, + kernel_cache) { buffer_offsets = snippet_config->buffer_cluster_offsets; data_offsets = snippet_config->io_data_offsets; loop_args = snippet_config->loop_args; @@ -149,8 +168,13 @@ class SubgraphDynamicSpecializedExecutor : public Subgraph::SubgraphExecutor { auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) { init_call_args(call_args, ithr); }; - auto caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes) { + + auto caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes, size_t ithr) { update_ptrs(call_args, src_ptrs, dst_ptrs, indexes); +#ifdef OPENVINO_ARCH_X86_64 + if (should_repacking_be_in_parallel()) + in_parallel_repack_inputs(inMemPtrs, indexes, ithr, call_args); +#endif // OPENVINO_ARCH_X86_64 callable(&call_args); }; @@ -827,7 +851,8 @@ void Subgraph::prepareParams() { start_offset_in, start_offset_out, snippet_config, - allocator); + allocator, + cache); } else { // Static case: // 1. 
Update runtime config to get static scheduling data (io data offsets, parallel domain) which will be @@ -845,7 +870,8 @@ void Subgraph::prepareParams() { start_offset_in, start_offset_out, snippet_config, - allocator); + allocator, + cache); } }; @@ -936,7 +962,8 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr& start_offset_in, const std::vector& start_offset_out, const std::shared_ptr& snippet_config, - const BufferScratchpadAllocator& allocator) + const BufferScratchpadAllocator& allocator, + const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache) : m_schedule(snippet->get()), m_start_offset_in(start_offset_in), m_start_offset_out(start_offset_out) { @@ -954,15 +981,34 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr(m_nthreads) * m_buffer_scratchpad_size; - m_in_requested_descs = snippet_config->m_in_requested_descs; - const auto external_repacking_buffer_size = - std::accumulate(m_in_requested_descs.begin(), - m_in_requested_descs.end(), + +#if defined(OPENVINO_ARCH_X86_64) + m_repacking_impl_type = snippet_config->repacking_impl_type; + m_repacked_inputs = snippet_config->repacked_inputs; + + auto external_buffer_size = + std::accumulate(m_repacked_inputs.begin(), + m_repacked_inputs.end(), size_t(0), - [](size_t sum, const std::pair& requested_desc_elem) { - return sum + requested_desc_elem.second->getCurrentMemSize(); + [](size_t sum, const std::pair& p) { + return sum + p.second.desc->getCurrentMemSize(); }); - m_buffer_scratchpad = allocator(m_internal_buffer_size + external_repacking_buffer_size); + + if (should_repacking_be_in_parallel()) { + // When external repacking is applied in a parallel section, + // each thread should have its own buffer to store repacked data + external_buffer_size *= m_nthreads; + + // To avoid extra runtime overhead on unordered_map creation, + // we initialize `repacked_offsets_by_threads` up front here + for (int i = 0; i < m_nthreads; ++i) + m_repacked_offsets_by_threads[i] = {}; + } + +#else + const auto external_buffer_size = 0lu; +#endif // OPENVINO_ARCH_X86_64 + m_buffer_scratchpad = allocator(m_internal_buffer_size + external_buffer_size); #if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS) const auto target = std::dynamic_pointer_cast( @@ -971,6 +1017,84 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr Subgraph::SubgraphExecutor::separately_repack_inputs(const dnnl::stream& strm, const std::vector& srcMemPtrs) { + auto get_batch_stride = [](const std::vector strides) { + for (size_t i = 2; i < strides.size(); ++i) + if (*(strides.rbegin() + i) != 0) // handle broadcasting pattern + return *(strides.rbegin() + i); + return (*++strides.rbegin()); + }; + + auto reordered_in_ptrs = srcMemPtrs; + size_t offset = m_internal_buffer_size; + for (const auto& p : m_repacked_inputs) { + const auto in_idx = p.first; + const auto& repacked_input = p.second; + const auto& desc = repacked_input.desc; + const void* data_ptr = m_buffer_scratchpad->getDataAs() + offset; + + OPENVINO_ASSERT(in_idx < srcMemPtrs.size(), "Incorrect index of input repacked mem ptr"); + const auto& src_mem = srcMemPtrs[in_idx]; + const auto& dst_mem = std::make_shared(strm.get_engine(), desc, data_ptr, false); + + const auto& shape = dst_mem->getShape().getDims(); + const auto batch = std::accumulate(shape.rbegin() + 2, shape.rend(), 1lu, std::multiplies()); + const auto in_stride = get_batch_stride(repacked_input.in_offsets); + const auto out_stride =
get_batch_stride(repacked_input.out_offsets); + + const auto* src = src_mem->getDataAs(); + auto* dst = dst_mem->getDataAs(); + + const auto& executor = repacked_input.executor; + parallel_for(batch, [&](size_t b0) { + BrgemmCopyBKernel::call_args args; + args.src = src + b0 * in_stride + m_start_offset_in[in_idx]; + args.tr_src = dst + b0 * out_stride; + BrgemmCopyBKernelExecutor::execute(executor.get(), &args); + }); + + reordered_in_ptrs[in_idx] = dst_mem; + offset += desc->getCurrentMemSize(); + } + return reordered_in_ptrs; +} + +void Subgraph::SubgraphExecutor::in_parallel_repack_inputs(const std::vector& inMemPtrs, + const std::vector& indexes, + int ithr, + jit_snippets_call_args& call_args) { + for (const auto& p : m_repacked_inputs) { + const auto& in_idx = p.first; + const auto& repacked_in = p.second; + + const auto& src_offsets = repacked_in.in_offsets; + const auto& dst_offsets = repacked_in.out_offsets; + + size_t src_offset = m_start_offset_in[in_idx], dst_offset = 0; + for (size_t j = 0; j < indexes.size(); j++) { + src_offset += src_offsets[j] * indexes[j]; + dst_offset += dst_offsets[j] * indexes[j]; + } + + uint8_t* repacked_ptr = get_external_scratchpad_ptr(ithr, in_idx) + dst_offset; + + auto& offsets = m_repacked_offsets_by_threads.at(ithr)[in_idx]; + if (offsets.count(src_offset) == 0) { + BrgemmCopyBKernel::call_args args; + args.src = inMemPtrs[in_idx]->getDataAs() + src_offset; + args.tr_src = repacked_ptr; + BrgemmCopyBKernelExecutor::execute(repacked_in.executor.get(), &args); + + offsets.insert(src_offset); + } + + call_args.src_ptrs[in_idx] = repacked_ptr; + } +} +#endif // OPENVINO_ARCH_X86_64 + #if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS) void Subgraph::SubgraphExecutor::segfault_detector() { if (enabled_segfault_detector) { @@ -991,7 +1115,7 @@ void Subgraph::SubgraphExecutor::segfault_detector() { void Subgraph::SubgraphExecutor::parallel_for6d( const std::function& initializer, - const std::function&)>& caller) { + const std::function&, size_t)>& caller) { const auto& dom = m_parallel_exec_domain; #if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS) @@ -1018,7 +1142,7 @@ void Subgraph::SubgraphExecutor::parallel_for6d( indexes[4], dom[4]); for (size_t iwork = start; iwork < end; ++iwork) { - caller(call_args, indexes); + caller(call_args, indexes, ithr); parallel_it_step(indexes[0], dom[0], indexes[1], @@ -1030,12 +1154,16 @@ void Subgraph::SubgraphExecutor::parallel_for6d( indexes[4], dom[4]); } + +#ifdef OPENVINO_ARCH_X86_64 + clean_repacked_offsets(ithr); +#endif // OPENVINO_ARCH_X86_64 }); } void Subgraph::SubgraphExecutor::parallel_forNd( const std::function& initializer, - const std::function&)>& caller) { + const std::function&, size_t)>& caller) { const auto& dom = m_parallel_exec_domain; #if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS) @@ -1057,37 +1185,25 @@ void Subgraph::SubgraphExecutor::parallel_forNd( tmp /= dom[j]; } - caller(call_args, indexes); + caller(call_args, indexes, ithr); } + +#ifdef OPENVINO_ARCH_X86_64 + clean_repacked_offsets(ithr); +#endif // OPENVINO_ARCH_X86_64 }); } void Subgraph::SubgraphExecutor::execute(const dnnl::stream& strm, const std::vector& inMemPtrs, const std::vector& outMemPtrs) { - if (!m_in_requested_descs.empty()) { - auto reorderedInMemPtrs = reorder_inputs(strm, inMemPtrs); - exec_impl(reorderedInMemPtrs, outMemPtrs); - } else { - exec_impl(inMemPtrs, outMemPtrs); - } -} - -std::vector 
Subgraph::SubgraphExecutor::reorder_inputs(const dnnl::stream& strm, - const std::vector& inMemPtrs) { - auto reordered_in_ptrs = inMemPtrs; - size_t offset = m_internal_buffer_size; - for (const auto& requested_descs_elem : m_in_requested_descs) { - const auto in_idx = requested_descs_elem.first; - const auto& requested_desc = requested_descs_elem.second; - - const void* data_ptr = m_buffer_scratchpad->getDataAs() + offset; - const auto scratch_mem = std::make_shared(strm.get_engine(), requested_desc, data_ptr, false); - scratch_mem->load(*reordered_in_ptrs[in_idx]); - reordered_in_ptrs[in_idx] = scratch_mem; - offset += requested_desc->getCurrentMemSize(); +#ifdef OPENVINO_ARCH_X86_64 + if (should_repacking_be_separately()) { + exec_impl(separately_repack_inputs(strm, inMemPtrs), outMemPtrs); + return; } - return reordered_in_ptrs; +#endif // OPENVINO_ARCH_X86_64 + exec_impl(inMemPtrs, outMemPtrs); } } // namespace node diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index aac0fa1ea2f535..74f84b11564989 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -128,7 +128,8 @@ class Subgraph::SubgraphExecutor { const std::vector& start_offset_in, const std::vector& start_offset_out, const std::shared_ptr& snippet_config, - const BufferScratchpadAllocator& allocator); + const BufferScratchpadAllocator& allocator, + const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache); virtual ~SubgraphExecutor() = default; void execute(const dnnl::stream& strm, @@ -139,9 +140,9 @@ class Subgraph::SubgraphExecutor { virtual void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) = 0; void parallel_for6d(const std::function& initializer, - const std::function&)>& caller); + const std::function&, size_t)>& caller); void parallel_forNd(const std::function& initializer, - const std::function&)>& caller); + const std::function&, size_t)>& caller); inline void update_scratchpad_ptr(void*& scratchpad_ptr, size_t ithr) const { if (m_buffer_scratchpad_size > 0) @@ -172,10 +173,48 @@ class Subgraph::SubgraphExecutor { inline void segfault_detector(); #endif -private: - std::vector reorder_inputs(const dnnl::stream& strm, const std::vector& inMemPtrs); +#ifdef OPENVINO_ARCH_X86_64 + std::vector separately_repack_inputs(const dnnl::stream& strm, const std::vector& srcMemPtrs); + void in_parallel_repack_inputs(const std::vector& inMemPtrs, + const std::vector& indexes, + int ithr, + jit_snippets_call_args& call_args); + + inline uint8_t* get_external_scratchpad_ptr(size_t ithr, size_t idx) const { + if (m_repacked_inputs.empty()) + return nullptr; + + uint8_t* data_ptr = m_buffer_scratchpad->getDataAs() + m_internal_buffer_size; + for (const auto& p : m_repacked_inputs) { + const auto& desc = p.second.desc; + const auto size = desc->getCurrentMemSize(); + if (p.first == idx) { + return data_ptr + ithr * size; + } + data_ptr += m_nthreads * size; + } + OPENVINO_THROW("External buffer pointer has not been found"); + } + + // [ Input index -> set of src offsets which are already repacked ] + using RepackedSrcOffsets = std::unordered_map>; + std::unordered_map m_repacked_offsets_by_threads = {}; + std::unordered_map m_repacked_inputs = {}; + + inline bool should_repacking_be_separately() const { + return m_repacking_impl_type == CPURuntimeConfig::RepackingImplType::SEPARATE; + } + inline bool should_repacking_be_in_parallel() const { + return m_repacking_impl_type ==
CPURuntimeConfig::RepackingImplType::IN_PARALLEL; + } + inline void clean_repacked_offsets(size_t ithr) { + if (should_repacking_be_in_parallel()) + m_repacked_offsets_by_threads.at(ithr).clear(); + } - std::unordered_map m_in_requested_descs = {}; +private: + CPURuntimeConfig::RepackingImplType m_repacking_impl_type = CPURuntimeConfig::RepackingImplType::NONE; +#endif // OPENVINO_ARCH_X86_64 }; } // namespace node diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp index 78f9b928298a9d..5b425473bbfa1d 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp @@ -38,9 +38,14 @@ BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster(const ov::snipp bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& linear_ir) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::BrgemmExternalRepackingAdjuster") const auto& cpu_config = ov::as_type_ptr(m_configurator->get_config()); - auto& optimal_descs = cpu_config->m_in_requested_descs; + const float L2_cache_size = dnnl::utils::get_cache_size(2, true); + + bool fit_into_L2 = true; for (const auto& i : m_param_idces_with_external_repacking) { const auto& shape = cpu_config->io_shapes[i]; + if (shape == cpu_config->latest_shapes[i]) + continue; + const auto& K = *++shape.rbegin(); const auto& N = *shape.rbegin(); @@ -50,23 +55,67 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin // Firstly, batch dims are set VectorDims requested_blocked_shape(shape.begin(), shape.end() - brgemm_kernel_rank); // Then, the blocked dims are formed - requested_blocked_shape.insert(requested_blocked_shape.end(), - {snippets::utils::div_up(K, vnni_factor), - std::max(N, brgemm_utils::repacking::compute_inner_n_block(precision)), - vnni_factor}); + const auto new_K = snippets::utils::div_up(K, vnni_factor); + const auto new_N = std::max(N, brgemm_utils::repacking::compute_inner_n_block(precision)); + requested_blocked_shape.insert(requested_blocked_shape.end(), {new_K, new_N, vnni_factor}); VectorDims requested_order(shape.size() - brgemm_kernel_rank); std::iota(requested_order.begin(), requested_order.end(), 0); const auto last_idx = shape.size() - 1; requested_order.insert(requested_order.end(), {last_idx - 1, last_idx, last_idx - 1}); - optimal_descs[i] = + const auto desc = std::make_shared(precision, Shape(shape), requested_blocked_shape, requested_order); + auto config = BrgemmCopyBKernelConfig(precision, + precision, + dnnl::impl::cpu::x64::cpu_isa_t::avx512_core_amx, + false, + false, + brgemm_utils::repacking::compute_inner_n_block(precision)); + const auto executor = std::make_shared( + static_cast(m_configurator)->get_cache(), + config); + config.update(N, + N, + K, + K, + ov::snippets::utils::get_dim_in_stride(shape, cpu_config->io_layouts[i], 1) * precision.size(), + brgemm_utils::repacking::compute_LDB(N, precision)); + executor->update_by_config(config); + + // Save original input offsets for input before repacking. 
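+ // They are stored in RepackedInput::in_offsets and later serve as the source strides for the repacking kernel at execution time.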
+ const auto in_offsets = cpu_config->io_data_offsets[i]; + ov::snippets::VectorDims shape_for_offset(cpu_config->tensor_rank - shape.size(), 1); shape_for_offset.insert(shape_for_offset.end(), requested_blocked_shape.begin(), requested_blocked_shape.end()); m_configurator->compute_offsets(shape_for_offset, i, 0); + // Save new input offsets for input after repacking. + const auto out_offsets = cpu_config->io_data_offsets[i]; + + cpu_config->repacked_inputs[i] = CPURuntimeConfig::RepackedInput(desc, executor, in_offsets, out_offsets); + + const auto src_size = N * K * precision.size(); + const auto dst_size = new_N * new_K * precision.size(); + fit_into_L2 &= ((src_size + dst_size) < L2_cache_size); } + + if (!cpu_config->repacked_inputs.empty()) { + // Heuristic: if the external repacking data doesn't fit into the L2 cache, + // external repacking should be executed in a separate parallel section before kernel execution. + cpu_config->repacking_impl_type = fit_into_L2 ? CPURuntimeConfig::RepackingImplType::IN_PARALLEL + : CPURuntimeConfig::RepackingImplType::SEPARATE; + + // In the parallel case, the kernel should not add offsets to repacked inputs because + // they will be applied during repacking at the execution stage + if (cpu_config->repacking_impl_type == CPURuntimeConfig::RepackingImplType::IN_PARALLEL) { + for (const auto& in : cpu_config->repacked_inputs) { + auto& offsets = cpu_config->io_data_offsets[in.first]; + std::fill(offsets.begin(), offsets.end(), 0); + } + } + } + return true; } From b66565900baa0f52a8fb5ced85770bef41e20455 Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Mon, 23 Dec 2024 15:42:13 +0400 Subject: [PATCH 2/7] [Snippets][CPU] Fixed build on non-x64 platforms --- .../src/emitters/snippets/cpu_runtime_configurator.cpp | 2 +- .../src/emitters/snippets/cpu_runtime_configurator.hpp | 6 +++--- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index 3ad41d707bb96b..43b3ea14cc148a 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -7,7 +7,7 @@ #include "snippets/lowered/loop_manager.hpp" #include "snippets/utils/utils.hpp" -#ifdef OPENVINO_ARCH_X86_64 +#ifndef OPENVINO_ARCH_ARM64 # include "transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.hpp" # include "transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp" #endif diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp index a8bab52eb61513..513ff65fee912b 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp @@ -6,7 +6,7 @@ #include "emitters/snippets/jit_snippets_call_args.hpp" -#ifdef OPENVINO_ARCH_X86_64 +#ifndef OPENVINO_ARCH_ARM64 # include "emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp" #endif @@ -27,7 +27,7 @@ class CPURuntimeConfig : public ov::snippets::RuntimeConfig { std::string to_string() const override; #endif -#ifdef OPENVINO_ARCH_X86_64 +#ifndef OPENVINO_ARCH_ARM64 struct RepackedInput { RepackedInput() = default; RepackedInput(CpuBlockedMemoryDescPtr desc_, @@ -52,7 +52,7 @@ class CPURuntimeConfig : public ov::snippets::RuntimeConfig { SEPARATE, // should be executed separately from the kernel }; RepackingImplType repacking_impl_type = RepackingImplType::NONE; -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_ARM64 std::vector loop_args = {}; }; diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 0f35c017ceded1..1cfc785fb26895 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -982,7 +982,7 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr(m_nthreads) * m_buffer_scratchpad_size; -#if defined(OPENVINO_ARCH_X86_64) +#ifdef OPENVINO_ARCH_X86_64 m_repacking_impl_type = snippet_config->repacking_impl_type; m_repacked_inputs = snippet_config->repacked_inputs; From 72bf13a567343432512bb268ed8e3b2bc93b85cb Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Tue, 24 Dec 2024 08:14:04 +0100 Subject: [PATCH 3/7] [Snippets][CPU] Updated heuristic --- .../x64/pass/lowered/external_repacking_adjuster.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp index 5b425473bbfa1d..1941523adfb834 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp @@ -38,9 +38,8 @@ BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster(const ov::snipp bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& linear_ir) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::BrgemmExternalRepackingAdjuster") const auto& cpu_config = ov::as_type_ptr(m_configurator->get_config()); - const float L2_cache_size = dnnl::utils::get_cache_size(2, true); - bool fit_into_L2 = true; + size_t data_size = 0; for (const auto& i : m_param_idces_with_external_repacking) { const auto& shape = cpu_config->io_shapes[i]; if (shape == cpu_config->latest_shapes[i]) continue; @@ -95,12 +94,13 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin cpu_config->repacked_inputs[i] = CPURuntimeConfig::RepackedInput(desc, executor, in_offsets, out_offsets); - const auto src_size = N * K * precision.size(); - const auto dst_size = new_N * new_K * precision.size(); - fit_into_L2 &= ((src_size + dst_size) < L2_cache_size); + // src data + dst data per kernel call + data_size += N * K * precision.size() + new_N * new_K * vnni_factor * precision.size(); } if (!cpu_config->repacked_inputs.empty()) { + const auto L2_cache_size = dnnl::utils::get_cache_size(2, true); + const auto fit_into_L2 = data_size < L2_cache_size; // Heuristic: if the external repacking data doesn't fit into the L2 cache, // external repacking should be executed in a separate parallel section before kernel execution. cpu_config->repacking_impl_type = fit_into_L2 ?
CPURuntimeConfig::RepackingImplType::IN_PARALLEL From 4b33eaa2f80487f8b50339c11fc74c2c41ddf782 Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Wed, 25 Dec 2024 07:43:42 +0100 Subject: [PATCH 4/7] [Snippets][CPU] Added inplace-Transpose support --- .../snippets/include/snippets/op/reshape.hpp | 21 ++++++++++ .../shape_inference/shape_infer_instances.hpp | 8 ++++ .../include/snippets/snippets_isa_tbl.hpp | 1 + src/common/snippets/src/generator.cpp | 1 + src/common/snippets/src/op/reshape.cpp | 41 +++++++++++++++++++ src/common/snippets/src/op/subgraph.cpp | 1 + .../snippets/src/runtime_configurator.cpp | 19 ++++++++- .../shape_inference/shape_infer_instances.cpp | 11 +++++ .../src/shape_inference/shape_inference.cpp | 1 + .../emitters/snippets/x64/cpu_generator.cpp | 1 + src/plugins/intel_cpu/src/nodes/subgraph.cpp | 37 ++++++++--------- .../x64/pass/eliminate_brgemm_copy_b.cpp | 24 +++++++++-- .../adjust_brgemm_copy_b_loop_ports.cpp | 6 ++- .../lowered/external_repacking_adjuster.cpp | 32 +++++++-------- 14 files changed, 159 insertions(+), 45 deletions(-) diff --git a/src/common/snippets/include/snippets/op/reshape.hpp b/src/common/snippets/include/snippets/op/reshape.hpp index b4e0c9233c73f0..d80a02ebc33c9a 100644 --- a/src/common/snippets/include/snippets/op/reshape.hpp +++ b/src/common/snippets/include/snippets/op/reshape.hpp @@ -32,6 +32,27 @@ class Reshape : public ov::op::Op { ov::PartialShape m_target_shape = {}; }; +/** + * @interface ReshapeWithOrder + * @brief ReshapeWithOrder reshapes the input tensor shape by the required target order. + * The tensor data is not updated. + * Note: Order is stored in input PortDescriptor + * @ingroup snippets + */ +class ReshapeWithOrder : public ov::op::Op { +public: + OPENVINO_OP("ReshapeWithOrder", "SnippetsOpset"); + ReshapeWithOrder() = default; + ReshapeWithOrder(const Output& x, std::vector order); + + bool visit_attributes(AttributeVisitor& visitor) override; + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; + +private: + void custom_constructor_validate_and_infer_types(std::vector order); +}; + } // namespace op } // namespace snippets } // namespace ov diff --git a/src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp b/src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp index 1b91ea573ab1c4..c062fed338638d 100644 --- a/src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp +++ b/src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp @@ -82,5 +82,13 @@ class ReshapeShapeInfer : public IShapeInferSnippets { explicit ReshapeShapeInfer(const std::shared_ptr& n); Result infer(const std::vector& input_shapes) override; }; + +class ReshapeWithOrderShapeInfer : public IShapeInferSnippets { std::vector m_target_order {}; +public: + explicit ReshapeWithOrderShapeInfer(const std::shared_ptr& n); + Result infer(const std::vector& input_shapes) override; +}; + } // namespace snippets } // namespace ov diff --git a/src/common/snippets/include/snippets/snippets_isa_tbl.hpp b/src/common/snippets/include/snippets/snippets_isa_tbl.hpp index 9b207b09fe411f..5c5e0f3701ad42 100644 --- a/src/common/snippets/include/snippets/snippets_isa_tbl.hpp +++ b/src/common/snippets/include/snippets/snippets_isa_tbl.hpp @@ -17,6 +17,7 @@ OV_OP(LoopEnd, ov::snippets::op) OV_OP(Brgemm, ov::snippets::op) OV_OP(BroadcastLoad, ov::snippets::op) OV_OP(Reshape, ov::snippets::op) +OV_OP(ReshapeWithOrder, ov::snippets::op) OV_OP(Store, ov::snippets::op) diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp index d059ddd94d5724..7869b4427d579d 100644 --- a/src/common/snippets/src/generator.cpp +++ b/src/common/snippets/src/generator.cpp @@ -77,6 +77,7 @@ RegType Generator::get_op_out_reg_type(const ov::Output& out) const { std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) #ifdef SNIPPETS_DEBUG_CAPS || std::dynamic_pointer_cast(op) diff --git a/src/common/snippets/src/op/reshape.cpp b/src/common/snippets/src/op/reshape.cpp index 72823d2815cdbf..ae7887e558b5f2 100644 --- a/src/common/snippets/src/op/reshape.cpp +++ b/src/common/snippets/src/op/reshape.cpp @@ -11,6 +11,7 @@ namespace ov { namespace snippets { namespace op { + Reshape::Reshape(const Output& arg, ov::PartialShape target_shape) : Op({arg}), m_target_shape(std::move(target_shape)) { constructor_validate_and_infer_types(); @@ -38,6 +39,46 @@ const ov::PartialShape& Reshape::get_target_shape() const { void Reshape::set_target_shape(ov::PartialShape shape) { m_target_shape = std::move(shape); } + +ReshapeWithOrder::ReshapeWithOrder(const Output& arg, std::vector order) + : Op({arg}) { + custom_constructor_validate_and_infer_types(std::move(order)); +} + +void ReshapeWithOrder::custom_constructor_validate_and_infer_types(std::vector order) { + INTERNAL_OP_SCOPE(ReshapeWithOrder_constructor_validate_and_infer_types); + + const auto& input_pshape = get_input_partial_shape(0); + OPENVINO_ASSERT(input_pshape.rank().is_static() && input_pshape.size() == order.size(), + "Incompatible shape and order sizes"); + + // During ctor call, ReshapeWithOrder doesn't know its port descriptors.
+ // So we use explicit layouts from parameters + set_output_type(0, get_input_element_type(0), ov::snippets::utils::get_planar_pshape(input_pshape, order)); +} + +void ReshapeWithOrder::validate_and_infer_types() { + const auto& input_pshape = get_input_partial_shape(0); + const auto order = lowered::PortDescriptorUtils::get_port_descriptor_ptr(input(0))->get_layout(); + OPENVINO_ASSERT(input_pshape.rank().is_static() && input_pshape.size() == order.size(), + "Incompatible shape and order sizes"); + const auto output_pshape = utils::get_planar_pshape(get_input_partial_shape(0), order); + set_output_type(0, get_input_element_type(0), output_pshape); +} + +std::shared_ptr ReshapeWithOrder::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(ReshapeWithOrder); + check_new_args_count(this, new_args); + const auto& order = lowered::PortDescriptorUtils::get_port_descriptor_ptr(input(0))->get_layout(); + return std::make_shared(new_args.at(0), order); +} + +bool ReshapeWithOrder::visit_attributes(AttributeVisitor& visitor) { + auto order = lowered::PortDescriptorUtils::get_port_descriptor_ptr(input(0))->get_layout(); + visitor.on_attribute("target_order", order); + return true; +} + }// namespace op }// namespace snippets }// namespace ov \ No newline at end of file diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index 98e3392a65e1e2..25934829b80e00 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -96,6 +96,7 @@ auto Subgraph::is_domain_sensitive_op(const std::shared_ptr& op) -> bo auto Subgraph::is_shape_infer_op(const std::shared_ptr& op) -> bool { return ov::is_type(op) || + ov::is_type(op) || ov::is_type(op); } diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index 06beb8db94ae3d..4ddb4c19ea5a32 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -118,7 +118,23 @@ void RuntimeConfigurator::init_data_info(const lowered::LinearIRCPtr& linear_ir) // input->shape changing ops->load PortDescriptorPtr desc = nullptr; const auto& shape_infer_seq = utils::get_first_child_shape_infer_expr_seq(param); - const auto& mem_desc_expr = shape_infer_seq.empty() ? 
param : shape_infer_seq.back(); + ExpressionPtr mem_desc_expr = param; + if (!shape_infer_seq.empty()) { + // If there is a ReshapeWithOrder, we should take its desc because it affects the shape via the target order + const auto& reordered_reshape_it = std::find_if(shape_infer_seq.cbegin(), shape_infer_seq.cend(), + [](const ExpressionPtr& expr) { + return ov::is_type(expr->get_node()); + }); + if (reordered_reshape_it != shape_infer_seq.cend()) { + const auto& reshape = *reordered_reshape_it; + const auto& etype = reshape->get_node()->get_output_element_type(0); + update_io_parameters(reshape->get_input_port_descriptor(0), etype); + continue; + } + + mem_desc_expr = shape_infer_seq.back(); + } + auto consumer_inputs = mem_desc_expr->get_output_port_connector(0)->get_consumers(); for (const auto& child_input : consumer_inputs) { const auto ma = std::dynamic_pointer_cast(child_input.get_expr()->get_node()); @@ -127,6 +143,7 @@ void RuntimeConfigurator::init_data_info(const lowered::LinearIRCPtr& linear_ir) break; } } + OPENVINO_ASSERT(desc, "Descriptor is missing!"); const auto& etype = mem_desc_expr->get_node()->get_output_element_type(0); update_io_parameters(desc, etype); } diff --git a/src/common/snippets/src/shape_inference/shape_infer_instances.cpp b/src/common/snippets/src/shape_inference/shape_infer_instances.cpp index a3e3d9652c0ac8..417996ae2a5f31 100644 --- a/src/common/snippets/src/shape_inference/shape_infer_instances.cpp +++ b/src/common/snippets/src/shape_inference/shape_infer_instances.cpp @@ -245,5 +245,16 @@ Result ReshapeShapeInfer::infer(const std::vector& input_shapes) return {{target_shape}, ShapeInferStatus::success}; } +ReshapeWithOrderShapeInfer::ReshapeWithOrderShapeInfer(const std::shared_ptr& n) { + const auto& reshape = as_type_ptr(n); + OPENVINO_ASSERT(reshape, "Invalid node passed to ReshapeWithOrderShapeInfer."); + m_target_order = lowered::PortDescriptorUtils::get_port_descriptor_ptr(reshape->input(0))->get_layout(); +} + +Result ReshapeWithOrderShapeInfer::infer(const std::vector& input_shapes) { + OPENVINO_ASSERT(input_shapes.size() == 1, "Invalid number of shapes passed to ReshapeWithOrderShapeInfer"); + return {{ov::snippets::utils::get_planar_vdims(input_shapes[0].get(), m_target_order)}, ShapeInferStatus::success}; +} + } // namespace snippets } // namespace ov diff --git a/src/common/snippets/src/shape_inference/shape_inference.cpp b/src/common/snippets/src/shape_inference/shape_inference.cpp index 76a4c491c66983..017567ea86bd55 100644 --- a/src/common/snippets/src/shape_inference/shape_inference.cpp +++ b/src/common/snippets/src/shape_inference/shape_inference.cpp @@ -58,6 +58,7 @@ const IShapeInferSnippetsFactory::TRegistry IShapeInferSnippetsFactory::registry SHAPE_INFER_PREDEFINED(op::KernelDynamic, EmptyShapeInfer), SHAPE_INFER_PREDEFINED(op::Nop, EmptyShapeInfer), SHAPE_INFER_OP_SPECIFIC_EXTERNAL(op::Reshape, ReshapeShapeInfer), + SHAPE_INFER_OP_SPECIFIC_EXTERNAL(op::ReshapeWithOrder, ReshapeWithOrderShapeInfer), SHAPE_INFER_OP_SPECIFIC_EXTERNAL(opset1::Select, SelectShapeInfer), SHAPE_INFER_OP_SPECIFIC_EXTERNAL(op::Brgemm, BrgemmShapeInfer), SHAPE_INFER_OP_SPECIFIC_EXTERNAL(op::ReduceMax, ReduceShapeInfer), diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp index 96da1fa30079a8..014cd65426e083 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp @@ -177,6
+177,7 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho jitters[snippets::op::RankNormalization::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::Reshape::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); + jitters[snippets::op::ReshapeWithOrder::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::Load::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); jitters[snippets::op::LoadReshape::get_type_info_static()] = diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 1cfc785fb26895..54ff91c3204a2f 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -932,17 +932,16 @@ void Subgraph::executeDynamicImpl(dnnl::stream strm) { } namespace { -inline void init_parallel_domain(const std::shared_ptr& snippet_config, std::vector& domain) { - const auto& master_shape = snippet_config->master_shape; - const auto& tensor_rank = snippet_config->tensor_rank; - const auto& tile_rank = snippet_config->tile_rank; +inline void init_parallel_domain(const std::vector& master_shape, size_t tensor_rank, size_t tile_rank, std::vector& domain) { domain.resize(tensor_rank, 1); - std::fill(domain.begin(), domain.end(), 1); std::copy(master_shape.cbegin(), master_shape.cbegin() + (master_shape.size() - tile_rank), domain.begin() + (tensor_rank - master_shape.size())); } +inline void init_parallel_domain(const std::shared_ptr& snippet_config, std::vector& domain) { + init_parallel_domain(snippet_config->master_shape, snippet_config->tensor_rank, snippet_config->tile_rank, domain); +} } // namespace Subgraph::SubgraphCodeGenerator::SubgraphCodeGenerator(const std::shared_ptr& snippet_attrs, @@ -1020,13 +1019,6 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr Subgraph::SubgraphExecutor::separately_repack_inputs(const dnnl::stream& strm, const std::vector& srcMemPtrs) { - auto get_batch_stride = [](const std::vector strides) { - for (size_t i = 2; i < strides.size(); ++i) - if (*(strides.rbegin() + i) != 0) // handle broadcasting pattern - return *(strides.rbegin() + i); - return (*++strides.rbegin()); - }; - auto reordered_in_ptrs = srcMemPtrs; size_t offset = m_internal_buffer_size; for (const auto& p : m_repacked_inputs) { @@ -1039,19 +1031,24 @@ std::vector Subgraph::SubgraphExecutor::separately_repack_inputs(cons const auto& src_mem = srcMemPtrs[in_idx]; const auto& dst_mem = std::make_shared(strm.get_engine(), desc, data_ptr, false); - const auto& shape = dst_mem->getShape().getDims(); - const auto batch = std::accumulate(shape.rbegin() + 2, shape.rend(), 1lu, std::multiplies()); - const auto in_stride = get_batch_stride(repacked_input.in_offsets); - const auto out_stride = get_batch_stride(repacked_input.out_offsets); - const auto* src = src_mem->getDataAs(); auto* dst = dst_mem->getDataAs(); + VectorDims dom; + const auto& shape = dst_mem->getShape().getDims(); + OPENVINO_ASSERT(shape.size() <= rank6D, "Unsupported shape rank of repacking data"); + init_parallel_domain(shape, rank6D, 2lu, dom); + + const auto in_strides = repacked_input.in_offsets; + const auto out_strides = repacked_input.out_offsets; + OPENVINO_ASSERT(in_strides.size() == rank6D && out_strides.size() == rank6D && dom.size() == rank6D, + "Unsupported shape rank of repacking data"); + const auto& executor = 
repacked_input.executor; - parallel_for(batch, [&](size_t b0) { + parallel_for4d(dom[0], dom[1], dom[2], dom[3], [&](size_t d0, size_t d1, size_t d2, size_t d3) { BrgemmCopyBKernel::call_args args; - args.src = src + b0 * in_stride + m_start_offset_in[in_idx]; - args.tr_src = dst + b0 * out_stride; + args.src = src + d0 * in_strides[0] + d1 * in_strides[1] + d2 * in_strides[2] + d3 * in_strides[3] + m_start_offset_in[in_idx]; + args.tr_src = dst + d0 * out_strides[0] + d1 * out_strides[1] + d2 * out_strides[2] + d3 * out_strides[3]; BrgemmCopyBKernelExecutor::execute(executor.get(), &args); }); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp index 939ae93ad92b18..02abb74cb7ad2f 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp @@ -10,6 +10,7 @@ #include "openvino/pass/pattern/op/wrap_type.hpp" #include "snippets/itt.hpp" #include "snippets/op/rank_normalization.hpp" +#include "snippets/op/reshape.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp" namespace ov { @@ -30,12 +31,27 @@ pass::EliminateBrgemmCopyB::EliminateBrgemmCopyB() { const auto& in_desc = snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(copy_b_node->input(0)); const auto& layout = in_desc->get_layout(); - // TODO: - // 1. Ticket 157340: support external repacking for copyB with compensations - // 2. Ticket 157339: support external repacking for non-planar layout - if (!ov::snippets::utils::is_planar_layout(layout) || + + auto is_supported_layout = [](const std::vector& layout) { + return layout.empty() || (layout.size() - 1 == layout.back()); + }; + + // TODO [157340]: support external repacking for copyB with compensations + if (!is_supported_layout(layout) || brgemm_utils::with_compensations(copy_b_node->get_type()) || transformation_callback(copy_b_node)) return false; + + // If there is non-empty and non-planar layout, we should insert reshape to support shape inference + if (!layout.empty() && !ov::snippets::utils::is_planar_layout(layout)) { + const auto& subtensor = in_desc->get_subtensor(); + const auto& reshape = std::make_shared(copy_b_node->input_value(0), layout); + ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor(reshape->input(0), subtensor, layout); + ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor(reshape->output(0), subtensor); + ov::replace_node(copy_b_node, reshape); + return true; + } + + // If there is no layout, we can just remove BrgemmCopyB from the subgraph return ov::replace_output_update_name(copy_b_out, copy_b_node->input_value(0)); }; diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp index 1cb8263d189d18..5661f04d496cd2 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp @@ -70,8 +70,10 @@ bool pass::AdjustBrgemmCopyBLoopPorts::run(const snippets::lowered::LinearIR& li auto get_repacking_loop_idces = [](const snippets::lowered::ExpressionPtr& brgemm_expr) { // Repacking may be extracted outside the snippets 
kernel. In this case, brgemm parent expression is a // parameter. - if (is_type( - brgemm_expr->get_input_port_connector(1)->get_source().get_expr()->get_node())) + const auto& brgemm_in1 = brgemm_expr->get_input_port_connector(1)->get_source(); + const auto& shape_infer_seq = ov::snippets::utils::get_first_parent_shape_infer_expr_seq(brgemm_in1.get_expr()); + const auto source = shape_infer_seq.empty() ? brgemm_in1 : shape_infer_seq.back()->get_input_port_connector(0)->get_source(); + if (is_type(source.get_expr()->get_node())) return std::vector{}; const auto repacking_expr = brgemm_utils::repacking::get_copy_b_expr(brgemm_expr); OPENVINO_ASSERT(repacking_expr, "BrgemmCopyB expression is not found"); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp index 1941523adfb834..950f1dbaa04603 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp @@ -20,18 +20,16 @@ BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster(const ov::snipp const auto& params = linear_ir->get_parameters(); for (size_t i = 0; i < params.size(); ++i) { const auto& param = params[i]; - const auto consumers = param->get_output_port_connector(0)->get_consumers(); + const auto& shape_infer_consumers = ov::snippets::utils::get_first_child_shape_infer_expr_seq(param); + const auto& out = shape_infer_consumers.empty() ? param->get_output_port(0) : shape_infer_consumers.back()->get_output_port(0); + const auto consumers = out.get_connected_ports(); const bool brgemm_with_extracted_repacking = std::any_of(consumers.begin(), consumers.end(), [](const ov::snippets::lowered::ExpressionPort& port) { auto brgemm = ov::as_type_ptr(port.get_expr()->get_node()); return brgemm && brgemm_utils::with_repacking(brgemm->get_type()) && port.get_index() == 1; }); - if (brgemm_with_extracted_repacking) { + if (brgemm_with_extracted_repacking) m_param_idces_with_external_repacking.insert(i); - // Ticket 157339: Support non-planar layout - OPENVINO_ASSERT(ov::snippets::utils::is_planar_layout(configurator->get_io_descs()[i]->get_layout()), - "Non-planar layout is not supported for external repacking"); - } } } @@ -45,26 +43,28 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin if (shape == cpu_config->latest_shapes[i]) continue; - const auto& K = *++shape.rbegin(); - const auto& N = *shape.rbegin(); + const auto& layout = cpu_config->io_layouts[i]; + const auto planar_shape = ov::snippets::utils::get_planar_vdims(shape, layout); + const auto& K = *++planar_shape.rbegin(); + const auto& N = *planar_shape.rbegin(); const auto& precision = linear_ir.get_parameters()[i]->get_node()->get_output_element_type(0); const auto vnni_factor = brgemm_utils::compute_vnni_factor(precision); const size_t brgemm_kernel_rank = 2; // Firstly, batch dims are set - VectorDims requested_blocked_shape(shape.begin(), shape.end() - brgemm_kernel_rank); + VectorDims requested_blocked_shape(planar_shape.begin(), planar_shape.end() - brgemm_kernel_rank); // Then, the blocked dims are formed const auto new_K = snippets::utils::div_up(K, vnni_factor); const auto new_N = std::max(N, brgemm_utils::repacking::compute_inner_n_block(precision)); requested_blocked_shape.insert(requested_blocked_shape.end(), {new_K, 
new_N, vnni_factor}); - VectorDims requested_order(shape.size() - brgemm_kernel_rank); + VectorDims requested_order(planar_shape.size() - brgemm_kernel_rank); std::iota(requested_order.begin(), requested_order.end(), 0); - const auto last_idx = shape.size() - 1; + const auto last_idx = planar_shape.size() - 1; requested_order.insert(requested_order.end(), {last_idx - 1, last_idx, last_idx - 1}); const auto desc = - std::make_shared(precision, Shape(shape), requested_blocked_shape, requested_order); + std::make_shared(precision, Shape(planar_shape), requested_blocked_shape, requested_order); auto config = BrgemmCopyBKernelConfig(precision, precision, @@ -75,12 +75,8 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin const auto executor = std::make_shared( static_cast(m_configurator)->get_cache(), config); - config.update(N, - N, - K, - K, - ov::snippets::utils::get_dim_in_stride(shape, cpu_config->io_layouts[i], 1) * precision.size(), - brgemm_utils::repacking::compute_LDB(N, precision)); + const auto copy_wei_stride = ov::snippets::utils::get_dim_in_stride(shape, cpu_config->io_layouts[i], 1) * precision.size(); + config.update(N, N, K, K, copy_wei_stride, brgemm_utils::repacking::compute_LDB(N, precision)); executor->update_by_config(config); // Save original input offsets for input before repacking. From f1c7435486acccf9949b385cbf394a33b0693d44 Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Thu, 26 Dec 2024 09:05:45 +0100 Subject: [PATCH 5/7] [Snippets][CPU] Applied Ivan comments --- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 13 ++++++++----- src/plugins/intel_cpu/src/nodes/subgraph.h | 7 +++---- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 54ff91c3204a2f..367bca6210ed9f 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -1000,8 +1000,9 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr& indexes, int ithr, jit_snippets_call_args& call_args) { + size_t repacked_offset_idx = 0; for (const auto& p : m_repacked_inputs) { const auto& in_idx = p.first; const auto& repacked_in = p.second; @@ -1077,17 +1079,18 @@ void Subgraph::SubgraphExecutor::in_parallel_repack_inputs(const std::vectorgetDataAs() + src_offset; args.tr_src = repacked_ptr; BrgemmCopyBKernelExecutor::execute(repacked_in.executor.get(), &args); - offsets.insert(src_offset); + last_processed_src_offset = src_offset; } call_args.src_ptrs[in_idx] = repacked_ptr; + ++repacked_offset_idx; } } #endif // OPENVINO_ARCH_X86_64 diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index 74f84b11564989..ddbe0c51ee1f16 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -196,9 +196,8 @@ class Subgraph::SubgraphExecutor { OPENVINO_THROW("External buffer pointer has not been found"); } - // [ Input index -> set of src offsets which are already repacked ] - using RepackedSrcOffsets = std::unordered_map>; - std::unordered_map m_repacked_offsets_by_threads = {}; + // [ Thread Index -> Index of input with repacking data -> last repacked src_offset ] + std::vector> m_repacked_offsets_by_threads = {}; std::unordered_map m_repacked_inputs = {}; inline bool should_repacking_be_separately() const { @@ -209,7 +208,7 @@ class Subgraph::SubgraphExecutor { } inline void clean_repacked_offsets(size_t ithr) { if
(should_repacking_be_in_parallel()) - m_repacked_offsets_by_threads.at(ithr).clear(); + m_repacked_offsets_by_threads[ithr].assign(m_repacked_inputs.size(), std::numeric_limits::max()); } private: From b6ffdaf92054691468e2fd7b52ae1dca66548f1c Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Thu, 26 Dec 2024 09:12:25 +0100 Subject: [PATCH 6/7] [Snippets][CPU] Fixed code style --- .../src/emitters/snippets/x64/cpu_generator.cpp | 3 ++- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 9 ++++++--- .../snippets/x64/pass/eliminate_brgemm_copy_b.cpp | 7 ++++--- .../pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp | 3 ++- .../x64/pass/lowered/external_repacking_adjuster.cpp | 12 ++++++++---- 5 files changed, 22 insertions(+), 12 deletions(-) diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp index 014cd65426e083..7835f17adb97be 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp @@ -177,7 +177,8 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho jitters[snippets::op::RankNormalization::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::Reshape::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); - jitters[snippets::op::ReshapeWithOrder::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); + jitters[snippets::op::ReshapeWithOrder::get_type_info_static()] = + CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::Load::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); jitters[snippets::op::LoadReshape::get_type_info_static()] = diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 367bca6210ed9f..0d84d707318248 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -932,7 +932,10 @@ void Subgraph::executeDynamicImpl(dnnl::stream strm) { } namespace { -inline void init_parallel_domain(const std::vector& master_shape, size_t tensor_rank, size_t tile_rank, std::vector& domain) { +inline void init_parallel_domain(const std::vector& master_shape, + size_t tensor_rank, + size_t tile_rank, + std::vector& domain) { domain.resize(tensor_rank, 1); std::fill(domain.begin(), domain.end(), 1); std::copy(master_shape.cbegin(), @@ -1032,7 +1035,7 @@ std::vector Subgraph::SubgraphExecutor::separately_repack_inputs(cons const auto& src_mem = srcMemPtrs[in_idx]; const auto& dst_mem = std::make_shared(strm.get_engine(), desc, data_ptr, false); - const auto* src = src_mem->getDataAs(); + const auto* src = src_mem->getDataAs() + m_start_offset_in[in_idx]; auto* dst = dst_mem->getDataAs(); VectorDims dom; @@ -1048,7 +1051,7 @@ std::vector Subgraph::SubgraphExecutor::separately_repack_inputs(cons const auto& executor = repacked_input.executor; parallel_for4d(dom[0], dom[1], dom[2], dom[3], [&](size_t d0, size_t d1, size_t d2, size_t d3) { BrgemmCopyBKernel::call_args args; - args.src = src + d0 * in_strides[0] + d1 * in_strides[1] + d2 * in_strides[2] + d3 * in_strides[3] + m_start_offset_in[in_idx]; + args.src = src + d0 * in_strides[0] + d1 * in_strides[1] + d2 * in_strides[2] + d3 * in_strides[3]; args.tr_src = dst + d0 * out_strides[0] + d1 * out_strides[1] + d2 * out_strides[2] + d3 * out_strides[3]; 
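            // One kernel call per batch point (d0..d3): in_strides/out_strides hold byte offsets,
            // so args.src/args.tr_src address the 2D weights tile repacked by this call.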
BrgemmCopyBKernelExecutor::execute(executor.get(), &args); }); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp index 02abb74cb7ad2f..6176e99ebc3a9a 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp @@ -37,14 +37,15 @@ pass::EliminateBrgemmCopyB::EliminateBrgemmCopyB() { }; // TODO [157340]: support external repacking for copyB with compensations - if (!is_supported_layout(layout) || - brgemm_utils::with_compensations(copy_b_node->get_type()) || transformation_callback(copy_b_node)) + if (!is_supported_layout(layout) || brgemm_utils::with_compensations(copy_b_node->get_type()) || + transformation_callback(copy_b_node)) return false; // If there is non-empty and non-planar layout, we should insert reshape to support shape inference if (!layout.empty() && !ov::snippets::utils::is_planar_layout(layout)) { const auto& subtensor = in_desc->get_subtensor(); - const auto& reshape = std::make_shared(copy_b_node->input_value(0), layout); + const auto& reshape = + std::make_shared(copy_b_node->input_value(0), layout); ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor(reshape->input(0), subtensor, layout); ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor(reshape->output(0), subtensor); ov::replace_node(copy_b_node, reshape); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp index 5661f04d496cd2..16df97bb209ed9 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp @@ -72,7 +72,8 @@ bool pass::AdjustBrgemmCopyBLoopPorts::run(const snippets::lowered::LinearIR& li // parameter. const auto& brgemm_in1 = brgemm_expr->get_input_port_connector(1)->get_source(); const auto& shape_infer_seq = ov::snippets::utils::get_first_parent_shape_infer_expr_seq(brgemm_in1.get_expr()); - const auto source = shape_infer_seq.empty() ? brgemm_in1 : shape_infer_seq.back()->get_input_port_connector(0)->get_source(); + const auto source = + shape_infer_seq.empty() ? brgemm_in1 : shape_infer_seq.back()->get_input_port_connector(0)->get_source(); if (is_type(source.get_expr()->get_node())) return std::vector{}; const auto repacking_expr = brgemm_utils::repacking::get_copy_b_expr(brgemm_expr); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp index 950f1dbaa04603..430e6b655a55d5 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp @@ -21,7 +21,8 @@ BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster(const ov::snipp for (size_t i = 0; i < params.size(); ++i) { const auto& param = params[i]; const auto& shape_infer_consumers = ov::snippets::utils::get_first_child_shape_infer_expr_seq(param); - const auto& out = shape_infer_consumers.empty() ? 
param->get_output_port(0) : shape_infer_consumers.back()->get_output_port(0); + const auto& out = shape_infer_consumers.empty() ? param->get_output_port(0) + : shape_infer_consumers.back()->get_output_port(0); const auto consumers = out.get_connected_ports(); const bool brgemm_with_extracted_repacking = std::any_of(consumers.begin(), consumers.end(), [](const ov::snippets::lowered::ExpressionPort& port) { @@ -63,8 +64,10 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin const auto last_idx = planar_shape.size() - 1; requested_order.insert(requested_order.end(), {last_idx - 1, last_idx, last_idx - 1}); - const auto desc = - std::make_shared(precision, Shape(planar_shape), requested_blocked_shape, requested_order); + const auto desc = std::make_shared(precision, + Shape(planar_shape), + requested_blocked_shape, + requested_order); auto config = BrgemmCopyBKernelConfig(precision, precision, @@ -75,7 +78,8 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin const auto executor = std::make_shared( static_cast(m_configurator)->get_cache(), config); - const auto copy_wei_stride = ov::snippets::utils::get_dim_in_stride(shape, cpu_config->io_layouts[i], 1) * precision.size(); + const auto copy_wei_stride = + ov::snippets::utils::get_dim_in_stride(shape, cpu_config->io_layouts[i], 1) * precision.size(); config.update(N, N, K, K, copy_wei_stride, brgemm_utils::repacking::compute_LDB(N, precision)); executor->update_by_config(config); From cced16d147c8de0fadaae82beec2427a47c66077 Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Thu, 26 Dec 2024 16:28:45 +0400 Subject: [PATCH 7/7] [Snippets][CPU] Fixed prim isa --- .../snippets/cpu_runtime_configurator.cpp | 32 +++++++++++++ .../snippets/cpu_runtime_configurator.hpp | 28 ++++++------ src/plugins/intel_cpu/src/nodes/subgraph.cpp | 18 ++++---- src/plugins/intel_cpu/src/nodes/subgraph.h | 2 +- .../lowered/external_repacking_adjuster.cpp | 45 ++++++++++--------- .../lowered/external_repacking_adjuster.hpp | 4 +- 6 files changed, 84 insertions(+), 45 deletions(-) diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index 43b3ea14cc148a..0971e9e69a661f 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -39,6 +39,38 @@ std::string CPURuntimeConfig::to_string() const { } #endif +#ifndef OPENVINO_ARCH_ARM64 + +CPURuntimeConfig::RepackedInput::RepackedInput(std::shared_ptr kernel, + CpuBlockedMemoryDescPtr desc, + VectorDims in_offsets, + VectorDims out_offsets) + : m_kernel(std::move(kernel)), + m_desc(std::move(desc)), + m_in_offsets(std::move(in_offsets)), + m_out_offsets(std::move(out_offsets)) { + OPENVINO_ASSERT(m_in_offsets.size() == m_out_offsets.size(), "Incorrect size of offsets"); + OPENVINO_ASSERT(m_desc, "Descriptor is empty"); +} + +const CpuBlockedMemoryDescPtr& CPURuntimeConfig::RepackedInput::desc() const { + return m_desc; +} + +const std::shared_ptr& CPURuntimeConfig::RepackedInput::kernel() const { + return m_kernel; +} + +const VectorDims& CPURuntimeConfig::RepackedInput::in_offsets() const { + return m_in_offsets; +} + +const VectorDims& CPURuntimeConfig::RepackedInput::out_offsets() const { + return m_out_offsets; +} + +#endif // OPENVINO_ARCH_ARM64 + CPURuntimeConfigurator::CPURuntimeConfigurator(ov::intel_cpu::MultiCacheWeakPtr cache) : 
ov::snippets::RuntimeConfigurator(std::make_shared()), compiled_kernel_cache(std::move(cache)) {} diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp index 513ff65fee912b..abec42bbbe0abb 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp @@ -30,19 +30,21 @@ class CPURuntimeConfig : public ov::snippets::RuntimeConfig { #ifndef OPENVINO_ARCH_ARM64 struct RepackedInput { RepackedInput() = default; - RepackedInput(CpuBlockedMemoryDescPtr desc_, - std::shared_ptr executor_, - VectorDims in_offsets_, - VectorDims out_offsets_) - : desc(std::move(desc_)), - executor(std::move(executor_)), - in_offsets(std::move(in_offsets_)), - out_offsets(std::move(out_offsets_)) {} - - CpuBlockedMemoryDescPtr desc{nullptr}; - std::shared_ptr executor{nullptr}; - VectorDims in_offsets{}; - VectorDims out_offsets{}; + RepackedInput(std::shared_ptr kernel, + CpuBlockedMemoryDescPtr desc, + VectorDims in_offsets, + VectorDims out_offsets); + + const std::shared_ptr& kernel() const; + const CpuBlockedMemoryDescPtr& desc() const; + const VectorDims& in_offsets() const; + const VectorDims& out_offsets() const; + + private: + std::shared_ptr m_kernel{nullptr}; + CpuBlockedMemoryDescPtr m_desc{nullptr}; + VectorDims m_in_offsets{}; + VectorDims m_out_offsets{}; }; std::unordered_map repacked_inputs = {}; diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 0d84d707318248..fb657263fc3161 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -993,7 +993,7 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr& p) { - return sum + p.second.desc->getCurrentMemSize(); + return sum + p.second.desc()->getCurrentMemSize(); }); if (should_repacking_be_in_parallel()) { @@ -1028,7 +1028,7 @@ std::vector Subgraph::SubgraphExecutor::separately_repack_inputs(cons for (const auto& p : m_repacked_inputs) { const auto in_idx = p.first; const auto& repacked_input = p.second; - const auto& desc = repacked_input.desc; + const auto& desc = repacked_input.desc(); const void* data_ptr = m_buffer_scratchpad->getDataAs() + offset; OPENVINO_ASSERT(in_idx < srcMemPtrs.size(), "Incorrect index of input repacked mem ptr"); @@ -1043,17 +1043,17 @@ std::vector Subgraph::SubgraphExecutor::separately_repack_inputs(cons OPENVINO_ASSERT(shape.size() <= rank6D, "Unsupported shape rank of repacking data"); init_parallel_domain(shape, rank6D, 2lu, dom); - const auto in_strides = repacked_input.in_offsets; - const auto out_strides = repacked_input.out_offsets; + const auto& in_strides = repacked_input.in_offsets(); + const auto& out_strides = repacked_input.out_offsets(); OPENVINO_ASSERT(in_strides.size() == rank6D && out_strides.size() == rank6D && dom.size() == rank6D, "Unsupported shape rank of repacking data"); - const auto& executor = repacked_input.executor; + const auto& kernel = repacked_input.kernel(); parallel_for4d(dom[0], dom[1], dom[2], dom[3], [&](size_t d0, size_t d1, size_t d2, size_t d3) { BrgemmCopyBKernel::call_args args; args.src = src + d0 * in_strides[0] + d1 * in_strides[1] + d2 * in_strides[2] + d3 * in_strides[3]; args.tr_src = dst + d0 * out_strides[0] + d1 * out_strides[1] + d2 * out_strides[2] + d3 * out_strides[3]; - BrgemmCopyBKernelExecutor::execute(executor.get(), &args); + 
(*kernel)(&args); }); reordered_in_ptrs[in_idx] = dst_mem; @@ -1071,8 +1071,8 @@ void Subgraph::SubgraphExecutor::in_parallel_repack_inputs(const std::vectorgetDataAs() + src_offset; args.tr_src = repacked_ptr; - BrgemmCopyBKernelExecutor::execute(repacked_in.executor.get(), &args); + (*repacked_in.kernel())(&args); last_processed_src_offset = src_offset; } diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index ddbe0c51ee1f16..b7864ce539371b 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -186,7 +186,7 @@ class Subgraph::SubgraphExecutor { uint8_t* data_ptr = m_buffer_scratchpad->getDataAs() + m_internal_buffer_size; for (const auto& p : m_repacked_inputs) { - const auto& desc = p.second.desc; + const auto& desc = p.second.desc(); const auto size = desc->getCurrentMemSize(); if (p.first == idx) { return data_ptr + ithr * size; diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp index 430e6b655a55d5..8555ec1e958048 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp @@ -24,13 +24,20 @@ BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster(const ov::snipp const auto& out = shape_infer_consumers.empty() ? param->get_output_port(0) : shape_infer_consumers.back()->get_output_port(0); const auto consumers = out.get_connected_ports(); - const bool brgemm_with_extracted_repacking = - std::any_of(consumers.begin(), consumers.end(), [](const ov::snippets::lowered::ExpressionPort& port) { - auto brgemm = ov::as_type_ptr(port.get_expr()->get_node()); - return brgemm && brgemm_utils::with_repacking(brgemm->get_type()) && port.get_index() == 1; - }); - if (brgemm_with_extracted_repacking) - m_param_idces_with_external_repacking.insert(i); + + for (const auto& consumer : consumers) { + auto brgemm = ov::as_type_ptr(consumer.get_expr()->get_node()); + if (brgemm && brgemm_utils::with_repacking(brgemm->get_type()) && consumer.get_index() == 1) { + const auto src_prc = brgemm->get_input_element_type(0); + const auto wei_prc = brgemm->get_input_element_type(1); + const auto isa = brgemm_utils::get_primitive_isa(src_prc, brgemm_utils::with_amx(brgemm->get_type())); + const auto inner_n_block = brgemm_utils::repacking::compute_inner_n_block(wei_prc); + auto config = BrgemmCopyBKernelConfig(src_prc, wei_prc, isa, false, false, inner_n_block); + m_executors[i] = std::make_shared( + static_cast(m_configurator)->get_cache(), + config); + } + } } } @@ -39,7 +46,8 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin const auto& cpu_config = ov::as_type_ptr(m_configurator->get_config()); size_t data_size = 0; - for (const auto& i : m_param_idces_with_external_repacking) { + for (const auto& p : m_executors) { + const auto& i = p.first; const auto& shape = cpu_config->io_shapes[i]; if (shape == cpu_config->latest_shapes[i]) continue; @@ -49,6 +57,7 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin const auto& K = *++planar_shape.rbegin(); const auto& N = *planar_shape.rbegin(); + // Create CPU Memory descriptor const auto& precision = linear_ir.get_parameters()[i]->get_node()->get_output_element_type(0); const 
auto vnni_factor = brgemm_utils::compute_vnni_factor(precision); const size_t brgemm_kernel_rank = 2; @@ -69,19 +78,14 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin requested_blocked_shape, requested_order); - auto config = BrgemmCopyBKernelConfig(precision, - precision, - dnnl::impl::cpu::x64::cpu_isa_t::avx512_core_amx, - false, - false, - brgemm_utils::repacking::compute_inner_n_block(precision)); - const auto executor = std::make_shared( - static_cast(m_configurator)->get_cache(), - config); + // Create Kernel using BrgemmCopyBExecutor + const auto& executor = p.second; const auto copy_wei_stride = ov::snippets::utils::get_dim_in_stride(shape, cpu_config->io_layouts[i], 1) * precision.size(); - config.update(N, N, K, K, copy_wei_stride, brgemm_utils::repacking::compute_LDB(N, precision)); - executor->update_by_config(config); + const auto generic_config = executor->get_config().get_clone_ptr(); + auto config = static_cast(generic_config.get()); + config->update(N, N, K, K, copy_wei_stride, brgemm_utils::repacking::compute_LDB(N, precision)); + executor->update_by_config(*config); // Save original input offsets for input before repacking. const auto in_offsets = cpu_config->io_data_offsets[i]; @@ -92,7 +96,8 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin // Save new input offsets for input after repacking. const auto out_offsets = cpu_config->io_data_offsets[i]; - cpu_config->repacked_inputs[i] = CPURuntimeConfig::RepackedInput(desc, executor, in_offsets, out_offsets); + cpu_config->repacked_inputs[i] = + CPURuntimeConfig::RepackedInput(executor->get_kernel(), desc, in_offsets, out_offsets); // src data + dst data per kernel call data_size += N * K * precision.size() + new_N * new_K * vnni_factor * precision.size(); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp index 4d0c9586f3be31..6f4e3942b1f581 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp @@ -24,11 +24,11 @@ class BrgemmExternalRepackingAdjuster : public ov::snippets::lowered::pass::Runt bool run(const snippets::lowered::LinearIR& linear_ir) override; bool applicable() const override { - return !m_param_idces_with_external_repacking.empty(); + return !m_executors.empty(); } private: - std::set m_param_idces_with_external_repacking; + std::unordered_map> m_executors; }; } // namespace intel_cpu
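For reference, the requested blocked shape and order built in BrgemmExternalRepackingAdjuster::run() above can be reproduced in isolation. The following is a minimal standalone sketch, not plugin code: div_up and make_requested_repacking are local helpers introduced here for illustration, and the vnni_factor/inner_n_block values in main() are assumed rather than queried via brgemm_utils.

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <numeric>
    #include <utility>
    #include <vector>

    using VectorDims = std::vector<size_t>;

    // Ceiling division, matching the assumed semantics of snippets::utils::div_up.
    static size_t div_up(size_t x, size_t y) {
        return (x + y - 1) / y;
    }

    // Sketch of the requested blocked shape/order construction: batch dims are kept as-is,
    // then {K / vnni, max(N, inner_n_block), vnni} is appended, and the order repeats the
    // K index to express the VNNI inner dimension.
    static std::pair<VectorDims, VectorDims> make_requested_repacking(const VectorDims& planar_shape,
                                                                      size_t vnni_factor,
                                                                      size_t inner_n_block) {
        const size_t rank = planar_shape.size();
        const size_t K = planar_shape[rank - 2];
        const size_t N = planar_shape[rank - 1];

        // Firstly, batch dims are set; then the blocked dims are formed.
        VectorDims blocked_shape(planar_shape.begin(), planar_shape.end() - 2);
        blocked_shape.insert(blocked_shape.end(),
                             {div_up(K, vnni_factor), std::max(N, inner_n_block), vnni_factor});

        // Identity order over batch dims, then {K, N, K-inner}.
        VectorDims order(rank - 2);
        std::iota(order.begin(), order.end(), 0);
        const size_t last_idx = rank - 1;
        order.insert(order.end(), {last_idx - 1, last_idx, last_idx - 1});
        return std::make_pair(blocked_shape, order);
    }

    int main() {
        // Assumed bf16-like parameters: vnni_factor = 2, inner N block = 32.
        const auto result = make_requested_repacking({1, 4, 70, 17}, 2, 32);
        std::cout << "shape:";
        for (size_t d : result.first) std::cout << ' ' << d;   // shape: 1 4 35 32 2
        std::cout << "\norder:";
        for (size_t d : result.second) std::cout << ' ' << d;  // order: 0 1 2 3 2
        std::cout << '\n';
        return 0;
    }

For a planar {1, 4, 70, 17} weights shape with vnni_factor = 2 and inner_n_block = 32, this yields blocked shape {1, 4, 35, 32, 2} and order {0, 1, 2, 3, 2}; the repeated last_idx - 1 in the order is what encodes the VNNI inner dimension in the requested layout.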