From 2b536b7995d0a0d483632fa4b2c37afe9dabc206 Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Mon, 23 Dec 2024 05:50:04 +0100 Subject: [PATCH 1/7] [Snippets][CPU] Added external repacking via BrgemmCopyB --- .../snippets/include/snippets/utils/utils.hpp | 15 +- src/common/snippets/src/utils/utils.cpp | 17 +- .../snippets/cpu_runtime_configurator.cpp | 9 +- .../snippets/cpu_runtime_configurator.hpp | 42 +++- .../emitters/snippets/x64/cpu_generator.cpp | 2 +- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 200 ++++++++++++++---- src/plugins/intel_cpu/src/nodes/subgraph.h | 51 ++++- .../lowered/external_repacking_adjuster.cpp | 61 +++++- 8 files changed, 330 insertions(+), 67 deletions(-) diff --git a/src/common/snippets/include/snippets/utils/utils.hpp b/src/common/snippets/include/snippets/utils/utils.hpp index ff4646f24d03b7..0569a230e91f32 100644 --- a/src/common/snippets/include/snippets/utils/utils.hpp +++ b/src/common/snippets/include/snippets/utils/utils.hpp @@ -290,13 +290,26 @@ std::shared_ptr get_leaf_node_of_first_child_shape_infer_seq(const std std::shared_ptr get_leaf_node_of_first_parent_shape_infer_seq(const std::shared_ptr& start_node); /** - * * @param Get stride of input/output dimension * @param expr_port target port that contains shape and layout info * @param idx index of the target dimension starting from the shape's end (default = 1) */ int64_t get_dim_stride(const lowered::ExpressionPort& expr_port, size_t idx = 1); +/** + * @brief Get stride of input dimension + * @param shape target shape + * @param layout target layout + * @param idx index of the target dimension starting from the shape's end (default = 1) + */ +int64_t get_dim_in_stride(const VectorDims& shape, const VectorDims& layout, size_t idx = 1); +/** + * @brief Get stride of output dimension + * @param shape target shape + * @param layout target layout + * @param idx index of the target dimension starting from the shape's end (default = 1) + */ +int64_t get_dim_out_stride(const VectorDims& shape, const VectorDims& layout, size_t idx = 1); /** * @brief Traverses path starting from "expr", and calls "func" for each expression. 
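Both new helpers resolve a layout index (get_input_dim_idx vs get_output_dim_idx) and then defer to the same get_stride(dim_idx, shape). A minimal sketch of the assumed dense-stride semantics follows; dense_stride is a hypothetical stand-in for snippets' get_stride and is not part of this patch:

#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// Stride of dimension `dim_idx` in a dense row-major shape:
// the product of all dimensions to its right.
int64_t dense_stride(size_t dim_idx, const std::vector<size_t>& shape) {
    return std::accumulate(shape.begin() + dim_idx + 1, shape.end(),
                           int64_t(1), std::multiplies<int64_t>());
}

// For shape {2, 3, 64, 128}: dense_stride(2, shape) == 128 and
// dense_stride(1, shape) == 64 * 128; the layout only changes which
// dim_idx the idx-from-the-end lookup resolves to.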
diff --git a/src/common/snippets/src/utils/utils.cpp b/src/common/snippets/src/utils/utils.cpp index e7381fe6754758..249970b65baa5d 100644 --- a/src/common/snippets/src/utils/utils.cpp +++ b/src/common/snippets/src/utils/utils.cpp @@ -317,14 +317,21 @@ std::shared_ptr get_leaf_node_of_first_parent_shape_infer_seq(const st } int64_t get_dim_stride(const lowered::ExpressionPort& expr_port, size_t idx) { - size_t dim_idx = 0; + const auto& shape = expr_port.get_descriptor_ptr()->get_shape(); const auto& layout = expr_port.get_descriptor_ptr()->get_layout(); switch (expr_port.get_type()) { - case lowered::ExpressionPort::Input: dim_idx = utils::get_input_dim_idx(layout, idx); break; - case lowered::ExpressionPort::Output: dim_idx = utils::get_output_dim_idx(layout, idx); break; - default: OPENVINO_THROW("Unsupported expression port type!"); + case lowered::ExpressionPort::Input: return get_dim_in_stride(shape, layout, idx); + case lowered::ExpressionPort::Output: return get_dim_out_stride(shape, layout, idx); } - return get_stride(dim_idx, expr_port.get_descriptor_ptr()->get_shape()); + OPENVINO_THROW("Unsupported expression port type!"); +} + +int64_t get_dim_in_stride(const VectorDims& shape, const VectorDims& layout, size_t idx) { + return get_stride(utils::get_input_dim_idx(layout, idx), shape); +} + +int64_t get_dim_out_stride(const VectorDims& shape, const VectorDims& layout, size_t idx) { + return get_stride(utils::get_output_dim_idx(layout, idx), shape); } void visit_path(const lowered::ExpressionPtr& expr, diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index 65741d7031d289..3ad41d707bb96b 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -7,7 +7,7 @@ #include "snippets/lowered/loop_manager.hpp" #include "snippets/utils/utils.hpp" -#ifndef OPENVINO_ARCH_ARM64 +#ifdef OPENVINO_ARCH_X86_64 # include "transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.hpp" # include "transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp" #endif @@ -39,12 +39,13 @@ std::string CPURuntimeConfig::to_string() const { } #endif -CPURuntimeConfigurator::CPURuntimeConfigurator() - : ov::snippets::RuntimeConfigurator(std::make_shared()) {} +CPURuntimeConfigurator::CPURuntimeConfigurator(ov::intel_cpu::MultiCacheWeakPtr cache) + : ov::snippets::RuntimeConfigurator(std::make_shared()), + compiled_kernel_cache(std::move(cache)) {} void CPURuntimeConfigurator::initialization(const ov::snippets::lowered::LinearIRCPtr& linear_ir) { RuntimeConfigurator::initialization(linear_ir); -#ifndef OPENVINO_ARCH_ARM64 +#ifdef OPENVINO_ARCH_X86_64 RuntimeOptimizer::register_if_applicable(m_intermediate_optimizers, linear_ir, this); RuntimeOptimizer::register_if_applicable(m_final_optimizers, linear_ir, this); #endif diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp index 1706670ce870d1..a8bab52eb61513 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp @@ -5,6 +5,12 @@ #pragma once #include "emitters/snippets/jit_snippets_call_args.hpp" + +#ifdef OPENVINO_ARCH_X86_64 +# include "emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp" 
+#endif + +#include "cache/multi_cache.h" #include "memory_desc/cpu_blocked_memory_desc.h" #include "snippets/lowered/port_descriptor.hpp" #include "snippets/runtime_configurator.hpp" @@ -21,13 +27,39 @@ class CPURuntimeConfig : public ov::snippets::RuntimeConfig { std::string to_string() const override; #endif +#ifdef OPENVINO_ARCH_X86_64 + struct RepackedInput { + RepackedInput() = default; + RepackedInput(CpuBlockedMemoryDescPtr desc_, + std::shared_ptr executor_, + VectorDims in_offsets_, + VectorDims out_offsets_) + : desc(std::move(desc_)), + executor(std::move(executor_)), + in_offsets(std::move(in_offsets_)), + out_offsets(std::move(out_offsets_)) {} + + CpuBlockedMemoryDescPtr desc{nullptr}; + std::shared_ptr executor{nullptr}; + VectorDims in_offsets{}; + VectorDims out_offsets{}; + }; + std::unordered_map repacked_inputs = {}; + + enum class RepackingImplType { + NONE, // no kernel-outside repacking + IN_PARALLEL, // should be executed in parallel_nt by each thread + SEPARATE, // should be separathy from kernel executed + }; + RepackingImplType repacking_impl_type = RepackingImplType::NONE; +#endif // OPENVINO_ARCH_X86_64 + std::vector loop_args = {}; - std::unordered_map m_in_requested_descs = {}; }; class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { public: - CPURuntimeConfigurator(); + CPURuntimeConfigurator(ov::intel_cpu::MultiCacheWeakPtr cache = {}); /** * @brief Calculate Loop parameters of Loop emitters and update these values in CPURuntimeConfig @@ -35,6 +67,10 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { */ void update_loop_args(const ov::snippets::lowered::LinearIRCPtr& linear_ir) const; + const ov::intel_cpu::MultiCacheWeakPtr& get_cache() const { + return compiled_kernel_cache; + } + protected: void update(const ov::snippets::lowered::LinearIRCPtr& linear_ir) override; void update_tensor_rank(const ov::snippets::VectorDims& master_shape) const override; @@ -42,6 +78,8 @@ class CPURuntimeConfigurator : public ov::snippets::RuntimeConfigurator { void initialization(const ov::snippets::lowered::LinearIRCPtr& linear_ir) override; static const size_t rank6D; + + ov::intel_cpu::MultiCacheWeakPtr compiled_kernel_cache; }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp index 39e384837856a1..96da1fa30079a8 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp @@ -165,7 +165,7 @@ class jit_snippet : public dnnl::impl::cpu::x64::jit_generator { intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t host_isa, ov::intel_cpu::MultiCacheWeakPtr cache) - : TargetMachine(std::make_shared()), + : TargetMachine(std::make_shared(cache)), h(new jit_snippet()), isa(host_isa), compiled_kernel_cache(std::move(cache)) { diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 2b0c7b55fb043d..0f35c017ceded1 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -78,8 +78,15 @@ class SubgraphStaticExecutor : public Subgraph::SubgraphExecutor { const std::vector& start_offset_in, const std::vector& start_offset_out, const std::shared_ptr& snippet_config, - const BufferScratchpadAllocator& allocator) - : SubgraphExecutor(snippet_attrs, snippet, start_offset_in, start_offset_out, 
snippet_config, allocator) {} + const BufferScratchpadAllocator& allocator, + const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache) + : SubgraphExecutor(snippet_attrs, + snippet, + start_offset_in, + start_offset_out, + snippet_config, + allocator, + kernel_cache) {} void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) override { const auto& callable = m_schedule->get_callable(); @@ -87,7 +94,12 @@ class SubgraphStaticExecutor : public Subgraph::SubgraphExecutor { auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) { init_call_args(call_args, inMemPtrs, outMemPtrs, ithr); }; - auto caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes) { + + auto caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes, size_t ithr) { +#ifdef OPENVINO_ARCH_X86_64 + if (should_repacking_be_in_parallel()) + in_parallel_repack_inputs(inMemPtrs, indexes, ithr, call_args); +#endif // OPENVINO_ARCH_X86_64 callable(&call_args, indexes.data()); }; @@ -123,8 +135,15 @@ class SubgraphDynamicSpecializedExecutor : public Subgraph::SubgraphExecutor { const std::vector& start_offset_in, const std::vector& start_offset_out, const std::shared_ptr& snippet_config, - const BufferScratchpadAllocator& allocator) - : SubgraphExecutor(snippet_attrs, snippet, start_offset_in, start_offset_out, snippet_config, allocator) { + const BufferScratchpadAllocator& allocator, + const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache) + : SubgraphExecutor(snippet_attrs, + snippet, + start_offset_in, + start_offset_out, + snippet_config, + allocator, + kernel_cache) { buffer_offsets = snippet_config->buffer_cluster_offsets; data_offsets = snippet_config->io_data_offsets; loop_args = snippet_config->loop_args; @@ -149,8 +168,13 @@ class SubgraphDynamicSpecializedExecutor : public Subgraph::SubgraphExecutor { auto initializer = [&](jit_snippets_call_args& call_args, size_t ithr) { init_call_args(call_args, ithr); }; - auto caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes) { + + auto caller = [&](jit_snippets_call_args& call_args, const std::vector& indexes, size_t ithr) { update_ptrs(call_args, src_ptrs, dst_ptrs, indexes); +#ifdef OPENVINO_ARCH_X86_64 + if (should_repacking_be_in_parallel()) + in_parallel_repack_inputs(inMemPtrs, indexes, ithr, call_args); +#endif // OPENVINO_ARCH_X86_64 callable(&call_args); }; @@ -827,7 +851,8 @@ void Subgraph::prepareParams() { start_offset_in, start_offset_out, snippet_config, - allocator); + allocator, + cache); } else { // Static case: // 1. 
Update runtime config to get static scheduling data (io data offsets, parallel domain) which will be @@ -845,7 +870,8 @@ void Subgraph::prepareParams() { start_offset_in, start_offset_out, snippet_config, - allocator); + allocator, + cache); } }; @@ -936,7 +962,8 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr& start_offset_in, const std::vector& start_offset_out, const std::shared_ptr& snippet_config, - const BufferScratchpadAllocator& allocator) + const BufferScratchpadAllocator& allocator, + const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache) : m_schedule(snippet->get()), m_start_offset_in(start_offset_in), m_start_offset_out(start_offset_out) { @@ -954,15 +981,34 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr(m_nthreads) * m_buffer_scratchpad_size; - m_in_requested_descs = snippet_config->m_in_requested_descs; - const auto external_repacking_buffer_size = - std::accumulate(m_in_requested_descs.begin(), - m_in_requested_descs.end(), + +#if defined(OPENVINO_ARCH_X86_64) + m_repacking_impl_type = snippet_config->repacking_impl_type; + m_repacked_inputs = snippet_config->repacked_inputs; + + auto external_buffer_size = + std::accumulate(m_repacked_inputs.begin(), + m_repacked_inputs.end(), size_t(0), - [](size_t sum, const std::pair& requested_desc_elem) { - return sum + requested_desc_elem.second->getCurrentMemSize(); + [](size_t sum, const std::pair& p) { + return sum + p.second.desc->getCurrentMemSize(); }); - m_buffer_scratchpad = allocator(m_internal_buffer_size + external_repacking_buffer_size); + + if (should_repacking_be_in_parallel()) { + // When external repacking is applied in a parallel section, + // each thread should have its own buffer to store repacked data + external_buffer_size *= m_nthreads; + + // To avoid extra runtime overhead on unordered_map creation, + // we initialize `repacked_offsets_by_threads` up front here + for (int i = 0; i < m_nthreads; ++i) + m_repacked_offsets_by_threads[i] = {}; + } + +#else + const auto external_buffer_size = 0lu; +#endif // OPENVINO_ARCH_X86_64 + m_buffer_scratchpad = allocator(m_internal_buffer_size + external_buffer_size); #if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS) const auto target = std::dynamic_pointer_cast( @@ -971,6 +1017,84 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr Subgraph::SubgraphExecutor::separately_repack_inputs(const dnnl::stream& strm, const std::vector& srcMemPtrs) { + auto get_batch_stride = [](const std::vector strides) { + for (size_t i = 2; i < strides.size(); ++i) + if (*(strides.rbegin() + i) != 0) // handle broadcasting pattern + return *(strides.rbegin() + i); + return (*++strides.rbegin()); + }; + + auto reordered_in_ptrs = srcMemPtrs; + size_t offset = m_internal_buffer_size; + for (const auto& p : m_repacked_inputs) { + const auto in_idx = p.first; + const auto& repacked_input = p.second; + const auto& desc = repacked_input.desc; + const void* data_ptr = m_buffer_scratchpad->getDataAs() + offset; + + OPENVINO_ASSERT(in_idx < srcMemPtrs.size(), "Incorrect index of input repacked mem ptr"); + const auto& src_mem = srcMemPtrs[in_idx]; + const auto& dst_mem = std::make_shared(strm.get_engine(), desc, data_ptr, false); + + const auto& shape = dst_mem->getShape().getDims(); + const auto batch = std::accumulate(shape.rbegin() + 2, shape.rend(), 1lu, std::multiplies()); + const auto in_stride = get_batch_stride(repacked_input.in_offsets); + const auto out_stride =
get_batch_stride(repacked_input.out_offsets); + + const auto* src = src_mem->getDataAs(); + auto* dst = dst_mem->getDataAs(); + + const auto& executor = repacked_input.executor; + parallel_for(batch, [&](size_t b0) { + BrgemmCopyBKernel::call_args args; + args.src = src + b0 * in_stride + m_start_offset_in[in_idx]; + args.tr_src = dst + b0 * out_stride; + BrgemmCopyBKernelExecutor::execute(executor.get(), &args); + }); + + reordered_in_ptrs[in_idx] = dst_mem; + offset += desc->getCurrentMemSize(); + } + return reordered_in_ptrs; +} + +void Subgraph::SubgraphExecutor::in_parallel_repack_inputs(const std::vector& inMemPtrs, + const std::vector& indexes, + int ithr, + jit_snippets_call_args& call_args) { + for (const auto& p : m_repacked_inputs) { + const auto& in_idx = p.first; + const auto& repacked_in = p.second; + + const auto& src_offsets = repacked_in.in_offsets; + const auto& dst_offsets = repacked_in.out_offsets; + + size_t src_offset = m_start_offset_in[in_idx], dst_offset = 0; + for (size_t j = 0; j < indexes.size(); j++) { + src_offset += src_offsets[j] * indexes[j]; + dst_offset += dst_offsets[j] * indexes[j]; + } + + uint8_t* repacked_ptr = get_external_scratchpad_ptr(ithr, in_idx) + dst_offset; + + auto& offsets = m_repacked_offsets_by_threads.at(ithr)[in_idx]; + if (offsets.count(src_offset) == 0) { + BrgemmCopyBKernel::call_args args; + args.src = inMemPtrs[in_idx]->getDataAs() + src_offset; + args.tr_src = repacked_ptr; + BrgemmCopyBKernelExecutor::execute(repacked_in.executor.get(), &args); + + offsets.insert(src_offset); + } + + call_args.src_ptrs[in_idx] = repacked_ptr; + } +} +#endif // OPENVINO_ARCH_X86_64 + #if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS) void Subgraph::SubgraphExecutor::segfault_detector() { if (enabled_segfault_detector) { @@ -991,7 +1115,7 @@ void Subgraph::SubgraphExecutor::segfault_detector() { void Subgraph::SubgraphExecutor::parallel_for6d( const std::function& initializer, - const std::function&)>& caller) { + const std::function&, size_t)>& caller) { const auto& dom = m_parallel_exec_domain; #if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS) @@ -1018,7 +1142,7 @@ void Subgraph::SubgraphExecutor::parallel_for6d( indexes[4], dom[4]); for (size_t iwork = start; iwork < end; ++iwork) { - caller(call_args, indexes); + caller(call_args, indexes, ithr); parallel_it_step(indexes[0], dom[0], indexes[1], @@ -1030,12 +1154,16 @@ void Subgraph::SubgraphExecutor::parallel_for6d( indexes[4], dom[4]); } + +#ifdef OPENVINO_ARCH_X86_64 + clean_repacked_offsets(ithr); +#endif // OPENVINO_ARCH_X86_64 }); } void Subgraph::SubgraphExecutor::parallel_forNd( const std::function& initializer, - const std::function&)>& caller) { + const std::function&, size_t)>& caller) { const auto& dom = m_parallel_exec_domain; #if defined(__linux__) && defined(OPENVINO_ARCH_X86_64) && defined(SNIPPETS_DEBUG_CAPS) @@ -1057,37 +1185,25 @@ void Subgraph::SubgraphExecutor::parallel_forNd( tmp /= dom[j]; } - caller(call_args, indexes); + caller(call_args, indexes, ithr); } + +#ifdef OPENVINO_ARCH_X86_64 + clean_repacked_offsets(ithr); +#endif // OPENVINO_ARCH_X86_64 }); } void Subgraph::SubgraphExecutor::execute(const dnnl::stream& strm, const std::vector& inMemPtrs, const std::vector& outMemPtrs) { - if (!m_in_requested_descs.empty()) { - auto reorderedInMemPtrs = reorder_inputs(strm, inMemPtrs); - exec_impl(reorderedInMemPtrs, outMemPtrs); - } else { - exec_impl(inMemPtrs, outMemPtrs); - } -} - -std::vector 
Subgraph::SubgraphExecutor::reorder_inputs(const dnnl::stream& strm, - const std::vector& inMemPtrs) { - auto reordered_in_ptrs = inMemPtrs; - size_t offset = m_internal_buffer_size; - for (const auto& requested_descs_elem : m_in_requested_descs) { - const auto in_idx = requested_descs_elem.first; - const auto& requested_desc = requested_descs_elem.second; - - const void* data_ptr = m_buffer_scratchpad->getDataAs() + offset; - const auto scratch_mem = std::make_shared(strm.get_engine(), requested_desc, data_ptr, false); - scratch_mem->load(*reordered_in_ptrs[in_idx]); - reordered_in_ptrs[in_idx] = scratch_mem; - offset += requested_desc->getCurrentMemSize(); +#ifdef OPENVINO_ARCH_X86_64 + if (should_repacking_be_separately()) { + exec_impl(separately_repack_inputs(strm, inMemPtrs), outMemPtrs); + return; } - return reordered_in_ptrs; +#endif // OPENVINO_ARCH_X86_64 + exec_impl(inMemPtrs, outMemPtrs); } } // namespace node diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index aac0fa1ea2f535..74f84b11564989 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -128,7 +128,8 @@ class Subgraph::SubgraphExecutor { const std::vector& start_offset_in, const std::vector& start_offset_out, const std::shared_ptr& snippet_config, - const BufferScratchpadAllocator& allocator); + const BufferScratchpadAllocator& allocator, + const ov::intel_cpu::MultiCacheWeakPtr& kernel_cache); virtual ~SubgraphExecutor() = default; void execute(const dnnl::stream& strm, @@ -139,9 +140,9 @@ class Subgraph::SubgraphExecutor { virtual void exec_impl(const std::vector& inMemPtrs, const std::vector& outMemPtrs) = 0; void parallel_for6d(const std::function& initializer, - const std::function&)>& caller); + const std::function&, size_t)>& caller); void parallel_forNd(const std::function& initializer, - const std::function&)>& caller); + const std::function&, size_t)>& caller); inline void update_scratchpad_ptr(void*& scratchpad_ptr, size_t ithr) const { if (m_buffer_scratchpad_size > 0) @@ -172,10 +173,48 @@ class Subgraph::SubgraphExecutor { inline void segfault_detector(); #endif -private: - std::vector reorder_inputs(const dnnl::stream& strm, const std::vector& inMemPtrs); +#ifdef OPENVINO_ARCH_X86_64 + std::vector separately_repack_inputs(const dnnl::stream& strm, const std::vector& srcMemPtrs); + void in_parallel_repack_inputs(const std::vector& inMemPtrs, + const std::vector& indexes, + int ithr, + jit_snippets_call_args& call_args); + + inline uint8_t* get_external_scratchpad_ptr(size_t ithr, size_t idx) const { + if (m_repacked_inputs.empty()) + return nullptr; + + uint8_t* data_ptr = m_buffer_scratchpad->getDataAs() + m_internal_buffer_size; + for (const auto& p : m_repacked_inputs) { + const auto& desc = p.second.desc; + const auto size = desc->getCurrentMemSize(); + if (p.first == idx) { + return data_ptr + ithr * size; + } + data_ptr += m_nthreads * size; + } + OPENVINO_THROW("External buffer pointer has not been found"); + } + + // [ Input index -> set of src offsets which are already repacked ] + using RepackedSrcOffsets = std::unordered_map>; + std::unordered_map m_repacked_offsets_by_threads = {}; + std::unordered_map m_repacked_inputs = {}; + + inline bool should_repacking_be_separately() const { + return m_repacking_impl_type == CPURuntimeConfig::RepackingImplType::SEPARATE; + } + inline bool should_repacking_be_in_parallel() const { + return m_repacking_impl_type ==
CPURuntimeConfig::RepackingImplType::IN_PARALLEL; + } + inline void clean_repacked_offsets(size_t ithr) { + if (should_repacking_be_in_parallel()) + m_repacked_offsets_by_threads.at(ithr).clear(); + } - std::unordered_map m_in_requested_descs = {}; +private: + CPURuntimeConfig::RepackingImplType m_repacking_impl_type = CPURuntimeConfig::RepackingImplType::NONE; +#endif // OPENVINO_ARCH_X86_64 }; } // namespace node diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp index 78f9b928298a9d..5b425473bbfa1d 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp @@ -38,9 +38,14 @@ BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster(const ov::snipp bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& linear_ir) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::BrgemmExternalRepackingAdjuster") const auto& cpu_config = ov::as_type_ptr(m_configurator->get_config()); - auto& optimal_descs = cpu_config->m_in_requested_descs; + const float L2_cache_size = dnnl::utils::get_cache_size(2, true); + + bool fit_into_L2 = true; for (const auto& i : m_param_idces_with_external_repacking) { const auto& shape = cpu_config->io_shapes[i]; + if (shape == cpu_config->latest_shapes[i]) + continue; + const auto& K = *++shape.rbegin(); const auto& N = *shape.rbegin(); @@ -50,23 +55,67 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin // Firstly, batch dims are set VectorDims requested_blocked_shape(shape.begin(), shape.end() - brgemm_kernel_rank); // Then, the blocked dims are formed - requested_blocked_shape.insert(requested_blocked_shape.end(), - {snippets::utils::div_up(K, vnni_factor), - std::max(N, brgemm_utils::repacking::compute_inner_n_block(precision)), - vnni_factor}); + const auto new_K = snippets::utils::div_up(K, vnni_factor); + const auto new_N = std::max(N, brgemm_utils::repacking::compute_inner_n_block(precision)); + requested_blocked_shape.insert(requested_blocked_shape.end(), {new_K, new_N, vnni_factor}); VectorDims requested_order(shape.size() - brgemm_kernel_rank); std::iota(requested_order.begin(), requested_order.end(), 0); const auto last_idx = shape.size() - 1; requested_order.insert(requested_order.end(), {last_idx - 1, last_idx, last_idx - 1}); - optimal_descs[i] = + const auto desc = std::make_shared(precision, Shape(shape), requested_blocked_shape, requested_order); + auto config = BrgemmCopyBKernelConfig(precision, + precision, + dnnl::impl::cpu::x64::cpu_isa_t::avx512_core_amx, + false, + false, + brgemm_utils::repacking::compute_inner_n_block(precision)); + const auto executor = std::make_shared( + static_cast(m_configurator)->get_cache(), + config); + config.update(N, + N, + K, + K, + ov::snippets::utils::get_dim_in_stride(shape, cpu_config->io_layouts[i], 1) * precision.size(), + brgemm_utils::repacking::compute_LDB(N, precision)); + executor->update_by_config(config); + + // Save original input offsets for input before repacking. 
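+ // They are stored in RepackedInput::in_offsets and later serve as the source strides for the repacking kernel at execution time.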
+ const auto in_offsets = cpu_config->io_data_offsets[i]; + ov::snippets::VectorDims shape_for_offset(cpu_config->tensor_rank - shape.size(), 1); shape_for_offset.insert(shape_for_offset.end(), requested_blocked_shape.begin(), requested_blocked_shape.end()); m_configurator->compute_offsets(shape_for_offset, i, 0); + // Save new input offsets for input after repacking. + const auto out_offsets = cpu_config->io_data_offsets[i]; + + cpu_config->repacked_inputs[i] = CPURuntimeConfig::RepackedInput(desc, executor, in_offsets, out_offsets); + + const auto src_size = N * K * precision.size(); + const auto dst_size = new_N * new_K * precision.size(); + fit_into_L2 &= ((src_size + dst_size) < L2_cache_size); } + + if (!cpu_config->repacked_inputs.empty()) { + // Heuristic: if the external repacking data doesn't fit into the L2 cache, + // external repacking should be executed in a separate parallel section before kernel execution. + cpu_config->repacking_impl_type = fit_into_L2 ? CPURuntimeConfig::RepackingImplType::IN_PARALLEL + : CPURuntimeConfig::RepackingImplType::SEPARATE; + + // In the parallel case, the kernel should not add offsets to repacked inputs because + // they will be applied during repacking at the execution stage + if (cpu_config->repacking_impl_type == CPURuntimeConfig::RepackingImplType::IN_PARALLEL) { + for (const auto& in : cpu_config->repacked_inputs) { + auto& offsets = cpu_config->io_data_offsets[in.first]; + std::fill(offsets.begin(), offsets.end(), 0); + } + } + } + return true; } From b66565900baa0f52a8fb5ced85770bef41e20455 Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Mon, 23 Dec 2024 15:42:13 +0400 Subject: [PATCH 2/7] [Snippets][CPU] Fixed build on non-x64 platforms --- .../src/emitters/snippets/cpu_runtime_configurator.cpp | 2 +- .../src/emitters/snippets/cpu_runtime_configurator.hpp | 6 +++--- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index 3ad41d707bb96b..43b3ea14cc148a 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -7,7 +7,7 @@ #include "snippets/lowered/loop_manager.hpp" #include "snippets/utils/utils.hpp" -#ifdef OPENVINO_ARCH_X86_64 +#ifndef OPENVINO_ARCH_ARM64 # include "transformations/snippets/x64/pass/lowered/brgemm_copy_b_loop_ports_adjuster.hpp" # include "transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp" #endif diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp index a8bab52eb61513..513ff65fee912b 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp @@ -6,7 +6,7 @@ #include "emitters/snippets/jit_snippets_call_args.hpp" -#ifdef OPENVINO_ARCH_X86_64 +#ifndef OPENVINO_ARCH_ARM64 # include "emitters/snippets/x64/kernel_executors/brgemm_copy_b.hpp" #endif @@ -27,7 +27,7 @@ class CPURuntimeConfig : public ov::snippets::RuntimeConfig { std::string to_string() const override; #endif -#ifdef OPENVINO_ARCH_X86_64 +#ifndef OPENVINO_ARCH_ARM64 struct RepackedInput { RepackedInput() = default; RepackedInput(CpuBlockedMemoryDescPtr desc_, @@ -52,7 +52,7 @@ class CPURuntimeConfig : public ov::snippets::RuntimeConfig { SEPARATE, // should be executed separately from the kernel }; RepackingImplType repacking_impl_type = RepackingImplType::NONE; -#endif // OPENVINO_ARCH_X86_64 +#endif // OPENVINO_ARCH_ARM64 std::vector loop_args = {}; }; diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 0f35c017ceded1..1cfc785fb26895 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -982,7 +982,7 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr(m_nthreads) * m_buffer_scratchpad_size; -#if defined(OPENVINO_ARCH_X86_64) +#ifdef OPENVINO_ARCH_X86_64 m_repacking_impl_type = snippet_config->repacking_impl_type; m_repacked_inputs = snippet_config->repacked_inputs; From 72bf13a567343432512bb268ed8e3b2bc93b85cb Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Tue, 24 Dec 2024 08:14:04 +0100 Subject: [PATCH 3/7] [Snippets][CPU] Updated heuristic --- .../x64/pass/lowered/external_repacking_adjuster.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp index 5b425473bbfa1d..1941523adfb834 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp @@ -38,9 +38,8 @@ BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster(const ov::snipp bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& linear_ir) { OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::BrgemmExternalRepackingAdjuster") const auto& cpu_config = ov::as_type_ptr(m_configurator->get_config()); - const float L2_cache_size = dnnl::utils::get_cache_size(2, true); - bool fit_into_L2 = true; + size_t data_size = 0; for (const auto& i : m_param_idces_with_external_repacking) { const auto& shape = cpu_config->io_shapes[i]; if (shape == cpu_config->latest_shapes[i]) continue; @@ -95,12 +94,13 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin cpu_config->repacked_inputs[i] = CPURuntimeConfig::RepackedInput(desc, executor, in_offsets, out_offsets); - const auto src_size = N * K * precision.size(); - const auto dst_size = new_N * new_K * precision.size(); - fit_into_L2 &= ((src_size + dst_size) < L2_cache_size); + // src data + dst data per kernel call + data_size += N * K * precision.size() + new_N * new_K * vnni_factor * precision.size(); } if (!cpu_config->repacked_inputs.empty()) { + const auto L2_cache_size = dnnl::utils::get_cache_size(2, true); + const auto fit_into_L2 = data_size < L2_cache_size; // Heuristic: if the external repacking data doesn't fit into the L2 cache, // external repacking should be executed in a separate parallel section before kernel execution. cpu_config->repacking_impl_type = fit_into_L2 ?
CPURuntimeConfig::RepackingImplType::IN_PARALLEL From 4b33eaa2f80487f8b50339c11fc74c2c41ddf782 Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Wed, 25 Dec 2024 07:43:42 +0100 Subject: [PATCH 4/7] [Snippets][CPU] Added inplace-Transpose support --- .../snippets/include/snippets/op/reshape.hpp | 21 ++++++++++ .../shape_inference/shape_infer_instances.hpp | 8 ++++ .../include/snippets/snippets_isa_tbl.hpp | 1 + src/common/snippets/src/generator.cpp | 1 + src/common/snippets/src/op/reshape.cpp | 41 +++++++++++++++++++ src/common/snippets/src/op/subgraph.cpp | 1 + .../snippets/src/runtime_configurator.cpp | 19 ++++++++- .../shape_inference/shape_infer_instances.cpp | 11 +++++ .../src/shape_inference/shape_inference.cpp | 1 + .../emitters/snippets/x64/cpu_generator.cpp | 1 + src/plugins/intel_cpu/src/nodes/subgraph.cpp | 37 ++++++++--------- .../x64/pass/eliminate_brgemm_copy_b.cpp | 24 +++++++++-- .../adjust_brgemm_copy_b_loop_ports.cpp | 6 ++- .../lowered/external_repacking_adjuster.cpp | 32 +++++++-------- 14 files changed, 159 insertions(+), 45 deletions(-) diff --git a/src/common/snippets/include/snippets/op/reshape.hpp b/src/common/snippets/include/snippets/op/reshape.hpp index b4e0c9233c73f0..d80a02ebc33c9a 100644 --- a/src/common/snippets/include/snippets/op/reshape.hpp +++ b/src/common/snippets/include/snippets/op/reshape.hpp @@ -32,6 +32,27 @@ class Reshape : public ov::op::Op { ov::PartialShape m_target_shape = {}; }; +/** + * @interface ReshapeWithOrder + * @brief ReshapeWithOrder reshapes the input tensor shape by the required target order. + * The tensor data is not updated. + * Note: Order is stored in input PortDescriptor + * @ingroup snippets + */ +class ReshapeWithOrder : public ov::op::Op { +public: + OPENVINO_OP("ReshapeWithOrder", "SnippetsOpset"); + ReshapeWithOrder() = default; + ReshapeWithOrder(const Output& x, std::vector order); + + bool visit_attributes(AttributeVisitor& visitor) override; + std::shared_ptr clone_with_new_inputs(const OutputVector& new_args) const override; + void validate_and_infer_types() override; + +private: + void custom_constructor_validate_and_infer_types(std::vector order); +}; + } // namespace op } // namespace snippets } // namespace ov diff --git a/src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp b/src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp index 1b91ea573ab1c4..c062fed338638d 100644 --- a/src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp +++ b/src/common/snippets/include/snippets/shape_inference/shape_infer_instances.hpp @@ -82,5 +82,13 @@ class ReshapeShapeInfer : public IShapeInferSnippets { explicit ReshapeShapeInfer(const std::shared_ptr& n); Result infer(const std::vector& input_shapes) override; }; + +class ReshapeWithOrderShapeInfer : public IShapeInferSnippets { std::vector m_target_order {}; +public: + explicit ReshapeWithOrderShapeInfer(const std::shared_ptr& n); + Result infer(const std::vector& input_shapes) override; +}; + } // namespace snippets } // namespace ov diff --git a/src/common/snippets/include/snippets/snippets_isa_tbl.hpp b/src/common/snippets/include/snippets/snippets_isa_tbl.hpp index 9b207b09fe411f..5c5e0f3701ad42 100644 --- a/src/common/snippets/include/snippets/snippets_isa_tbl.hpp +++ b/src/common/snippets/include/snippets/snippets_isa_tbl.hpp @@ -17,6 +17,7 @@ OV_OP(LoopEnd, ov::snippets::op) OV_OP(Brgemm, ov::snippets::op) OV_OP(BroadcastLoad, ov::snippets::op) OV_OP(Reshape, ov::snippets::op) +OV_OP(ReshapeWithOrder, ov::snippets::op) OV_OP(Store, ov::snippets::op) diff --git a/src/common/snippets/src/generator.cpp b/src/common/snippets/src/generator.cpp index d059ddd94d5724..7869b4427d579d 100644 --- a/src/common/snippets/src/generator.cpp +++ b/src/common/snippets/src/generator.cpp @@ -77,6 +77,7 @@ RegType Generator::get_op_out_reg_type(const ov::Output& out) const { std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) || + std::dynamic_pointer_cast(op) || std::dynamic_pointer_cast(op) #ifdef SNIPPETS_DEBUG_CAPS || std::dynamic_pointer_cast(op) diff --git a/src/common/snippets/src/op/reshape.cpp b/src/common/snippets/src/op/reshape.cpp index 72823d2815cdbf..ae7887e558b5f2 100644 --- a/src/common/snippets/src/op/reshape.cpp +++ b/src/common/snippets/src/op/reshape.cpp @@ -11,6 +11,7 @@ namespace ov { namespace snippets { namespace op { + Reshape::Reshape(const Output& arg, ov::PartialShape target_shape) : Op({arg}), m_target_shape(std::move(target_shape)) { constructor_validate_and_infer_types(); @@ -38,6 +39,46 @@ const ov::PartialShape& Reshape::get_target_shape() const { void Reshape::set_target_shape(ov::PartialShape shape) { m_target_shape = std::move(shape); } + +ReshapeWithOrder::ReshapeWithOrder(const Output& arg, std::vector order) + : Op({arg}) { + custom_constructor_validate_and_infer_types(std::move(order)); +} + +void ReshapeWithOrder::custom_constructor_validate_and_infer_types(std::vector order) { + INTERNAL_OP_SCOPE(ReshapeWithOrder_constructor_validate_and_infer_types); + + const auto& input_pshape = get_input_partial_shape(0); + OPENVINO_ASSERT(input_pshape.rank().is_static() && input_pshape.size() == order.size(), + "Incompatible shape and order sizes"); + + // During ctor call, ReshapeWithOrder doesn't know its port descriptors.
+ // So we use explicit layouts from parameters + set_output_type(0, get_input_element_type(0), ov::snippets::utils::get_planar_pshape(input_pshape, order)); +} + +void ReshapeWithOrder::validate_and_infer_types() { + const auto& input_pshape = get_input_partial_shape(0); + const auto order = lowered::PortDescriptorUtils::get_port_descriptor_ptr(input(0))->get_layout(); + OPENVINO_ASSERT(input_pshape.rank().is_static() && input_pshape.size() == order.size(), + "Incompatible shape and order sizes"); + const auto output_pshape = utils::get_planar_pshape(get_input_partial_shape(0), order); + set_output_type(0, get_input_element_type(0), output_pshape); +} + +std::shared_ptr ReshapeWithOrder::clone_with_new_inputs(const OutputVector& new_args) const { + INTERNAL_OP_SCOPE(ReshapeWithOrder); + check_new_args_count(this, new_args); + const auto& order = lowered::PortDescriptorUtils::get_port_descriptor_ptr(input(0))->get_layout(); + return std::make_shared(new_args.at(0), order); +} + +bool ReshapeWithOrder::visit_attributes(AttributeVisitor& visitor) { + auto order = lowered::PortDescriptorUtils::get_port_descriptor_ptr(input(0))->get_layout(); + visitor.on_attribute("target_order", order); + return true; +} + }// namespace op }// namespace snippets }// namespace ov \ No newline at end of file diff --git a/src/common/snippets/src/op/subgraph.cpp b/src/common/snippets/src/op/subgraph.cpp index 98e3392a65e1e2..25934829b80e00 100644 --- a/src/common/snippets/src/op/subgraph.cpp +++ b/src/common/snippets/src/op/subgraph.cpp @@ -96,6 +96,7 @@ auto Subgraph::is_domain_sensitive_op(const std::shared_ptr& op) -> bo auto Subgraph::is_shape_infer_op(const std::shared_ptr& op) -> bool { return ov::is_type(op) || + ov::is_type(op) || ov::is_type(op); } diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index 06beb8db94ae3d..4ddb4c19ea5a32 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -118,7 +118,23 @@ void RuntimeConfigurator::init_data_info(const lowered::LinearIRCPtr& linear_ir) // input->shape changing ops->load PortDescriptorPtr desc = nullptr; const auto& shape_infer_seq = utils::get_first_child_shape_infer_expr_seq(param); - const auto& mem_desc_expr = shape_infer_seq.empty() ? 
param : shape_infer_seq.back(); + ExpressionPtr mem_desc_expr = param; + if (!shape_infer_seq.empty()) { + // If there is a ReshapeWithOrder, we should take its desc because it affects the shape via the target order + const auto& reordered_reshape_it = std::find_if(shape_infer_seq.cbegin(), shape_infer_seq.cend(), + [](const ExpressionPtr& expr) { + return ov::is_type(expr->get_node()); + }); + if (reordered_reshape_it != shape_infer_seq.cend()) { + const auto& reshape = *reordered_reshape_it; + const auto& etype = reshape->get_node()->get_output_element_type(0); + update_io_parameters(reshape->get_input_port_descriptor(0), etype); + continue; + } + + mem_desc_expr = shape_infer_seq.back(); + } + auto consumer_inputs = mem_desc_expr->get_output_port_connector(0)->get_consumers(); for (const auto& child_input : consumer_inputs) { const auto ma = std::dynamic_pointer_cast(child_input.get_expr()->get_node()); @@ -127,6 +143,7 @@ void RuntimeConfigurator::init_data_info(const lowered::LinearIRCPtr& linear_ir) break; } } + OPENVINO_ASSERT(desc, "Descriptor is missing!"); const auto& etype = mem_desc_expr->get_node()->get_output_element_type(0); update_io_parameters(desc, etype); } diff --git a/src/common/snippets/src/shape_inference/shape_infer_instances.cpp b/src/common/snippets/src/shape_inference/shape_infer_instances.cpp index a3e3d9652c0ac8..417996ae2a5f31 100644 --- a/src/common/snippets/src/shape_inference/shape_infer_instances.cpp +++ b/src/common/snippets/src/shape_inference/shape_infer_instances.cpp @@ -245,5 +245,16 @@ Result ReshapeShapeInfer::infer(const std::vector& input_shapes) return {{target_shape}, ShapeInferStatus::success}; } +ReshapeWithOrderShapeInfer::ReshapeWithOrderShapeInfer(const std::shared_ptr& n) { + const auto& reshape = as_type_ptr(n); + OPENVINO_ASSERT(reshape, "Invalid node passed to ReshapeWithOrderShapeInfer."); + m_target_order = lowered::PortDescriptorUtils::get_port_descriptor_ptr(reshape->input(0))->get_layout(); +} + +Result ReshapeWithOrderShapeInfer::infer(const std::vector& input_shapes) { + OPENVINO_ASSERT(input_shapes.size() == 1, "Invalid number of shapes passed to ReshapeWithOrderShapeInfer"); + return {{ov::snippets::utils::get_planar_vdims(input_shapes[0].get(), m_target_order)}, ShapeInferStatus::success}; +} + } // namespace snippets } // namespace ov diff --git a/src/common/snippets/src/shape_inference/shape_inference.cpp b/src/common/snippets/src/shape_inference/shape_inference.cpp index 76a4c491c66983..017567ea86bd55 100644 --- a/src/common/snippets/src/shape_inference/shape_inference.cpp +++ b/src/common/snippets/src/shape_inference/shape_inference.cpp @@ -58,6 +58,7 @@ const IShapeInferSnippetsFactory::TRegistry IShapeInferSnippetsFactory::registry SHAPE_INFER_PREDEFINED(op::KernelDynamic, EmptyShapeInfer), SHAPE_INFER_PREDEFINED(op::Nop, EmptyShapeInfer), SHAPE_INFER_OP_SPECIFIC_EXTERNAL(op::Reshape, ReshapeShapeInfer), + SHAPE_INFER_OP_SPECIFIC_EXTERNAL(op::ReshapeWithOrder, ReshapeWithOrderShapeInfer), SHAPE_INFER_OP_SPECIFIC_EXTERNAL(opset1::Select, SelectShapeInfer), SHAPE_INFER_OP_SPECIFIC_EXTERNAL(op::Brgemm, BrgemmShapeInfer), SHAPE_INFER_OP_SPECIFIC_EXTERNAL(op::ReduceMax, ReduceShapeInfer), diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp index 96da1fa30079a8..014cd65426e083 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp @@ -177,6
+177,7 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho jitters[snippets::op::RankNormalization::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::Reshape::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); + jitters[snippets::op::ReshapeWithOrder::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::Load::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); jitters[snippets::op::LoadReshape::get_type_info_static()] = diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 1cfc785fb26895..54ff91c3204a2f 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -932,17 +932,16 @@ void Subgraph::executeDynamicImpl(dnnl::stream strm) { } namespace { -inline void init_parallel_domain(const std::shared_ptr& snippet_config, std::vector& domain) { - const auto& master_shape = snippet_config->master_shape; - const auto& tensor_rank = snippet_config->tensor_rank; - const auto& tile_rank = snippet_config->tile_rank; +inline void init_parallel_domain(const std::vector& master_shape, size_t tensor_rank, size_t tile_rank, std::vector& domain) { domain.resize(tensor_rank, 1); - std::fill(domain.begin(), domain.end(), 1); std::copy(master_shape.cbegin(), master_shape.cbegin() + (master_shape.size() - tile_rank), domain.begin() + (tensor_rank - master_shape.size())); } +inline void init_parallel_domain(const std::shared_ptr& snippet_config, std::vector& domain) { + init_parallel_domain(snippet_config->master_shape, snippet_config->tensor_rank, snippet_config->tile_rank, domain); +} } // namespace Subgraph::SubgraphCodeGenerator::SubgraphCodeGenerator(const std::shared_ptr& snippet_attrs, @@ -1020,13 +1019,6 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr Subgraph::SubgraphExecutor::separately_repack_inputs(const dnnl::stream& strm, const std::vector& srcMemPtrs) { - auto get_batch_stride = [](const std::vector strides) { - for (size_t i = 2; i < strides.size(); ++i) - if (*(strides.rbegin() + i) != 0) // handle broadcasting pattern - return *(strides.rbegin() + i); - return (*++strides.rbegin()); - }; - auto reordered_in_ptrs = srcMemPtrs; size_t offset = m_internal_buffer_size; for (const auto& p : m_repacked_inputs) { @@ -1039,19 +1031,24 @@ std::vector Subgraph::SubgraphExecutor::separately_repack_inputs(cons const auto& src_mem = srcMemPtrs[in_idx]; const auto& dst_mem = std::make_shared(strm.get_engine(), desc, data_ptr, false); - const auto& shape = dst_mem->getShape().getDims(); - const auto batch = std::accumulate(shape.rbegin() + 2, shape.rend(), 1lu, std::multiplies()); - const auto in_stride = get_batch_stride(repacked_input.in_offsets); - const auto out_stride = get_batch_stride(repacked_input.out_offsets); - const auto* src = src_mem->getDataAs(); auto* dst = dst_mem->getDataAs(); + VectorDims dom; + const auto& shape = dst_mem->getShape().getDims(); + OPENVINO_ASSERT(shape.size() <= rank6D, "Unsupported shape rank of repacking data"); + init_parallel_domain(shape, rank6D, 2lu, dom); + + const auto in_strides = repacked_input.in_offsets; + const auto out_strides = repacked_input.out_offsets; + OPENVINO_ASSERT(in_strides.size() == rank6D && out_strides.size() == rank6D && dom.size() == rank6D, + "Unsupported shape rank of repacking data"); + const auto& executor = 
repacked_input.executor; - parallel_for(batch, [&](size_t b0) { + parallel_for4d(dom[0], dom[1], dom[2], dom[3], [&](size_t d0, size_t d1, size_t d2, size_t d3) { BrgemmCopyBKernel::call_args args; - args.src = src + b0 * in_stride + m_start_offset_in[in_idx]; - args.tr_src = dst + b0 * out_stride; + args.src = src + d0 * in_strides[0] + d1 * in_strides[1] + d2 * in_strides[2] + d3 * in_strides[3] + m_start_offset_in[in_idx]; + args.tr_src = dst + d0 * out_strides[0] + d1 * out_strides[1] + d2 * out_strides[2] + d3 * out_strides[3]; BrgemmCopyBKernelExecutor::execute(executor.get(), &args); }); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp index 939ae93ad92b18..02abb74cb7ad2f 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp @@ -10,6 +10,7 @@ #include "openvino/pass/pattern/op/wrap_type.hpp" #include "snippets/itt.hpp" #include "snippets/op/rank_normalization.hpp" +#include "snippets/op/reshape.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp" namespace ov { @@ -30,12 +31,27 @@ pass::EliminateBrgemmCopyB::EliminateBrgemmCopyB() { const auto& in_desc = snippets::lowered::PortDescriptorUtils::get_port_descriptor_ptr(copy_b_node->input(0)); const auto& layout = in_desc->get_layout(); - // TODO: - // 1. Ticket 157340: support external repacking for copyB with compensations - // 2. Ticket 157339: support external repacking for non-planar layout - if (!ov::snippets::utils::is_planar_layout(layout) || + + auto is_supported_layout = [](const std::vector& layout) { + return layout.empty() || (layout.size() - 1 == layout.back()); + }; + + // TODO [157340]: support external repacking for copyB with compensations + if (!is_supported_layout(layout) || brgemm_utils::with_compensations(copy_b_node->get_type()) || transformation_callback(copy_b_node)) return false; + + // If there is non-empty and non-planar layout, we should insert reshape to support shape inference + if (!layout.empty() && !ov::snippets::utils::is_planar_layout(layout)) { + const auto& subtensor = in_desc->get_subtensor(); + const auto& reshape = std::make_shared(copy_b_node->input_value(0), layout); + ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor(reshape->input(0), subtensor, layout); + ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor(reshape->output(0), subtensor); + ov::replace_node(copy_b_node, reshape); + return true; + } + + // If there is no layout, we can just remove BrgemmCopyB from the subgraph return ov::replace_output_update_name(copy_b_out, copy_b_node->input_value(0)); }; diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp index 1cb8263d189d18..5661f04d496cd2 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp @@ -70,8 +70,10 @@ bool pass::AdjustBrgemmCopyBLoopPorts::run(const snippets::lowered::LinearIR& li auto get_repacking_loop_idces = [](const snippets::lowered::ExpressionPtr& brgemm_expr) { // Repacking may be extracted outside the snippets 
kernel. In this case, brgemm parent expression is a // parameter. - if (is_type( - brgemm_expr->get_input_port_connector(1)->get_source().get_expr()->get_node())) + const auto& brgemm_in1 = brgemm_expr->get_input_port_connector(1)->get_source(); + const auto& shape_infer_seq = ov::snippets::utils::get_first_parent_shape_infer_expr_seq(brgemm_in1.get_expr()); + const auto source = shape_infer_seq.empty() ? brgemm_in1 : shape_infer_seq.back()->get_input_port_connector(0)->get_source(); + if (is_type(source.get_expr()->get_node())) return std::vector{}; const auto repacking_expr = brgemm_utils::repacking::get_copy_b_expr(brgemm_expr); OPENVINO_ASSERT(repacking_expr, "BrgemmCopyB expression is not found"); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp index 1941523adfb834..950f1dbaa04603 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp @@ -20,18 +20,16 @@ BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster(const ov::snipp const auto& params = linear_ir->get_parameters(); for (size_t i = 0; i < params.size(); ++i) { const auto& param = params[i]; - const auto consumers = param->get_output_port_connector(0)->get_consumers(); + const auto& shape_infer_consumers = ov::snippets::utils::get_first_child_shape_infer_expr_seq(param); + const auto& out = shape_infer_consumers.empty() ? param->get_output_port(0) : shape_infer_consumers.back()->get_output_port(0); + const auto consumers = out.get_connected_ports(); const bool brgemm_with_extracted_repacking = std::any_of(consumers.begin(), consumers.end(), [](const ov::snippets::lowered::ExpressionPort& port) { auto brgemm = ov::as_type_ptr(port.get_expr()->get_node()); return brgemm && brgemm_utils::with_repacking(brgemm->get_type()) && port.get_index() == 1; }); - if (brgemm_with_extracted_repacking) { + if (brgemm_with_extracted_repacking) m_param_idces_with_external_repacking.insert(i); - // Ticket 157339: Support non-planar layout - OPENVINO_ASSERT(ov::snippets::utils::is_planar_layout(configurator->get_io_descs()[i]->get_layout()), - "Non-planar layout is not supported for external repacking"); - } } } @@ -45,26 +43,28 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin if (shape == cpu_config->latest_shapes[i]) continue; - const auto& K = *++shape.rbegin(); - const auto& N = *shape.rbegin(); + const auto& layout = cpu_config->io_layouts[i]; + const auto planar_shape = ov::snippets::utils::get_planar_vdims(shape, layout); + const auto& K = *++planar_shape.rbegin(); + const auto& N = *planar_shape.rbegin(); const auto& precision = linear_ir.get_parameters()[i]->get_node()->get_output_element_type(0); const auto vnni_factor = brgemm_utils::compute_vnni_factor(precision); const size_t brgemm_kernel_rank = 2; // Firstly, batch dims are set - VectorDims requested_blocked_shape(shape.begin(), shape.end() - brgemm_kernel_rank); + VectorDims requested_blocked_shape(planar_shape.begin(), planar_shape.end() - brgemm_kernel_rank); // Then, the blocked dims are formed const auto new_K = snippets::utils::div_up(K, vnni_factor); const auto new_N = std::max(N, brgemm_utils::repacking::compute_inner_n_block(precision)); requested_blocked_shape.insert(requested_blocked_shape.end(), {new_K, 
new_N, vnni_factor}); - VectorDims requested_order(shape.size() - brgemm_kernel_rank); + VectorDims requested_order(planar_shape.size() - brgemm_kernel_rank); std::iota(requested_order.begin(), requested_order.end(), 0); - const auto last_idx = shape.size() - 1; + const auto last_idx = planar_shape.size() - 1; requested_order.insert(requested_order.end(), {last_idx - 1, last_idx, last_idx - 1}); const auto desc = - std::make_shared(precision, Shape(shape), requested_blocked_shape, requested_order); + std::make_shared(precision, Shape(planar_shape), requested_blocked_shape, requested_order); auto config = BrgemmCopyBKernelConfig(precision, precision, @@ -75,12 +75,8 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin const auto executor = std::make_shared( static_cast(m_configurator)->get_cache(), config); - config.update(N, - N, - K, - K, - ov::snippets::utils::get_dim_in_stride(shape, cpu_config->io_layouts[i], 1) * precision.size(), - brgemm_utils::repacking::compute_LDB(N, precision)); + const auto copy_wei_stride = ov::snippets::utils::get_dim_in_stride(shape, cpu_config->io_layouts[i], 1) * precision.size(); + config.update(N, N, K, K, copy_wei_stride, brgemm_utils::repacking::compute_LDB(N, precision)); executor->update_by_config(config); // Save original input offsets for input before repacking. From f1c7435486acccf9949b385cbf394a33b0693d44 Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Thu, 26 Dec 2024 09:05:45 +0100 Subject: [PATCH 5/7] [Snippets][CPU] Applied Ivan comments --- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 13 ++++++++----- src/plugins/intel_cpu/src/nodes/subgraph.h | 7 +++---- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 54ff91c3204a2f..367bca6210ed9f 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -1000,8 +1000,9 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr& indexes, int ithr, jit_snippets_call_args& call_args) { + size_t repacked_offset_idx = 0; for (const auto& p : m_repacked_inputs) { const auto& in_idx = p.first; const auto& repacked_in = p.second; @@ -1077,17 +1079,18 @@ void Subgraph::SubgraphExecutor::in_parallel_repack_inputs(const std::vectorgetDataAs() + src_offset; args.tr_src = repacked_ptr; BrgemmCopyBKernelExecutor::execute(repacked_in.executor.get(), &args); - offsets.insert(src_offset); + last_processed_src_offset = src_offset; } call_args.src_ptrs[in_idx] = repacked_ptr; + ++repacked_offset_idx; } } #endif // OPENVINO_ARCH_X86_64 diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index 74f84b11564989..ddbe0c51ee1f16 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -196,9 +196,8 @@ class Subgraph::SubgraphExecutor { OPENVINO_THROW("External buffer pointer has not been found"); } - // [ Input index -> set of src offsets which are already repacked ] - using RepackedSrcOffsets = std::unordered_map>; - std::unordered_map m_repacked_offsets_by_threads = {}; + // [ Thread Index -> Index of input with repacking data -> last repacked src_offset ] + std::vector> m_repacked_offsets_by_threads = {}; std::unordered_map m_repacked_inputs = {}; inline bool should_repacking_be_separately() const { @@ -209,7 +208,7 @@ class Subgraph::SubgraphExecutor { } inline void clean_repacked_offsets(size_t ithr) { if
(should_repacking_be_in_parallel()) - m_repacked_offsets_by_threads.at(ithr).clear(); + m_repacked_offsets_by_threads[ithr].assign(m_repacked_inputs.size(), std::numeric_limits::max()); } private: From b6ffdaf92054691468e2fd7b52ae1dca66548f1c Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Thu, 26 Dec 2024 09:12:25 +0100 Subject: [PATCH 6/7] [Snippets][CPU] Fixed code style --- .../src/emitters/snippets/x64/cpu_generator.cpp | 3 ++- src/plugins/intel_cpu/src/nodes/subgraph.cpp | 9 ++++++--- .../snippets/x64/pass/eliminate_brgemm_copy_b.cpp | 7 ++++--- .../pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp | 3 ++- .../x64/pass/lowered/external_repacking_adjuster.cpp | 12 ++++++++---- 5 files changed, 22 insertions(+), 12 deletions(-) diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp index 014cd65426e083..7835f17adb97be 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/cpu_generator.cpp @@ -177,7 +177,8 @@ intel_cpu::CPUTargetMachine::CPUTargetMachine(dnnl::impl::cpu::x64::cpu_isa_t ho jitters[snippets::op::RankNormalization::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::Reshape::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); - jitters[snippets::op::ReshapeWithOrder::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); + jitters[snippets::op::ReshapeWithOrder::get_type_info_static()] = + CREATE_SNIPPETS_EMITTER(intel_cpu::jit_nop_emitter); jitters[snippets::op::Load::get_type_info_static()] = CREATE_SNIPPETS_EMITTER(intel_cpu::jit_load_memory_emitter); jitters[snippets::op::LoadReshape::get_type_info_static()] = diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 367bca6210ed9f..0d84d707318248 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -932,7 +932,10 @@ void Subgraph::executeDynamicImpl(dnnl::stream strm) { } namespace { -inline void init_parallel_domain(const std::vector& master_shape, size_t tensor_rank, size_t tile_rank, std::vector& domain) { +inline void init_parallel_domain(const std::vector& master_shape, + size_t tensor_rank, + size_t tile_rank, + std::vector& domain) { domain.resize(tensor_rank, 1); std::fill(domain.begin(), domain.end(), 1); std::copy(master_shape.cbegin(), @@ -1032,7 +1035,7 @@ std::vector Subgraph::SubgraphExecutor::separately_repack_inputs(cons const auto& src_mem = srcMemPtrs[in_idx]; const auto& dst_mem = std::make_shared(strm.get_engine(), desc, data_ptr, false); - const auto* src = src_mem->getDataAs(); + const auto* src = src_mem->getDataAs() + m_start_offset_in[in_idx]; auto* dst = dst_mem->getDataAs(); VectorDims dom; @@ -1048,7 +1051,7 @@ std::vector Subgraph::SubgraphExecutor::separately_repack_inputs(cons const auto& executor = repacked_input.executor; parallel_for4d(dom[0], dom[1], dom[2], dom[3], [&](size_t d0, size_t d1, size_t d2, size_t d3) { BrgemmCopyBKernel::call_args args; - args.src = src + d0 * in_strides[0] + d1 * in_strides[1] + d2 * in_strides[2] + d3 * in_strides[3] + m_start_offset_in[in_idx]; + args.src = src + d0 * in_strides[0] + d1 * in_strides[1] + d2 * in_strides[2] + d3 * in_strides[3]; args.tr_src = dst + d0 * out_strides[0] + d1 * out_strides[1] + d2 * out_strides[2] + d3 * out_strides[3]; 
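            // One kernel call per batch point (d0..d3): in_strides/out_strides hold byte offsets,
            // so args.src/args.tr_src address the 2D weights tile repacked by this call.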
BrgemmCopyBKernelExecutor::execute(executor.get(), &args); }); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp index 02abb74cb7ad2f..6176e99ebc3a9a 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/eliminate_brgemm_copy_b.cpp @@ -37,14 +37,15 @@ pass::EliminateBrgemmCopyB::EliminateBrgemmCopyB() { }; // TODO [157340]: support external repacking for copyB with compensations - if (!is_supported_layout(layout) || - brgemm_utils::with_compensations(copy_b_node->get_type()) || transformation_callback(copy_b_node)) + if (!is_supported_layout(layout) || brgemm_utils::with_compensations(copy_b_node->get_type()) || + transformation_callback(copy_b_node)) return false; // If there is non-empty and non-planar layout, we should insert reshape to support shape inference if (!layout.empty() && !ov::snippets::utils::is_planar_layout(layout)) { const auto& subtensor = in_desc->get_subtensor(); - const auto& reshape = std::make_shared(copy_b_node->input_value(0), layout); + const auto& reshape = + std::make_shared(copy_b_node->input_value(0), layout); ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor(reshape->input(0), subtensor, layout); ov::snippets::lowered::PortDescriptorUtils::set_port_descriptor(reshape->output(0), subtensor); ov::replace_node(copy_b_node, reshape); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp index 5661f04d496cd2..16df97bb209ed9 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/adjust_brgemm_copy_b_loop_ports.cpp @@ -72,7 +72,8 @@ bool pass::AdjustBrgemmCopyBLoopPorts::run(const snippets::lowered::LinearIR& li // parameter. const auto& brgemm_in1 = brgemm_expr->get_input_port_connector(1)->get_source(); const auto& shape_infer_seq = ov::snippets::utils::get_first_parent_shape_infer_expr_seq(brgemm_in1.get_expr()); - const auto source = shape_infer_seq.empty() ? brgemm_in1 : shape_infer_seq.back()->get_input_port_connector(0)->get_source(); + const auto source = + shape_infer_seq.empty() ? brgemm_in1 : shape_infer_seq.back()->get_input_port_connector(0)->get_source(); if (is_type(source.get_expr()->get_node())) return std::vector{}; const auto repacking_expr = brgemm_utils::repacking::get_copy_b_expr(brgemm_expr); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp index 950f1dbaa04603..430e6b655a55d5 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp @@ -21,7 +21,8 @@ BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster(const ov::snipp for (size_t i = 0; i < params.size(); ++i) { const auto& param = params[i]; const auto& shape_infer_consumers = ov::snippets::utils::get_first_child_shape_infer_expr_seq(param); - const auto& out = shape_infer_consumers.empty() ? 
param->get_output_port(0) : shape_infer_consumers.back()->get_output_port(0); + const auto& out = shape_infer_consumers.empty() ? param->get_output_port(0) + : shape_infer_consumers.back()->get_output_port(0); const auto consumers = out.get_connected_ports(); const bool brgemm_with_extracted_repacking = std::any_of(consumers.begin(), consumers.end(), [](const ov::snippets::lowered::ExpressionPort& port) { @@ -63,8 +64,10 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin const auto last_idx = planar_shape.size() - 1; requested_order.insert(requested_order.end(), {last_idx - 1, last_idx, last_idx - 1}); - const auto desc = - std::make_shared(precision, Shape(planar_shape), requested_blocked_shape, requested_order); + const auto desc = std::make_shared(precision, + Shape(planar_shape), + requested_blocked_shape, + requested_order); auto config = BrgemmCopyBKernelConfig(precision, precision, @@ -75,7 +78,8 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin const auto executor = std::make_shared( static_cast(m_configurator)->get_cache(), config); - const auto copy_wei_stride = ov::snippets::utils::get_dim_in_stride(shape, cpu_config->io_layouts[i], 1) * precision.size(); + const auto copy_wei_stride = + ov::snippets::utils::get_dim_in_stride(shape, cpu_config->io_layouts[i], 1) * precision.size(); config.update(N, N, K, K, copy_wei_stride, brgemm_utils::repacking::compute_LDB(N, precision)); executor->update_by_config(config); From cced16d147c8de0fadaae82beec2427a47c66077 Mon Sep 17 00:00:00 2001 From: Alexandra Sidorova Date: Thu, 26 Dec 2024 16:28:45 +0400 Subject: [PATCH 7/7] [Snippets][CPU] Fixed prim isa --- .../snippets/cpu_runtime_configurator.cpp | 32 +++++++++++++ .../snippets/cpu_runtime_configurator.hpp | 28 ++++++------ src/plugins/intel_cpu/src/nodes/subgraph.cpp | 18 ++++---- src/plugins/intel_cpu/src/nodes/subgraph.h | 2 +- .../lowered/external_repacking_adjuster.cpp | 45 ++++++++++--------- .../lowered/external_repacking_adjuster.hpp | 4 +- 6 files changed, 84 insertions(+), 45 deletions(-) diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp index 43b3ea14cc148a..0971e9e69a661f 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.cpp @@ -39,6 +39,38 @@ std::string CPURuntimeConfig::to_string() const { } #endif +#ifndef OPENVINO_ARCH_ARM64 + +CPURuntimeConfig::RepackedInput::RepackedInput(std::shared_ptr kernel, + CpuBlockedMemoryDescPtr desc, + VectorDims in_offsets, + VectorDims out_offsets) + : m_kernel(std::move(kernel)), + m_desc(std::move(desc)), + m_in_offsets(std::move(in_offsets)), + m_out_offsets(std::move(out_offsets)) { + OPENVINO_ASSERT(m_in_offsets.size() == m_out_offsets.size(), "Incorrect size of offsets"); + OPENVINO_ASSERT(m_desc, "Descriptor is empty"); +} + +const CpuBlockedMemoryDescPtr& CPURuntimeConfig::RepackedInput::desc() const { + return m_desc; +} + +const std::shared_ptr& CPURuntimeConfig::RepackedInput::kernel() const { + return m_kernel; +} + +const VectorDims& CPURuntimeConfig::RepackedInput::in_offsets() const { + return m_in_offsets; +} + +const VectorDims& CPURuntimeConfig::RepackedInput::out_offsets() const { + return m_out_offsets; +} + +#endif // OPENVINO_ARCH_ARM64 + CPURuntimeConfigurator::CPURuntimeConfigurator(ov::intel_cpu::MultiCacheWeakPtr cache) : 
ov::snippets::RuntimeConfigurator(std::make_shared()), compiled_kernel_cache(std::move(cache)) {} diff --git a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp index 513ff65fee912b..abec42bbbe0abb 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/cpu_runtime_configurator.hpp @@ -30,19 +30,21 @@ class CPURuntimeConfig : public ov::snippets::RuntimeConfig { #ifndef OPENVINO_ARCH_ARM64 struct RepackedInput { RepackedInput() = default; - RepackedInput(CpuBlockedMemoryDescPtr desc_, - std::shared_ptr executor_, - VectorDims in_offsets_, - VectorDims out_offsets_) - : desc(std::move(desc_)), - executor(std::move(executor_)), - in_offsets(std::move(in_offsets_)), - out_offsets(std::move(out_offsets_)) {} - - CpuBlockedMemoryDescPtr desc{nullptr}; - std::shared_ptr executor{nullptr}; - VectorDims in_offsets{}; - VectorDims out_offsets{}; + RepackedInput(std::shared_ptr kernel, + CpuBlockedMemoryDescPtr desc, + VectorDims in_offsets, + VectorDims out_offsets); + + const std::shared_ptr& kernel() const; + const CpuBlockedMemoryDescPtr& desc() const; + const VectorDims& in_offsets() const; + const VectorDims& out_offsets() const; + + private: + std::shared_ptr m_kernel{nullptr}; + CpuBlockedMemoryDescPtr m_desc{nullptr}; + VectorDims m_in_offsets{}; + VectorDims m_out_offsets{}; }; std::unordered_map repacked_inputs = {}; diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index 0d84d707318248..fb657263fc3161 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -993,7 +993,7 @@ Subgraph::SubgraphExecutor::SubgraphExecutor(const std::shared_ptr& p) { - return sum + p.second.desc->getCurrentMemSize(); + return sum + p.second.desc()->getCurrentMemSize(); }); if (should_repacking_be_in_parallel()) { @@ -1028,7 +1028,7 @@ std::vector Subgraph::SubgraphExecutor::separately_repack_inputs(cons for (const auto& p : m_repacked_inputs) { const auto in_idx = p.first; const auto& repacked_input = p.second; - const auto& desc = repacked_input.desc; + const auto& desc = repacked_input.desc(); const void* data_ptr = m_buffer_scratchpad->getDataAs() + offset; OPENVINO_ASSERT(in_idx < srcMemPtrs.size(), "Incorrect index of input repacked mem ptr"); @@ -1043,17 +1043,17 @@ std::vector Subgraph::SubgraphExecutor::separately_repack_inputs(cons OPENVINO_ASSERT(shape.size() <= rank6D, "Unsupported shape rank of repacking data"); init_parallel_domain(shape, rank6D, 2lu, dom); - const auto in_strides = repacked_input.in_offsets; - const auto out_strides = repacked_input.out_offsets; + const auto& in_strides = repacked_input.in_offsets(); + const auto& out_strides = repacked_input.out_offsets(); OPENVINO_ASSERT(in_strides.size() == rank6D && out_strides.size() == rank6D && dom.size() == rank6D, "Unsupported shape rank of repacking data"); - const auto& executor = repacked_input.executor; + const auto& kernel = repacked_input.kernel(); parallel_for4d(dom[0], dom[1], dom[2], dom[3], [&](size_t d0, size_t d1, size_t d2, size_t d3) { BrgemmCopyBKernel::call_args args; args.src = src + d0 * in_strides[0] + d1 * in_strides[1] + d2 * in_strides[2] + d3 * in_strides[3]; args.tr_src = dst + d0 * out_strides[0] + d1 * out_strides[1] + d2 * out_strides[2] + d3 * out_strides[3]; - BrgemmCopyBKernelExecutor::execute(executor.get(), &args); + 
(*kernel)(&args); }); reordered_in_ptrs[in_idx] = dst_mem; @@ -1071,8 +1071,8 @@ void Subgraph::SubgraphExecutor::in_parallel_repack_inputs(const std::vectorgetDataAs() + src_offset; args.tr_src = repacked_ptr; - BrgemmCopyBKernelExecutor::execute(repacked_in.executor.get(), &args); + (*repacked_in.kernel())(&args); last_processed_src_offset = src_offset; } diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.h b/src/plugins/intel_cpu/src/nodes/subgraph.h index ddbe0c51ee1f16..b7864ce539371b 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.h +++ b/src/plugins/intel_cpu/src/nodes/subgraph.h @@ -186,7 +186,7 @@ class Subgraph::SubgraphExecutor { uint8_t* data_ptr = m_buffer_scratchpad->getDataAs() + m_internal_buffer_size; for (const auto& p : m_repacked_inputs) { - const auto& desc = p.second.desc; + const auto& desc = p.second.desc(); const auto size = desc->getCurrentMemSize(); if (p.first == idx) { return data_ptr + ithr * size; diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp index 430e6b655a55d5..8555ec1e958048 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.cpp @@ -24,13 +24,20 @@ BrgemmExternalRepackingAdjuster::BrgemmExternalRepackingAdjuster(const ov::snipp const auto& out = shape_infer_consumers.empty() ? param->get_output_port(0) : shape_infer_consumers.back()->get_output_port(0); const auto consumers = out.get_connected_ports(); - const bool brgemm_with_extracted_repacking = - std::any_of(consumers.begin(), consumers.end(), [](const ov::snippets::lowered::ExpressionPort& port) { - auto brgemm = ov::as_type_ptr(port.get_expr()->get_node()); - return brgemm && brgemm_utils::with_repacking(brgemm->get_type()) && port.get_index() == 1; - }); - if (brgemm_with_extracted_repacking) - m_param_idces_with_external_repacking.insert(i); + + for (const auto& consumer : consumers) { + auto brgemm = ov::as_type_ptr(consumer.get_expr()->get_node()); + if (brgemm && brgemm_utils::with_repacking(brgemm->get_type()) && consumer.get_index() == 1) { + const auto src_prc = brgemm->get_input_element_type(0); + const auto wei_prc = brgemm->get_input_element_type(1); + const auto isa = brgemm_utils::get_primitive_isa(src_prc, brgemm_utils::with_amx(brgemm->get_type())); + const auto inner_n_block = brgemm_utils::repacking::compute_inner_n_block(wei_prc); + auto config = BrgemmCopyBKernelConfig(src_prc, wei_prc, isa, false, false, inner_n_block); + m_executors[i] = std::make_shared( + static_cast(m_configurator)->get_cache(), + config); + } + } } } @@ -39,7 +46,8 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin const auto& cpu_config = ov::as_type_ptr(m_configurator->get_config()); size_t data_size = 0; - for (const auto& i : m_param_idces_with_external_repacking) { + for (const auto& p : m_executors) { + const auto& i = p.first; const auto& shape = cpu_config->io_shapes[i]; if (shape == cpu_config->latest_shapes[i]) continue; @@ -49,6 +57,7 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin const auto& K = *++planar_shape.rbegin(); const auto& N = *planar_shape.rbegin(); + // Create CPU Memory descriptor const auto& precision = linear_ir.get_parameters()[i]->get_node()->get_output_element_type(0); const 
auto vnni_factor = brgemm_utils::compute_vnni_factor(precision); const size_t brgemm_kernel_rank = 2; @@ -69,19 +78,14 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin requested_blocked_shape, requested_order); - auto config = BrgemmCopyBKernelConfig(precision, - precision, - dnnl::impl::cpu::x64::cpu_isa_t::avx512_core_amx, - false, - false, - brgemm_utils::repacking::compute_inner_n_block(precision)); - const auto executor = std::make_shared( - static_cast(m_configurator)->get_cache(), - config); + // Create Kernel using BrgemmCopyBExecutor + const auto& executor = p.second; const auto copy_wei_stride = ov::snippets::utils::get_dim_in_stride(shape, cpu_config->io_layouts[i], 1) * precision.size(); - config.update(N, N, K, K, copy_wei_stride, brgemm_utils::repacking::compute_LDB(N, precision)); - executor->update_by_config(config); + const auto generic_config = executor->get_config().get_clone_ptr(); + auto config = static_cast(generic_config.get()); + config->update(N, N, K, K, copy_wei_stride, brgemm_utils::repacking::compute_LDB(N, precision)); + executor->update_by_config(*config); // Save original input offsets for input before repacking. const auto in_offsets = cpu_config->io_data_offsets[i]; @@ -92,7 +96,8 @@ bool BrgemmExternalRepackingAdjuster::run(const snippets::lowered::LinearIR& lin // Save new input offsets for input after repacking. const auto out_offsets = cpu_config->io_data_offsets[i]; - cpu_config->repacked_inputs[i] = CPURuntimeConfig::RepackedInput(desc, executor, in_offsets, out_offsets); + cpu_config->repacked_inputs[i] = + CPURuntimeConfig::RepackedInput(executor->get_kernel(), desc, in_offsets, out_offsets); // src data + dst data per kernel call data_size += N * K * precision.size() + new_N * new_K * vnni_factor * precision.size(); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp index 4d0c9586f3be31..6f4e3942b1f581 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/external_repacking_adjuster.hpp @@ -24,11 +24,11 @@ class BrgemmExternalRepackingAdjuster : public ov::snippets::lowered::pass::Runt bool run(const snippets::lowered::LinearIR& linear_ir) override; bool applicable() const override { - return !m_param_idces_with_external_repacking.empty(); + return !m_executors.empty(); } private: - std::set m_param_idces_with_external_repacking; + std::unordered_map> m_executors; }; } // namespace intel_cpu
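For reference, the requested blocked shape and order built in BrgemmExternalRepackingAdjuster::run() above can be reproduced in isolation. The following is a minimal standalone sketch, not plugin code: div_up and make_requested_repacking are local helpers introduced here for illustration, and the vnni_factor/inner_n_block values in main() are assumed rather than queried via brgemm_utils.

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <numeric>
    #include <utility>
    #include <vector>

    using VectorDims = std::vector<size_t>;

    // Ceiling division, matching the assumed semantics of snippets::utils::div_up.
    static size_t div_up(size_t x, size_t y) {
        return (x + y - 1) / y;
    }

    // Sketch of the requested blocked shape/order construction: batch dims are kept as-is,
    // then {K / vnni, max(N, inner_n_block), vnni} is appended, and the order repeats the
    // K index to express the VNNI inner dimension.
    static std::pair<VectorDims, VectorDims> make_requested_repacking(const VectorDims& planar_shape,
                                                                      size_t vnni_factor,
                                                                      size_t inner_n_block) {
        const size_t rank = planar_shape.size();
        const size_t K = planar_shape[rank - 2];
        const size_t N = planar_shape[rank - 1];

        // Firstly, batch dims are set; then the blocked dims are formed.
        VectorDims blocked_shape(planar_shape.begin(), planar_shape.end() - 2);
        blocked_shape.insert(blocked_shape.end(),
                             {div_up(K, vnni_factor), std::max(N, inner_n_block), vnni_factor});

        // Identity order over batch dims, then {K, N, K-inner}.
        VectorDims order(rank - 2);
        std::iota(order.begin(), order.end(), 0);
        const size_t last_idx = rank - 1;
        order.insert(order.end(), {last_idx - 1, last_idx, last_idx - 1});
        return std::make_pair(blocked_shape, order);
    }

    int main() {
        // Assumed bf16-like parameters: vnni_factor = 2, inner N block = 32.
        const auto result = make_requested_repacking({1, 4, 70, 17}, 2, 32);
        std::cout << "shape:";
        for (size_t d : result.first) std::cout << ' ' << d;   // shape: 1 4 35 32 2
        std::cout << "\norder:";
        for (size_t d : result.second) std::cout << ' ' << d;  // order: 0 1 2 3 2
        std::cout << '\n';
        return 0;
    }

For a planar {1, 4, 70, 17} weights shape with vnni_factor = 2 and inner_n_block = 32, this yields blocked shape {1, 4, 35, 32, 2} and order {0, 1, 2, 3, 2}; the repeated last_idx - 1 in the order is what encodes the VNNI inner dimension in the requested layout.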