Skip to content

Commit

Permalink
Switch to two-input scheme
Browse files Browse the repository at this point in the history
  • Loading branch information
vshampor committed Oct 15, 2024
1 parent 5dce2ad commit 67aa2a0
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 22 deletions.
2 changes: 1 addition & 1 deletion src/cpp/src/continuous_batching_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -405,9 +405,9 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::maybe_evict_cache_block
cache_eviction_algo.register_new_token_scores(attention_scores_for_all_decoder_layers);
auto logical_blocks_to_evict = cache_eviction_algo.evict_logical_blocks();

size_t num_blocks_before_eviction = m_scheduler->get_block_tables(seq_id)[0].size();

for (size_t i = 0; i < logical_blocks_to_evict.size(); i++) {
size_t num_blocks_before_eviction = m_scheduler->get_block_tables(seq_id)[i].size();
auto rotation_multipliers =
m_cache_rotation_calculator->get_rotation_multipliers(logical_blocks_to_evict[i],
num_blocks_before_eviction);
Expand Down
67 changes: 46 additions & 21 deletions src/cpp/src/model_runner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ class ModelRunner {
m_request.set_tensor("past_lens", past_lens);
m_request.set_tensor("subsequence_begins", subsequence_begins);

_set_block_indices(m_request, sequence_groups, scheduler_output, total_num_blocks);
_set_block_indices(sequence_groups, scheduler_output, total_num_blocks);

if (!m_cache_rotation_coefficients.empty()) {
_set_cache_rotation_coefficients();
Expand Down Expand Up @@ -206,22 +206,12 @@ class ModelRunner {
}

private:
void _set_block_indices(ov::InferRequest& infer_request, const std::vector<SequenceGroup::Ptr> & sequence_groups, const Scheduler::Output& scheduler_output,
size_t total_num_blocks) {
void _fill_indices_from_block_tables(const std::vector<std::string>& dst_tensor_names,
const std::vector<SequenceGroup::Ptr> & sequence_groups,
const Scheduler::Output& scheduler_output,
const std::vector<size_t>& fill_n_last_vec) {
OPENVINO_ASSERT(fill_n_last_vec.size() == dst_tensor_names.size() || fill_n_last_vec.empty());
size_t num_sequence_groups = scheduler_output.m_scheduled_sequence_groups_ids.size();
std::vector<std::string> tensor_names = {"block_indices"};

if (m_scheduler_config.use_cache_eviction) {
tensor_names.resize(m_num_decoder_layers);
for (size_t i = 0; i < tensor_names.size(); i++) {
tensor_names[i] = std::string("block_indices.") + std::to_string(i);
}
}

for (auto& name : tensor_names) {
m_request.get_tensor(name).set_shape({total_num_blocks});
}

size_t block_offset = 0;
for (size_t i = 0; i < num_sequence_groups; ++i) {
size_t seq_group_id = scheduler_output.m_scheduled_sequence_groups_ids[i];
Expand All @@ -235,25 +225,60 @@ class ModelRunner {
size_t num_blocks = (sequence_group->get_context_len() - sequence_group->get_num_evicted_tokens() + m_scheduler_config.block_size - 1) / m_scheduler_config.block_size;
const auto & kv_blocks = scheduler_output.m_block_tables.at(sequence->get_id());

for (size_t layer_idx = 0; layer_idx < tensor_names.size(); layer_idx++) {
auto input_tensor = infer_request.get_tensor(tensor_names[layer_idx]);
for (size_t layer_idx = 0; layer_idx < dst_tensor_names.size(); layer_idx++) {
size_t fill_n_last = num_blocks;
if (!fill_n_last_vec.empty()) {
fill_n_last = fill_n_last_vec[layer_idx];
}
OPENVINO_ASSERT(num_blocks >= fill_n_last);
size_t starting_offset = num_blocks - fill_n_last;
auto input_tensor = m_request.get_tensor(dst_tensor_names[layer_idx]);
auto block_indices_data = input_tensor.data<int32_t>() + block_offset;
for (size_t block_id = 0; block_id < num_blocks; ++block_id)
for (size_t block_id = 0; block_id < fill_n_last; ++block_id)
// In case no cache eviction is requested, all per-layer block tables are expected to be identical
// at all times
block_indices_data[block_id] = kv_blocks[layer_idx][block_id]->get_index();
block_indices_data[block_id] = kv_blocks[layer_idx][starting_offset + block_id]->get_index();
}

block_offset += num_blocks;
}
}
}
// Resizes and fills the "block_indices" model input(s) from the scheduler's
// per-sequence block tables.
//
// With cache eviction enabled the model exposes one input per decoder layer
// ("block_indices.<layer>"), since per-layer block tables may diverge;
// otherwise a single shared "block_indices" input is used.
void _set_block_indices(const std::vector<SequenceGroup::Ptr> & sequence_groups, const Scheduler::Output& scheduler_output,
                        size_t total_num_blocks) {
    std::vector<std::string> dst_names;
    if (m_scheduler_config.use_cache_eviction) {
        dst_names.reserve(m_num_decoder_layers);
        for (size_t layer = 0; layer < m_num_decoder_layers; ++layer) {
            dst_names.push_back(std::string("block_indices.") + std::to_string(layer));
        }
    } else {
        dst_names.emplace_back("block_indices");
    }

    // Each destination tensor holds one int32 index per scheduled block.
    for (const auto& name : dst_names) {
        m_request.get_tensor(name).set_shape({total_num_blocks});
    }

    // Empty fill_n_last_vec => copy each sequence's full block table.
    _fill_indices_from_block_tables(dst_names, sequence_groups, scheduler_output, {});
}

// Binds the precomputed per-layer cache rotation coefficient tensors to the
// model and fills the matching "rotated_block_indices.<layer>" inputs from the
// tail of each sequence's block table.
//
// The number of rotated blocks per layer is derived from the coefficient
// tensor size (presumably block_size coefficients per rotated block — the
// precomputation in m_cache_rotation_coefficients is not visible here).
void _set_cache_rotation_coefficients(const std::vector<SequenceGroup::Ptr> & sequence_groups, const Scheduler::Output& scheduler_output) {
    std::vector<std::string> rotated_indices_names;
    std::vector<size_t> rotated_sizes_in_blocks;
    rotated_indices_names.reserve(m_num_decoder_layers);
    rotated_sizes_in_blocks.reserve(m_num_decoder_layers);

    for (size_t layer = 0; layer < m_num_decoder_layers; ++layer) {
        m_request.set_tensor(std::string("cache_rotation_coefficients.") + std::to_string(layer),
                             m_cache_rotation_coefficients[layer]);

        std::string indices_name = std::string("rotated_block_indices.") + std::to_string(layer);
        size_t num_rotated_blocks = m_cache_rotation_coefficients[layer].get_size() / m_scheduler_config.block_size;
        m_request.get_tensor(indices_name).set_shape({num_rotated_blocks});

        rotated_indices_names.push_back(std::move(indices_name));
        rotated_sizes_in_blocks.push_back(num_rotated_blocks);
    }

    // Only the last `rotated_sizes_in_blocks[layer]` blocks of each table are
    // written into the corresponding rotated-indices tensor.
    _fill_indices_from_block_tables(rotated_indices_names, sequence_groups, scheduler_output, rotated_sizes_in_blocks);
}

void _collect_attention_scores(const std::vector<SequenceGroup::Ptr> & sequence_groups, const Scheduler::Output& scheduler_output) {
Expand Down

0 comments on commit 67aa2a0

Please sign in to comment.