diff --git a/src/cpp/src/continuous_batching_impl.cpp b/src/cpp/src/continuous_batching_impl.cpp
index 74dbf26ec2..316ea90e50 100644
--- a/src/cpp/src/continuous_batching_impl.cpp
+++ b/src/cpp/src/continuous_batching_impl.cpp
@@ -405,9 +405,9 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::maybe_evict_cache_block
         cache_eviction_algo.register_new_token_scores(attention_scores_for_all_decoder_layers);
         auto logical_blocks_to_evict = cache_eviction_algo.evict_logical_blocks();
 
-        size_t num_blocks_before_eviction = m_scheduler->get_block_tables(seq_id)[0].size();
         for (size_t i = 0; i < logical_blocks_to_evict.size(); i++) {
+            size_t num_blocks_before_eviction = m_scheduler->get_block_tables(seq_id)[i].size();
             auto rotation_multipliers = m_cache_rotation_calculator->get_rotation_multipliers(
                 logical_blocks_to_evict[i],
                 num_blocks_before_eviction);
 
diff --git a/src/cpp/src/model_runner.hpp b/src/cpp/src/model_runner.hpp
index ac33c43422..68254844f4 100644
--- a/src/cpp/src/model_runner.hpp
+++ b/src/cpp/src/model_runner.hpp
@@ -170,8 +170,8 @@ class ModelRunner {
         m_request.set_tensor("past_lens", past_lens);
         m_request.set_tensor("subsequence_begins", subsequence_begins);
 
-        _set_block_indices(m_request, sequence_groups, scheduler_output, total_num_blocks);
+        _set_block_indices(sequence_groups, scheduler_output, total_num_blocks);
 
         if (!m_cache_rotation_coefficients.empty()) {
-            _set_cache_rotation_coefficients();
+            _set_cache_rotation_coefficients(sequence_groups, scheduler_output);
         }
@@ -206,22 +206,12 @@ class ModelRunner {
     }
 
 private:
-    void _set_block_indices(ov::InferRequest& infer_request, const std::vector<SequenceGroup::Ptr>& sequence_groups, const Scheduler::Output& scheduler_output,
-                            size_t total_num_blocks) {
+    void _fill_indices_from_block_tables(const std::vector<std::string>& dst_tensor_names,
+                                         const std::vector<SequenceGroup::Ptr>& sequence_groups,
+                                         const Scheduler::Output& scheduler_output,
+                                         const std::vector<size_t>& fill_n_last_vec) {
+        OPENVINO_ASSERT(fill_n_last_vec.size() == dst_tensor_names.size() || fill_n_last_vec.empty());
         size_t num_sequence_groups = scheduler_output.m_scheduled_sequence_groups_ids.size();
-        std::vector<std::string> tensor_names = {"block_indices"};
-
-        if (m_scheduler_config.use_cache_eviction) {
-            tensor_names.resize(m_num_decoder_layers);
-            for (size_t i = 0; i < tensor_names.size(); i++) {
-                tensor_names[i] = std::string("block_indices.") + std::to_string(i);
-            }
-        }
-
-        for (auto& name : tensor_names) {
-            m_request.get_tensor(name).set_shape({total_num_blocks});
-        }
-
         size_t block_offset = 0;
         for (size_t i = 0; i < num_sequence_groups; ++i) {
             size_t seq_group_id = scheduler_output.m_scheduled_sequence_groups_ids[i];
@@ -235,25 +225,60 @@ class ModelRunner {
                 size_t num_blocks = (sequence_group->get_context_len() - sequence_group->get_num_evicted_tokens() +
                                      m_scheduler_config.block_size - 1) / m_scheduler_config.block_size;
                 const auto& kv_blocks = scheduler_output.m_block_tables.at(sequence->get_id());
-                for (size_t layer_idx = 0; layer_idx < tensor_names.size(); layer_idx++) {
-                    auto input_tensor = infer_request.get_tensor(tensor_names[layer_idx]);
+                for (size_t layer_idx = 0; layer_idx < dst_tensor_names.size(); layer_idx++) {
+                    size_t fill_n_last = num_blocks;
+                    if (!fill_n_last_vec.empty()) {
+                        fill_n_last = fill_n_last_vec[layer_idx];
+                    }
+                    OPENVINO_ASSERT(num_blocks >= fill_n_last);
+                    size_t starting_offset = num_blocks - fill_n_last;
+                    auto input_tensor = m_request.get_tensor(dst_tensor_names[layer_idx]);
                     auto block_indices_data = input_tensor.data<int32_t>() + block_offset;
-                    for (size_t block_id = 0; block_id < num_blocks; ++block_id)
+                    for (size_t block_id = 0; block_id < fill_n_last; ++block_id)
                         // In case no cache eviction is requested, all per-layer block tables are expected to be identical
                         // at all times
-                        block_indices_data[block_id] = kv_blocks[layer_idx][block_id]->get_index();
+                        block_indices_data[block_id] = kv_blocks[layer_idx][starting_offset + block_id]->get_index();
                 }
                 block_offset += num_blocks;
             }
         }
     }
+
+    void _set_block_indices(const std::vector<SequenceGroup::Ptr>& sequence_groups, const Scheduler::Output& scheduler_output,
+                            size_t total_num_blocks) {
+        std::vector<std::string> tensor_names = {"block_indices"};
 
-    void _set_cache_rotation_coefficients() {
+        if (m_scheduler_config.use_cache_eviction) {
+            tensor_names.resize(m_num_decoder_layers);
+            for (size_t i = 0; i < tensor_names.size(); i++) {
+                tensor_names[i] = std::string("block_indices.") + std::to_string(i);
+            }
+        }
+
+        for (auto& name : tensor_names) {
+            m_request.get_tensor(name).set_shape({total_num_blocks});
+        }
+
+        _fill_indices_from_block_tables(tensor_names, sequence_groups, scheduler_output, {});
+    }
+
+    void _set_cache_rotation_coefficients(const std::vector<SequenceGroup::Ptr>& sequence_groups, const Scheduler::Output& scheduler_output) {
         for (size_t i = 0; i < m_num_decoder_layers; i++) {
             auto tensor_name = std::string("cache_rotation_coefficients.") + std::to_string(i);
             m_request.set_tensor(tensor_name, m_cache_rotation_coefficients[i]);
         }
+
+        std::vector<std::string> rotation_indices_tensor_names(m_num_decoder_layers);
+        std::vector<size_t> rotation_indices_sizes_in_blocks(m_num_decoder_layers);
+        for (size_t i = 0; i < m_num_decoder_layers; i++) {
+            auto tensor_name = std::string("rotated_block_indices.") + std::to_string(i);
+            rotation_indices_tensor_names[i] = tensor_name;
+            size_t size_in_blocks = m_cache_rotation_coefficients[i].get_size() / m_scheduler_config.block_size;
+            m_request.get_tensor(tensor_name).set_shape({size_in_blocks});
+            rotation_indices_sizes_in_blocks[i] = size_in_blocks;
+        }
+
+        _fill_indices_from_block_tables(rotation_indices_tensor_names, sequence_groups, scheduler_output, rotation_indices_sizes_in_blocks);
     }
 
     void _collect_attention_scores(const std::vector<SequenceGroup::Ptr>& sequence_groups, const Scheduler::Output& scheduler_output) {
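
For readers of this patch, here is a minimal standalone sketch of the tail-filling behavior that the refactored _fill_indices_from_block_tables helper applies per decoder layer: when fill_n_last_vec is empty, the whole per-layer block table is copied (the block_indices case); otherwise only the last fill_n_last physical block indices are written (the rotated_block_indices case, which must reference only the blocks selected for rotation). The function and variable names below are illustrative, not part of the pipeline's API, and the block table is flattened to plain int32_t indices for brevity.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Copy the last `fill_n_last` entries of a per-layer block table into a
// destination buffer, mirroring the starting_offset arithmetic in the patch.
std::vector<int32_t> fill_n_last_indices(const std::vector<int32_t>& block_table,
                                         size_t fill_n_last) {
    size_t num_blocks = block_table.size();
    assert(num_blocks >= fill_n_last);  // same invariant as the OPENVINO_ASSERT in the patch
    size_t starting_offset = num_blocks - fill_n_last;
    std::vector<int32_t> dst(fill_n_last);
    for (size_t block_id = 0; block_id < fill_n_last; ++block_id)
        dst[block_id] = block_table[starting_offset + block_id];
    return dst;
}

int main() {
    std::vector<int32_t> table = {7, 3, 9, 12};  // hypothetical physical block indices
    auto all = fill_n_last_indices(table, table.size());  // {7, 3, 9, 12} (block_indices case)
    auto tail = fill_n_last_indices(table, 2);             // {9, 12} (rotated_block_indices case)
    return (all.size() == 4 && tail[0] == 9) ? 0 : 1;
}

In the patch itself, the per-layer fill_n_last value for the rotated_block_indices tensors is derived from the rotation-coefficient tensor size, i.e. m_cache_rotation_coefficients[i].get_size() / m_scheduler_config.block_size, so each layer fills exactly as many trailing blocks as it has rotation coefficients for.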