Skip to content

Commit

Permalink
Switch to two-input scheme
Browse files Browse the repository at this point in the history
  • Loading branch information
vshampor committed Oct 15, 2024
1 parent 5dce2ad commit 67aa2a0
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 22 deletions.
2 changes: 1 addition & 1 deletion src/cpp/src/continuous_batching_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -405,9 +405,9 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::maybe_evict_cache_block
cache_eviction_algo.register_new_token_scores(attention_scores_for_all_decoder_layers);
auto logical_blocks_to_evict = cache_eviction_algo.evict_logical_blocks();

size_t num_blocks_before_eviction = m_scheduler->get_block_tables(seq_id)[0].size();

for (size_t i = 0; i < logical_blocks_to_evict.size(); i++) {
size_t num_blocks_before_eviction = m_scheduler->get_block_tables(seq_id)[i].size();
auto rotation_multipliers =
m_cache_rotation_calculator->get_rotation_multipliers(logical_blocks_to_evict[i],
num_blocks_before_eviction);
Expand Down
67 changes: 46 additions & 21 deletions src/cpp/src/model_runner.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ class ModelRunner {
m_request.set_tensor("past_lens", past_lens);
m_request.set_tensor("subsequence_begins", subsequence_begins);

_set_block_indices(m_request, sequence_groups, scheduler_output, total_num_blocks);
_set_block_indices(sequence_groups, scheduler_output, total_num_blocks);

if (!m_cache_rotation_coefficients.empty()) {
_set_cache_rotation_coefficients();
Expand Down Expand Up @@ -206,22 +206,12 @@ class ModelRunner {
}

private:
void _set_block_indices(ov::InferRequest& infer_request, const std::vector<SequenceGroup::Ptr> & sequence_groups, const Scheduler::Output& scheduler_output,
size_t total_num_blocks) {
void _fill_indices_from_block_tables(const std::vector<std::string>& dst_tensor_names,
const std::vector<SequenceGroup::Ptr> & sequence_groups,
const Scheduler::Output& scheduler_output,
const std::vector<size_t>& fill_n_last_vec) {
OPENVINO_ASSERT(fill_n_last_vec.size() == dst_tensor_names.size() || fill_n_last_vec.empty());
size_t num_sequence_groups = scheduler_output.m_scheduled_sequence_groups_ids.size();
std::vector<std::string> tensor_names = {"block_indices"};

if (m_scheduler_config.use_cache_eviction) {
tensor_names.resize(m_num_decoder_layers);
for (size_t i = 0; i < tensor_names.size(); i++) {
tensor_names[i] = std::string("block_indices.") + std::to_string(i);
}
}

for (auto& name : tensor_names) {
m_request.get_tensor(name).set_shape({total_num_blocks});
}

size_t block_offset = 0;
for (size_t i = 0; i < num_sequence_groups; ++i) {
size_t seq_group_id = scheduler_output.m_scheduled_sequence_groups_ids[i];
Expand All @@ -235,25 +225,60 @@ class ModelRunner {
size_t num_blocks = (sequence_group->get_context_len() - sequence_group->get_num_evicted_tokens() + m_scheduler_config.block_size - 1) / m_scheduler_config.block_size;
const auto & kv_blocks = scheduler_output.m_block_tables.at(sequence->get_id());

for (size_t layer_idx = 0; layer_idx < tensor_names.size(); layer_idx++) {
auto input_tensor = infer_request.get_tensor(tensor_names[layer_idx]);
for (size_t layer_idx = 0; layer_idx < dst_tensor_names.size(); layer_idx++) {
size_t fill_n_last = num_blocks;
if (!fill_n_last_vec.empty()) {
fill_n_last = fill_n_last_vec[layer_idx];
}
OPENVINO_ASSERT(num_blocks >= fill_n_last);
size_t starting_offset = num_blocks - fill_n_last;
auto input_tensor = m_request.get_tensor(dst_tensor_names[layer_idx]);
auto block_indices_data = input_tensor.data<int32_t>() + block_offset;
for (size_t block_id = 0; block_id < num_blocks; ++block_id)
for (size_t block_id = 0; block_id < fill_n_last; ++block_id)
// In case no cache eviction is requested, all per-layer block tables are expected to be identical
// at all times
block_indices_data[block_id] = kv_blocks[layer_idx][block_id]->get_index();
block_indices_data[block_id] = kv_blocks[layer_idx][starting_offset + block_id]->get_index();
}

block_offset += num_blocks;
}
}
}
// Resizes and fills the "block_indices" model input(s) from the scheduler's
// per-sequence block tables.
//
// With cache eviction enabled the model exposes one input per decoder layer
// ("block_indices.<layer>"), since per-layer block tables may diverge;
// otherwise a single shared "block_indices" input is used.
void _set_block_indices(const std::vector<SequenceGroup::Ptr> & sequence_groups, const Scheduler::Output& scheduler_output,
                        size_t total_num_blocks) {
    std::vector<std::string> dst_names;
    if (m_scheduler_config.use_cache_eviction) {
        dst_names.reserve(m_num_decoder_layers);
        for (size_t layer = 0; layer < m_num_decoder_layers; ++layer) {
            dst_names.push_back(std::string("block_indices.") + std::to_string(layer));
        }
    } else {
        dst_names.emplace_back("block_indices");
    }

    // Each destination tensor holds one int32 index per scheduled block.
    for (const auto& name : dst_names) {
        m_request.get_tensor(name).set_shape({total_num_blocks});
    }

    // Empty fill_n_last_vec => copy each sequence's full block table.
    _fill_indices_from_block_tables(dst_names, sequence_groups, scheduler_output, {});
}

// Binds the precomputed per-layer cache rotation coefficient tensors to the
// model and fills the matching "rotated_block_indices.<layer>" inputs from the
// tail of each sequence's block table.
//
// The number of rotated blocks per layer is derived from the coefficient
// tensor size (presumably block_size coefficients per rotated block — the
// precomputation in m_cache_rotation_coefficients is not visible here).
void _set_cache_rotation_coefficients(const std::vector<SequenceGroup::Ptr> & sequence_groups, const Scheduler::Output& scheduler_output) {
    std::vector<std::string> rotated_indices_names;
    std::vector<size_t> rotated_sizes_in_blocks;
    rotated_indices_names.reserve(m_num_decoder_layers);
    rotated_sizes_in_blocks.reserve(m_num_decoder_layers);

    for (size_t layer = 0; layer < m_num_decoder_layers; ++layer) {
        m_request.set_tensor(std::string("cache_rotation_coefficients.") + std::to_string(layer),
                             m_cache_rotation_coefficients[layer]);

        std::string indices_name = std::string("rotated_block_indices.") + std::to_string(layer);
        size_t num_rotated_blocks = m_cache_rotation_coefficients[layer].get_size() / m_scheduler_config.block_size;
        m_request.get_tensor(indices_name).set_shape({num_rotated_blocks});

        rotated_indices_names.push_back(std::move(indices_name));
        rotated_sizes_in_blocks.push_back(num_rotated_blocks);
    }

    // Only the last `rotated_sizes_in_blocks[layer]` blocks of each table are
    // written into the corresponding rotated-indices tensor.
    _fill_indices_from_block_tables(rotated_indices_names, sequence_groups, scheduler_output, rotated_sizes_in_blocks);
}

void _collect_attention_scores(const std::vector<SequenceGroup::Ptr> & sequence_groups, const Scheduler::Output& scheduler_output) {
Expand Down

0 comments on commit 67aa2a0

Please sign in to comment.