
Add an option to drop the request #732

Merged: 9 commits, merged Aug 6, 2024
6 changes: 5 additions & 1 deletion src/cpp/include/openvino/genai/generation_handle.hpp
@@ -58,6 +58,8 @@ class GenerationStream;
class OPENVINO_GENAI_EXPORTS GenerationHandleImpl {
std::shared_ptr<GenerationStream> m_generation_stream;
ov::genai::GenerationConfig m_sampling_params;

bool is_dropped();

public:
GenerationHandleImpl(std::shared_ptr<GenerationStream> generation_stream, const ov::genai::GenerationConfig& sampling_params) :
@@ -74,12 +76,14 @@ class OPENVINO_GENAI_EXPORTS GenerationHandleImpl {

bool can_read();

void drop();

GenerationOutputs back();
// Reads result of a generation for single iteration
GenerationOutputs read();
// Reads all generated tokens for all sequences
std::vector<GenerationOutput> read_all();
};

using GenerationHandle = std::unique_ptr<GenerationHandleImpl>;
using GenerationHandle = std::shared_ptr<GenerationHandleImpl>;
Collaborator:
With this change we can have multiple handles pointing to a single stream, and we give an explicit option to drop the generation via a method call. This means that one handle can call drop and invalidate the handle not only for itself but potentially for other handles as well.

I think that if we go this way, we should block any calls on a handle that has been dropped (for example, by throwing errors).

Collaborator Author @dkalinowski (Aug 5, 2024):
I think it is safe to say that the current approach with unique_ptr did not restrict GenAI API users from misusing the handle: one could simply take a reference to the handle and do anything with it.

Changing to shared_ptr and exposing an explicit drop() method gives the flexibility that OVMS needed: dropping the request in the HTTP client-disconnection callback (see the OVMS pull request). Now multiple threads can use the handle (the HTTP thread and the mediapipe thread), and the generation is dropped once all shared references to the handle are gone.

I also agree that we could verify that nobody calls the read/read_all methods after the handle is dropped. Will add that.

}
20 changes: 19 additions & 1 deletion src/cpp/src/continuous_batching_pipeline.cpp
@@ -60,6 +60,15 @@ class ContinuousBatchingPipeline::Impl {
ChatHistory m_history;


void _notify_requests_dropped_by_handle() {
// Notify one last time by pushing an empty output.
// This causes read() to unblock, since adding anything to the queue wakes the reader.
Contributor @ilya-lavrenov (Aug 6, 2024):
Even with this comment I don't fully understand why we need to send empty outputs. If the handle is dropped by the user, the user should not expect any empty outputs from this request/handle.

Collaborator:
It's additional protection in multithreading scenarios. When the generation handle is dropped from thread #1 while thread #2 is blocked on read(), this push unblocks it. That's the case in the model server, where the handle drop is called from a callback triggered by the HTTP server on client disconnect.

for (SequenceGroup::Ptr& request : m_requests) {
if (request->handle_dropped())
request->push_empty_outputs();
}
}

void _free_non_running_requests() {
std::vector<SequenceGroup::Ptr>::iterator requests_iterator = m_requests.begin();
while (requests_iterator != m_requests.end()) {
@@ -136,7 +145,7 @@ class ContinuousBatchingPipeline::Impl {
std::lock_guard<std::mutex> lock{m_awaiting_requests_mutex};
m_awaiting_requests.push_back(sequence_group);
}
return std::make_unique<GenerationHandleImpl>(sequence_group->get_generation_stream(), sampling_params);
return std::make_shared<GenerationHandleImpl>(sequence_group->get_generation_stream(), sampling_params);
}

GenerationHandle add_request(uint64_t request_id, const std::string& prompt, ov::genai::GenerationConfig sampling_params) {
@@ -227,6 +236,15 @@ class ContinuousBatchingPipeline::Impl {
timer.end();
}

// notify requests dropped by handle

{
static ManualTimer timer("notify requests dropped by handle");
timer.start();
_notify_requests_dropped_by_handle();
timer.end();
}

// free non running requests for current step

{
15 changes: 13 additions & 2 deletions src/cpp/src/generation_handle.cpp
@@ -9,22 +9,32 @@
using namespace ov::genai;

GenerationHandleImpl::~GenerationHandleImpl() {
m_generation_stream->drop();
drop();
}

GenerationStatus GenerationHandleImpl::get_status() {
return m_generation_stream->get_status();
}

bool GenerationHandleImpl::can_read() {
return m_generation_stream->can_read();
return !is_dropped() && m_generation_stream->can_read();
}

bool GenerationHandleImpl::is_dropped() {
return get_status() == GenerationStatus::DROPPED_BY_HANDLE;
}

void GenerationHandleImpl::drop() {
m_generation_stream->drop();
}

std::unordered_map<uint64_t, GenerationOutput> GenerationHandleImpl::back() {
Collaborator:

I think we should block all methods, not just read.

Collaborator Author:
done

OPENVINO_ASSERT(!is_dropped(), "GenerationHandle cannot be used after it is dropped.");
return m_generation_stream->back();
}

std::unordered_map<uint64_t, GenerationOutput> GenerationHandleImpl::read() {
OPENVINO_ASSERT(!is_dropped(), "GenerationHandle cannot be used after it is dropped.");
return m_generation_stream->read();
}

@@ -41,6 +51,7 @@ void add_partial_result(std::unordered_map<uint64_t, GenerationOutput>& partial_
}

std::vector<GenerationOutput> GenerationHandleImpl::read_all() {
OPENVINO_ASSERT(!is_dropped(), "GenerationHandle cannot be used after it is dropped.");
std::vector<GenerationOutput> results;
std::unordered_map<uint64_t, GenerationOutput> partial_results;
// We iterate until generation is running or there are tokens we haven't read yet
4 changes: 4 additions & 0 deletions src/cpp/src/sequence_group.hpp
@@ -446,6 +446,10 @@ class SequenceGroup {
return m_generation_stream->get_status() == GenerationStatus::DROPPED_BY_HANDLE;
}

void push_empty_outputs() {
m_generation_stream->push({});
}

void push_outputs() {
GenerationOutputs outputs;
for (auto& sequence: m_sequences) {