Skip to content

Commit

Permalink
Implement retrying for transient DE errors
Browse files Browse the repository at this point in the history
On occasion, we will experience transient errors from Discovery Engine
that will resolve themselves if we wait briefly and try again. In
particular, this can happen when a number of updates come through in
quick succession for the same document that are handled by different
workers.

- Add ability for put/delete operations to try for up to three attempts
  when an error occurs before giving up
  • Loading branch information
csutter committed Aug 7, 2024
1 parent 3438397 commit 73a7967
Show file tree
Hide file tree
Showing 3 changed files with 118 additions and 8 deletions.
22 changes: 20 additions & 2 deletions app/services/discovery_engine/sync/operation.rb
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
module DiscoveryEngine::Sync
class Operation
MAX_RETRIES_ON_ERROR = 3
WAIT_ON_ERROR = 3.seconds

def initialize(type, content_id, payload_version: nil, client: nil)
@type = type
@content_id = content_id
@payload_version = payload_version
@client = client || ::Google::Cloud::DiscoveryEngine.document_service(version: :v1)
@attempt = 1
end

private

attr_reader :type, :content_id, :payload_version, :client
attr_reader :type, :content_id, :payload_version, :client, :attempt

def sync
lock.acquire
Expand All @@ -26,8 +30,22 @@ def sync
log(Logger::Severity::INFO, "Ignored as newer version already synced")
end
rescue Google::Cloud::Error => e
if attempt < MAX_RETRIES_ON_ERROR
increment_counter("retry")
log(
Logger::Severity::WARN,
"Failed attempt #{attempt} to #{type} document (#{e.message}), retrying",
)
@attempt += 1
Kernel.sleep(WAIT_ON_ERROR)
retry
end

increment_counter("error")
log(Logger::Severity::ERROR, "Failed to #{type} document due to an error (#{e.message})")
log(
Logger::Severity::ERROR,
"Failed on attempt #{attempt} to #{type} document (#{e.message}), giving up",
)
GovukError.notify(e)
ensure
lock.release
Expand Down
52 changes: 49 additions & 3 deletions spec/services/discovery_engine/sync/delete_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
let(:sync_required) { true }

before do
allow(Kernel).to receive(:sleep).and_return(nil)
allow(Rails).to receive(:logger).and_return(logger)
allow(Rails.configuration).to receive(:discovery_engine_datastore_branch).and_return("branch")
allow(GovukError).to receive(:notify)
Expand Down Expand Up @@ -100,7 +101,7 @@
end
end

context "when the delete fails for another reason" do
context "when failing consistently for the maximum number of attempts" do
let(:err) { Google::Cloud::Error.new("Something went wrong") }

before do
Expand All @@ -109,15 +110,60 @@
described_class.new("some_content_id", payload_version: "1", client:).call
end

it "logs the failure" do
it "logs the failed attempts" do
expect(logger).to have_received(:add).with(
Logger::Severity::WARN,
"[DiscoveryEngine::Sync::Delete] Failed attempt 1 to delete document (Something went wrong), retrying content_id:some_content_id payload_version:1",
)
expect(logger).to have_received(:add).with(
Logger::Severity::WARN,
"[DiscoveryEngine::Sync::Delete] Failed attempt 2 to delete document (Something went wrong), retrying content_id:some_content_id payload_version:1",
)
expect(logger).to have_received(:add).with(
Logger::Severity::ERROR,
"[DiscoveryEngine::Sync::Delete] Failed to delete document due to an error (Something went wrong) content_id:some_content_id payload_version:1",
"[DiscoveryEngine::Sync::Delete] Failed on attempt 3 to delete document (Something went wrong), giving up content_id:some_content_id payload_version:1",
)
end

it "send the error to Sentry" do
expect(GovukError).to have_received(:notify).with(err)
end
end

context "when failing transiently but succeeding within the maximum attempts" do
let(:err) { Google::Cloud::Error.new("Something went wrong") }

before do
allow(client).to receive(:delete_document).and_invoke(
->(_) { raise err },
->(_) { raise err },
->(_) { double(name: "document-name") },
)

described_class.new("some_content_id", payload_version: "1", client:).call
end

it "tries three times" do
expect(client).to have_received(:delete_document).exactly(3).times
end

it "logs the failed and successful attempts" do
expect(logger).to have_received(:add).with(
Logger::Severity::WARN,
"[DiscoveryEngine::Sync::Delete] Failed attempt 1 to delete document (Something went wrong), retrying content_id:some_content_id payload_version:1",
).ordered
expect(logger).to have_received(:add).with(
Logger::Severity::WARN,
"[DiscoveryEngine::Sync::Delete] Failed attempt 2 to delete document (Something went wrong), retrying content_id:some_content_id payload_version:1",
).ordered
expect(logger).to have_received(:add).with(
Logger::Severity::INFO,
"[DiscoveryEngine::Sync::Delete] Successful delete content_id:some_content_id payload_version:1",
).ordered
end

it "does not send an error to Sentry" do
expect(GovukError).not_to have_received(:notify)
end
end
end
52 changes: 49 additions & 3 deletions spec/services/discovery_engine/sync/put_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
let(:sync_required) { true }

before do
allow(Kernel).to receive(:sleep).and_return(nil)
allow(Rails).to receive(:logger).and_return(logger)
allow(Rails.configuration).to receive(:discovery_engine_datastore_branch).and_return("branch")
allow(GovukError).to receive(:notify)
Expand Down Expand Up @@ -111,7 +112,7 @@
end
end

context "when updating the document fails" do
context "when failing consistently for the maximum number of attempts" do
let(:err) { Google::Cloud::Error.new("Something went wrong") }

before do
Expand All @@ -120,15 +121,60 @@
described_class.new("some_content_id", {}, payload_version: "1", client:).call
end

it "logs the failure" do
it "logs the failed attempts" do
expect(logger).to have_received(:add).with(
Logger::Severity::WARN,
"[DiscoveryEngine::Sync::Put] Failed attempt 1 to put document (Something went wrong), retrying content_id:some_content_id payload_version:1",
)
expect(logger).to have_received(:add).with(
Logger::Severity::WARN,
"[DiscoveryEngine::Sync::Put] Failed attempt 2 to put document (Something went wrong), retrying content_id:some_content_id payload_version:1",
)
expect(logger).to have_received(:add).with(
Logger::Severity::ERROR,
"[DiscoveryEngine::Sync::Put] Failed to put document due to an error (Something went wrong) content_id:some_content_id payload_version:1",
"[DiscoveryEngine::Sync::Put] Failed on attempt 3 to put document (Something went wrong), giving up content_id:some_content_id payload_version:1",
)
end

it "sends the error to Sentry" do
expect(GovukError).to have_received(:notify).with(err)
end
end

context "when failing transiently but succeeding within the maximum attempts" do
let(:err) { Google::Cloud::Error.new("Something went wrong") }

before do
allow(client).to receive(:update_document).and_invoke(
->(_) { raise err },
->(_) { raise err },
->(_) { double(name: "document-name") },
)

described_class.new("some_content_id", {}, payload_version: "1", client:).call
end

it "tries three times" do
expect(client).to have_received(:update_document).exactly(3).times
end

it "logs the failed and successful attempts" do
expect(logger).to have_received(:add).with(
Logger::Severity::WARN,
"[DiscoveryEngine::Sync::Put] Failed attempt 1 to put document (Something went wrong), retrying content_id:some_content_id payload_version:1",
).ordered
expect(logger).to have_received(:add).with(
Logger::Severity::WARN,
"[DiscoveryEngine::Sync::Put] Failed attempt 2 to put document (Something went wrong), retrying content_id:some_content_id payload_version:1",
).ordered
expect(logger).to have_received(:add).with(
Logger::Severity::INFO,
"[DiscoveryEngine::Sync::Put] Successful put content_id:some_content_id payload_version:1",
).ordered
end

it "does not send an error to Sentry" do
expect(GovukError).not_to have_received(:notify)
end
end
end

0 comments on commit 73a7967

Please sign in to comment.