From f5d6713a4b91eae7814b18339064359e2bb8b87e Mon Sep 17 00:00:00 2001 From: Keith Lawrence Date: Tue, 19 Sep 2023 14:46:03 +0100 Subject: [PATCH] Add sidekiq backoff service to handle OS Places outages - When OS Places has a slowdown or outage, we can avoid our own alerting problems by backing off calls (since they're not time-critical). - Add a service which is initialised on startup (relevant mainly to worker processes) which adjusts the scheduled interval of ProcessPostcodeWorker creation (currently once per second). We record OS Places API failures, and with each failure we double the interval, until we reach a max of 180s. When we record a successful call, we reduce the interval by 1s, so it will quickly back off if many errors occur, and slowly creep back to full speed when the errors are over. --- app/workers/process_postcode_worker.rb | 2 + config/initializers/sidekiq.rb | 8 ++ lib/sidekiq_scheduler_backoff_service.rb | 31 +++++ .../sidekiq_scheduler_backoff_service_spec.rb | 108 ++++++++++++++++++ 4 files changed, 149 insertions(+) create mode 100644 lib/sidekiq_scheduler_backoff_service.rb create mode 100644 spec/lib/sidekiq_scheduler_backoff_service_spec.rb diff --git a/app/workers/process_postcode_worker.rb b/app/workers/process_postcode_worker.rb index 4cef899..555bc34 100644 --- a/app/workers/process_postcode_worker.rb +++ b/app/workers/process_postcode_worker.rb @@ -4,7 +4,9 @@ class ProcessPostcodeWorker def perform(postcode) PostcodeManager.new.update_postcode(postcode) + Rails.application.config.sidekiq_scheduler_backoff_service.record_success rescue OsPlacesApi::ClientError => e GovukError.notify(e) + Rails.application.config.sidekiq_scheduler_backoff_service.record_failure end end diff --git a/config/initializers/sidekiq.rb b/config/initializers/sidekiq.rb index 91e339c..3a69e95 100644 --- a/config/initializers/sidekiq.rb +++ b/config/initializers/sidekiq.rb @@ -1,4 +1,5 @@ require "sidekiq-unique-jobs" +require
"sidekiq_scheduler_backoff_service" Sidekiq.configure_server do |config| config.client_middleware do |chain| @@ -17,3 +18,10 @@ chain.add SidekiqUniqueJobs::Middleware::Client end end + +# Set backoff service to slow down schedule to once per 180 seconds if lots of errors +Rails.application.config.sidekiq_scheduler_backoff_service = SidekiqSchedulerBackoffService.new( + name: "queue_oldest_postcodes_for_updating", + min_interval: 1, + max_interval: 180, +) diff --git a/lib/sidekiq_scheduler_backoff_service.rb b/lib/sidekiq_scheduler_backoff_service.rb new file mode 100644 index 0000000..731c0e5 --- /dev/null +++ b/lib/sidekiq_scheduler_backoff_service.rb @@ -0,0 +1,31 @@ +class SidekiqSchedulerBackoffService + def initialize(name:, min_interval:, max_interval:) + @name = name.to_s + @min_interval = min_interval + @max_interval = max_interval + end + + def record_success + initial_interval = current_interval + target_interval = [initial_interval - 1, @min_interval].max + restart_schedule(target_interval) if target_interval != initial_interval + end + + def record_failure + initial_interval = current_interval + target_interval = [initial_interval * 2, @max_interval].min + restart_schedule(target_interval) if target_interval != initial_interval + end + +private + + def current_interval + schedule = Sidekiq.get_schedule[@name] + Integer(schedule["every"].first.chop) + end + + def restart_schedule(target_interval) + schedule = Sidekiq.get_schedule[@name] + Sidekiq.set_schedule(@name, schedule.merge("every" => ["#{target_interval}s"])) + end +end diff --git a/spec/lib/sidekiq_scheduler_backoff_service_spec.rb b/spec/lib/sidekiq_scheduler_backoff_service_spec.rb new file mode 100644 index 0000000..41a04a1 --- /dev/null +++ b/spec/lib/sidekiq_scheduler_backoff_service_spec.rb @@ -0,0 +1,108 @@ +require "spec_helper" + +RSpec.describe SidekiqSchedulerBackoffService do + let(:min_interval) { 2 } + let(:max_interval) { 180 } + let(:name) { :queue_oldest_postcodes_for_updating 
} + subject { SidekiqSchedulerBackoffService.new(name:, min_interval:, max_interval:) } + + describe "#record_failure" do + context "when the scheduler is going faster than maximum speed" do + before do + set_scheduled_interval(min_interval - 1) + end + + it "sets the scheduler to maximum speed and reloads the schedule" do + subject.record_failure + expect(scheduled_interval).to eq(["#{min_interval}s"]) + end + end + + context "when the scheduler is going faster than minimum speed" do + before do + set_scheduled_interval(max_interval / 2) + end + + it "halves the scheduler speed and reloads the schedule" do + subject.record_failure + expect(scheduled_interval).to eq(["#{max_interval}s"]) + end + end + + context "when the scheduler is going at minimum speed" do + before do + set_scheduled_interval(max_interval) + end + + it "does nothing" do + subject.record_failure + expect(scheduled_interval).to eq(["#{max_interval}s"]) + end + end + + context "when the scheduler is going slower than minimum speed" do + before do + set_scheduled_interval(max_interval * 2) + end + + it "sets the scheduler to minimum speed and reloads the schedule" do + subject.record_failure + expect(scheduled_interval).to eq(["#{max_interval}s"]) + end + end + end + + describe "#record_success" do + context "when the scheduler is going faster than maximum speed" do + before do + set_scheduled_interval(min_interval - 1) + end + + it "sets the scheduler to maximum speed and reloads the schedule" do + subject.record_success + expect(scheduled_interval).to eq(["#{min_interval}s"]) + end + end + + context "when the scheduler is going at maximum speed" do + before do + set_scheduled_interval(min_interval) + end + + it "does nothing" do + subject.record_success + expect(scheduled_interval).to eq(["#{min_interval}s"]) + end + end + + context "when the scheduler is going slower than maximum speed" do + before do + set_scheduled_interval(min_interval * 4) + end + + it "decrements the scheduler interval
by 1 second and reloads the schedule" do + subject.record_success + expect(scheduled_interval).to eq(["#{(min_interval * 4) - 1}s"]) + end + end + + context "when the scheduler is going slower than minimum speed" do + before do + set_scheduled_interval(max_interval + 1) + end + + it "sets the scheduler to minimum speed and reloads the schedule" do + subject.record_success + expect(scheduled_interval).to eq(["#{max_interval}s"]) + end + end + end +end + +def set_scheduled_interval(interval) + Sidekiq.set_schedule(name.to_s, { "every" => ["#{interval}s"], "class" => "PostcodesCollectionWorker" }) +end + +def scheduled_interval + Sidekiq.get_schedule["queue_oldest_postcodes_for_updating"]["every"] +end