From 18bb51065fe859090075f963fcc71808263db92f Mon Sep 17 00:00:00 2001 From: Hector Correa Date: Wed, 20 Nov 2024 16:57:43 -0500 Subject: [PATCH] Significantly reduce load time on the Show page for large datasets (#1989) * WIP * Got the show page to load faster by not calculating the total size in line but rather pushing the calculation to happen in the AJAX call for the file list. * Improve the performance of the jobs to detect snapshot modifications and deletes by using a binary search rather than a plain include * Updated the upload snapshot tests * Adjust test to match new return value * Rubocop nitpicking --- app/controllers/works_controller.rb | 25 +++++- app/models/upload_snapshot.rb | 50 ++++++------ app/models/work.rb | 30 ++++++-- app/views/works/_s3_resources.html.erb | 26 ++++--- app/views/works/show.html.erb | 5 +- spec/controllers/works_controller_spec.rb | 4 +- spec/models/upload_snapshot_spec.rb | 93 +++++++++-------------- 7 files changed, 130 insertions(+), 103 deletions(-) diff --git a/app/controllers/works_controller.rb b/app/controllers/works_controller.rb index fc504ea9d..ad7bc1630 100644 --- a/app/controllers/works_controller.rb +++ b/app/controllers/works_controller.rb @@ -89,10 +89,10 @@ def file_list if params[:id] == "NONE" # This is a special case when we render the file list for a work being created # (i.e. it does not have an id just yet) - render json: [] + render json: file_list_ajax_response(nil) else - @work = Work.find(params[:id]) - render json: @work.file_list + work = Work.find(params[:id]) + render json: file_list_ajax_response(work) end end @@ -415,5 +415,24 @@ def migrated? params[:submit] == "Migrate" end + + # Returns a hash object that can be serialized into something that DataTables + # can consume. The `data` element includes the work's file list; all other + # properties are used for displaying different data elements related but not + # directly on the DataTable object (e.g. 
the total file size) + def file_list_ajax_response(work) + files = [] + total_size = 0 + unless work.nil? + files = work.file_list + total_size = work.total_file_size_from_list(files) + end + { + data: files, + total_size:, + total_size_display: ActiveSupport::NumberHelper.number_to_human_size(total_size), + total_file_count: files.count + } + end end # rubocop:enable Metrics/ClassLength diff --git a/app/models/upload_snapshot.rb b/app/models/upload_snapshot.rb index a660712f6..8db4d8506 100644 --- a/app/models/upload_snapshot.rb +++ b/app/models/upload_snapshot.rb @@ -6,9 +6,15 @@ class UploadSnapshot < ApplicationRecord alias_attribute :existing_files, :files def snapshot_deletions(work_changes, s3_filenames) + s3_filenames_sorted = s3_filenames.sort existing_files.each do |file| filename = file["filename"] - unless s3_filenames.include?(filename) + # Use Ruby's Binary Search functionality instead of a plain Ruby Array `.include?` + # to detect missing values in the array because the binary search performs + # much faster when the list of files is large. Notice that the binary search + # requires that the list of files is sorted. + # See https://ruby-doc.org/3.3.6/bsearch_rdoc.html + if s3_filenames_sorted.bsearch { |s3_filename| filename <=> s3_filename }.nil? work_changes << { action: "removed", filename:, checksum: file["checksum"] } end end @@ -17,12 +23,12 @@ def snapshot_deletions(work_changes, s3_filenames) def snapshot_modifications(work_changes, s3_files) # check for modifications s3_files.each do |s3_file| - next if match?(s3_file) - work_changes << if include?(s3_file) - { action: "replaced", filename: s3_file.filename, checksum: s3_file.checksum } - else - { action: "added", filename: s3_file.filename, checksum: s3_file.checksum } - end + match = existing_files_sorted.bsearch { |file| s3_file.filename <=> file["filename"] } + if match.nil? 
+ work_changes << { action: "added", filename: s3_file.filename, checksum: s3_file.checksum } + elsif UploadSnapshot.checksum_compare(match["checksum"], s3_file.checksum) == false + work_changes << { action: "replaced", filename: s3_file.filename, checksum: s3_file.checksum } + end end end @@ -38,18 +44,6 @@ def filenames files.map { |file| file["filename"] } end - def include?(s3_file) - filenames.include?(s3_file.filename) - end - - def index(s3_file) - files.index { |file| file["filename"] == s3_file.filename && checksum_compare(file["checksum"], s3_file.checksum) } - end - - def match?(s3_file) - index(s3_file).present? - end - def store_files(s3_files) self.files = s3_files.map { |file| { "filename" => file.filename, "checksum" => file.checksum } } end @@ -58,12 +52,7 @@ def self.find_by_filename(work_id:, filename:) find_by("work_id = ? AND files @> ?", work_id, JSON.dump([{ filename: }])) end - private - - def uploads - work.uploads - end - + class << self # Compares two checksums. Accounts for the case in which one of them is # a plain MD5 value and the other has been encoded with base64. # See also @@ -85,4 +74,15 @@ def checksum_compare(checksum1, checksum2) # One of the values was not properly encoded false end + end + + private + + def existing_files_sorted + @existing_files_sorted ||= files.sort_by { |file| file["filename"] } + end + + def uploads + work.uploads + end end diff --git a/app/models/work.rb b/app/models/work.rb index 281a0f4da..7484456b8 100644 --- a/app/models/work.rb +++ b/app/models/work.rb @@ -334,7 +334,9 @@ def artifact_uploads # Returns the list of files for the work with some basic information about each of them. # This method is much faster than `uploads` because it does not return the actual S3File # objects to the client, instead it returns just a few selected data elements. + # rubocop:disable Metrics/MethodLength def file_list + start = Time.zone.now s3_files = approved? ? 
post_curation_uploads : pre_curation_uploads files_info = s3_files.map do |s3_file| { @@ -349,17 +351,24 @@ def file_list "is_folder": s3_file.is_folder } end + log_performance(start, "file_list called for #{id}") files_info end + # rubocop:enable Metrics/MethodLength def total_file_size - @total_file_size ||= begin - total_size = 0 - file_list.each do |file| - total_size += file[:size] - end - total_size + total_size = 0 + file_list.each do |file| + total_size += file[:size] end + total_size + end + + # Calculates the total file size from a given list of files + # This is so that we don't fetch the list twice from AWS since it can be expensive when + # there are thousands of files on the work. + def total_file_size_from_list(files) + files.sum { |file| file[:size] } end # Fetches the data from S3 directly bypassing ActiveStorage @@ -629,5 +638,14 @@ def embargo_date_as_json embargo_date_iso8601.gsub(/\+.+$/, "Z") end end + + def log_performance(start, message) + elapsed = Time.zone.now - start + if elapsed > 20 + Rails.logger.warn("PERFORMANCE: #{message}. Elapsed: #{elapsed} seconds") + else + Rails.logger.info("PERFORMANCE: #{message}. Elapsed: #{elapsed} seconds") + end + end end # rubocop:enable Metrics/ClassLength diff --git a/app/views/works/_s3_resources.html.erb b/app/views/works/_s3_resources.html.erb index bcfff1e3e..cdc67732b 100644 --- a/app/views/works/_s3_resources.html.erb +++ b/app/views/works/_s3_resources.html.erb @@ -2,8 +2,9 @@
-

<%= "#{@work.upload_count} #{'File'.pluralize(@work.upload_count)} or #{'Directory'.pluralize(@work.upload_count)}"%> in - <%= @work.approved? ? "post-curation" : "pre-curation" %> storage +

+
+ Files or Directories in <%= @work.approved? ? "post-curation" : "pre-curation" %> storage

@@ -15,14 +16,6 @@ - <% if @work.upload_count.zero? %> - - - - - - - <% end %>
<%= t('works.form.curation_uploads.empty') %>
@@ -75,7 +68,18 @@ select: true, ajax: { url: fileListUrl, - dataSrc: '' + dataSrc: function(json) { + // The JSON payload includes a few extra properties (fetched via AJAX because + // they be slow for large datasets). Here we pick up those properties and update + // the display with them. + $("#total-file-size").text(json.total_size_display); + $("#total-file-size-spinner").hide(); + $("#total-file-count-in-table").text(json.total_file_count.toLocaleString("en-US")); + $("#total-file-count-spinner").hide(); + // The JSON payload include the file list in the `data` property + // and that's what we return to DataTables as the "source" data. + return json.data; + } }, rowId: 'safe_id', columns: [ diff --git a/app/views/works/show.html.erb b/app/views/works/show.html.erb index cb908e6e4..c61ce0d5f 100644 --- a/app/views/works/show.html.erb +++ b/app/views/works/show.html.erb @@ -274,7 +274,7 @@
Location
- <% if @work.upload_count > 0 %> + <% if @work.files_location_upload? %> Amazon S3 Curation Storage <% elsif @work.files_location_cluster? %> PUL Research Cluster @@ -291,7 +291,8 @@ <% end %>
Total Size
- <%= number_to_human_size(@work.total_file_size)%> +
+
diff --git a/spec/controllers/works_controller_spec.rb b/spec/controllers/works_controller_spec.rb index b557c4d4f..668e7bc76 100644 --- a/spec/controllers/works_controller_spec.rb +++ b/spec/controllers/works_controller_spec.rb @@ -597,8 +597,10 @@ allow(fake_s3_service).to receive(:client_s3_files).and_return(data) get :file_list, params: { id: work.id } - file_list = JSON.parse(response.body) + json_response = JSON.parse(response.body) + file_list = json_response["data"] expect(file_list.map { |f| f["filename"] }.sort).to eq(["10.34770/123-abc/#{work.id}/SCoData_combined_v1_2020-07_README.txt", "10.34770/123-abc/#{work.id}/something.jpg"]) + expect(json_response["total_size"]).to be 21_518 # Check that we don't accidentally serialize the Work as part of the JSON file list. # Including the work is bad for large records, because if for example a Work has 500 files diff --git a/spec/models/upload_snapshot_spec.rb b/spec/models/upload_snapshot_spec.rb index 38eced164..aee8bc59d 100644 --- a/spec/models/upload_snapshot_spec.rb +++ b/spec/models/upload_snapshot_spec.rb @@ -2,7 +2,7 @@ require "rails_helper" RSpec.describe UploadSnapshot, type: :model do - subject(:upload_snapshot) { described_class.new(files: [{ filename:, checkSum: "aaabbb111222" }], url:, work:) } + subject(:upload_snapshot) { described_class.new(files: [{ filename:, checksum: "aaabbb111222" }], url:, work:) } let(:filename) { "us_covid_2019.csv" } let(:url) { "http://localhost.localdomain/us_covid_2019.csv" } @@ -10,7 +10,7 @@ describe "#files" do it "lists files associated with the snapshot" do - expect(upload_snapshot.files).to eq([{ "filename" => filename, "checkSum" => "aaabbb111222" }]) + expect(upload_snapshot.files).to eq([{ "filename" => filename, "checksum" => "aaabbb111222" }]) end end @@ -121,80 +121,63 @@ end end - describe "#include?" 
do - subject(:upload_snapshot) { described_class.new(files: [{ filename: "fileone", checksum: "aaabbb111222" }, { filename: "filetwo", checksum: "aaabbb111222" }], url:, work:) } - - let(:s3_file) { FactoryBot.build :s3_file, filename: "fileone" } - let(:other_file) { FactoryBot.build :s3_file, filename: "other" } - it "checks if a files is part of the snamshot via name" do - expect(upload_snapshot.include?(s3_file)).to be_truthy - expect(upload_snapshot.include?(other_file)).to be_falsey - end - end - - describe "#index" do - subject(:upload_snapshot) { described_class.new(files: [{ filename: "fileone", checksum: "aaabbb111222" }, { filename: "filetwo", checksum: "aaabbb111222" }], url:, work:) } - - let(:s3_file) { FactoryBot.build :s3_file, filename: "filetwo", checksum: "aaabbb111222" } - let(:other_file) { FactoryBot.build :s3_file, filename: "other" } - subject(:upload_snapshot) { described_class.new(files: [{ filename: "fileone", checksum: "aaabbb111222" }, { filename: "filetwo", checksum: "aaabbb111222" }], url:, work:) } - - it "lists filenames associated with the snapshot" do - expect(upload_snapshot.index(s3_file)).to eq(1) - expect(upload_snapshot.index(other_file)).to be_nil + describe "#snapshot_deletions" do + it "detects deletions" do + work_changes = [] + upload_snapshot.snapshot_deletions(work_changes, ["us_covid_other.csv"]) + expect(work_changes.first[:action]).to eq "removed" + expect(work_changes.first[:filename]).to eq "us_covid_2019.csv" end - it "checks both the filename and the checksum" do - s3_file.checksum = "otherchecksum" - expect(upload_snapshot.index(s3_file)).to be_nil + it "does not detect false positives" do + work_changes = [] + upload_snapshot.snapshot_deletions(work_changes, ["us_covid_2019.csv"]) + expect(work_changes.count).to be 0 end end - describe "#match?" 
do - subject(:upload_snapshot) { described_class.new(files: [{ filename: "fileone", checksum: "aaabbb111222" }, { filename: "filetwo", checksum: "aaabbb111222" }], url:, work:) } - - let(:s3_file) { FactoryBot.build :s3_file, filename: "filetwo", checksum: "aaabbb111222" } - let(:other_file) { FactoryBot.build :s3_file, filename: "other" } - subject(:upload_snapshot) { described_class.new(files: [{ filename: "fileone", checksum: "aaabbb111222" }, { filename: "filetwo", checksum: "aaabbb111222" }], url:, work:) } - - it "lists filenames associated with the snapshot" do - expect(upload_snapshot.match?(s3_file)).to be_truthy - expect(upload_snapshot.match?(other_file)).to be_falsey + describe "#snapshot_modifications" do + let(:existing_file) { FactoryBot.build :s3_file, filename: "us_covid_2019.csv", checksum: "aaabbb111222" } + let(:s3_new_file) { FactoryBot.build :s3_file, filename: "one.txt", checksum: "111" } + let(:s3_modified_file) { FactoryBot.build :s3_file, filename: "us_covid_2019.csv", checksum: "zzzyyyy999888" } + it "detects additions and modifications" do + work_changes = [] + upload_snapshot.snapshot_modifications(work_changes, [existing_file, s3_new_file, s3_modified_file]) + expect(work_changes.find { |change| change[:action] == "added" && change[:filename] == "one.txt" }).to_not be nil + expect(work_changes.find { |change| change[:action] == "replaced" && change[:filename] == "us_covid_2019.csv" }).to_not be nil end - it "checks both the filename and the checksum" do - s3_file.checksum = "otherchecksum" - expect(upload_snapshot.match?(s3_file)).to be_falsey + it "does not detect false positives" do + work_changes = [] + upload_snapshot.snapshot_modifications(work_changes, [existing_file]) + expect(work_changes.count).to be 0 end end - describe "checksum compare" do - let(:file1_base64) { { filename: "fileone", checksum: "98691a716ece23a77735f37b5a421253" } } - let(:file2_md5) { { filename: "filetwo", checksum: "mGkacW7OI6d3NfN7WkISUw==" } } - 
subject(:upload_snapshot) { described_class.new(files: [file1_base64, file2_md5], url:, work:) } + describe "#compare_checksum" do + let(:checksum1) { "98691a716ece23a77735f37b5a421253" } + let(:checksum1_encoded) { "mGkacW7OI6d3NfN7WkISUw==" } + let(:checksum2) { "xx691a716ece23a77735f37b5a421253" } it "matches identical checksums" do - s3_file = FactoryBot.build :s3_file, filename: "fileone", checksum: "98691a716ece23a77735f37b5a421253" - expect(upload_snapshot.match?(s3_file)).to be true + expect(described_class.checksum_compare(checksum1, checksum1)).to be true end it "detects differences" do - s3_file = FactoryBot.build :s3_file, filename: "fileone", checksum: "xx691a716ece23a77735f37b5a421253" - expect(upload_snapshot.match?(s3_file)).to be false + expect(described_class.checksum_compare(checksum1, checksum2)).to be false end it "matches encoded checksums" do - s3_file = FactoryBot.build :s3_file, filename: "fileone", checksum: "mGkacW7OI6d3NfN7WkISUw==" - expect(upload_snapshot.match?(s3_file)).to be true - - s3_file = FactoryBot.build :s3_file, filename: "filetwo", checksum: "98691a716ece23a77735f37b5a421253" - expect(upload_snapshot.match?(s3_file)).to be true + expect(described_class.checksum_compare(checksum1, checksum1_encoded)).to be true end it "does not cause issues when the checksum is nil" do - s3_file = FactoryBot.build :s3_file, filename: "fileone" - s3_file.checksum = nil - expect(upload_snapshot.match?(s3_file)).to be false + expect(described_class.checksum_compare(nil, checksum1)).to be false + expect(described_class.checksum_compare(checksum1, nil)).to be false + end + + it "return false if the encoding is wrong" do + expect(described_class.checksum_compare(checksum1, checksum1_encoded + "xxx")).to be false end end