
Commit

Significantly reduce load time on the Show page for large datasets (#1989)

* WIP

* Got the show page to load faster by not calculating the total size inline, but rather pushing the calculation to happen in the AJAX call for the file list.

* Improve the performance of the jobs that detect snapshot modifications and deletes by using a binary search rather than a plain `include?`

* Updated the upload snapshot tests

* Adjust test to match new return value

* Rubocop nitpicking
hectorcorrea authored Nov 20, 2024
1 parent 547fdb8 commit 18bb510
Showing 7 changed files with 130 additions and 103 deletions.
25 changes: 22 additions & 3 deletions app/controllers/works_controller.rb
@@ -89,10 +89,10 @@ def file_list
if params[:id] == "NONE"
# This is a special case when we render the file list for a work being created
# (i.e. it does not have an id just yet)
render json: []
render json: file_list_ajax_response(nil)
else
@work = Work.find(params[:id])
render json: @work.file_list
work = Work.find(params[:id])
render json: file_list_ajax_response(work)
end
end

@@ -415,5 +415,24 @@ def migrated?

params[:submit] == "Migrate"
end

# Returns a hash object that can be serialized into something that DataTables
# can consume. The `data` element includes the work's file list; all other
# properties are used to display related data elements that are not directly
# part of the DataTable object (e.g. the total file size).
def file_list_ajax_response(work)
files = []
total_size = 0
unless work.nil?
files = work.file_list
total_size = work.total_file_size_from_list(files)
end
{
data: files,
total_size:,
total_size_display: ActiveSupport::NumberHelper.number_to_human_size(total_size),
total_file_count: files.count
}
end
end
# rubocop:enable Metrics/ClassLength
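
For orientation, here is a hypothetical sketch (in Ruby, values invented) of the hash that `file_list_ajax_response` returns before it is serialized to JSON; each per-file entry carries more attributes than shown:

# Illustrative only -- the filenames and sizes below are made up.
{
  data: [
    { "filename" => "10.34770/123-abc/1/README.txt", "size" => 1_024 },
    { "filename" => "10.34770/123-abc/1/data.csv",   "size" => 20_494 }
  ],
  total_size: 21_518,              # sum of the per-file sizes
  total_size_display: "21 KB",     # number_to_human_size(21_518)
  total_file_count: 2
}

DataTables consumes the `data` array, while the extra properties drive the file count and total size shown elsewhere on the page.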
50 changes: 25 additions & 25 deletions app/models/upload_snapshot.rb
@@ -6,9 +6,15 @@ class UploadSnapshot < ApplicationRecord
alias_attribute :existing_files, :files

def snapshot_deletions(work_changes, s3_filenames)
s3_filenames_sorted = s3_filenames.sort
existing_files.each do |file|
filename = file["filename"]
unless s3_filenames.include?(filename)
# Use Ruby's Binary Search functionality instead of a plain Ruby Array `.include?`
# to detect missing values in the array because the binary search performs
# much faster when the list of files is large. Notice that the binary search
# requires that the list of files is sorted.
# See https://ruby-doc.org/3.3.6/bsearch_rdoc.html
if s3_filenames_sorted.bsearch { |s3_filename| filename <=> s3_filename }.nil?
work_changes << { action: "removed", filename:, checksum: file["checksum"] }
end
end
@@ -17,12 +23,12 @@ def snapshot_deletions(work_changes, s3_filenames)
def snapshot_modifications(work_changes, s3_files)
# check for modifications
s3_files.each do |s3_file|
next if match?(s3_file)
work_changes << if include?(s3_file)
{ action: "replaced", filename: s3_file.filename, checksum: s3_file.checksum }
else
{ action: "added", filename: s3_file.filename, checksum: s3_file.checksum }
end
match = existing_files_sorted.bsearch { |file| s3_file.filename <=> file["filename"] }
if match.nil?
work_changes << { action: "added", filename: s3_file.filename, checksum: s3_file.checksum }
elsif UploadSnapshot.checksum_compare(match["checksum"], s3_file.checksum) == false
work_changes << { action: "replaced", filename: s3_file.filename, checksum: s3_file.checksum }
end
end
end

@@ -38,18 +44,6 @@ def filenames
files.map { |file| file["filename"] }
end

def include?(s3_file)
filenames.include?(s3_file.filename)
end

def index(s3_file)
files.index { |file| file["filename"] == s3_file.filename && checksum_compare(file["checksum"], s3_file.checksum) }
end

def match?(s3_file)
index(s3_file).present?
end

def store_files(s3_files)
self.files = s3_files.map { |file| { "filename" => file.filename, "checksum" => file.checksum } }
end
@@ -58,12 +52,7 @@ def self.find_by_filename(work_id:, filename:)
find_by("work_id = ? AND files @> ?", work_id, JSON.dump([{ filename: }]))
end

private

def uploads
work.uploads
end

class << self
# Compares two checksums. Accounts for the case in which one of them is
# a plain MD5 value and the other has been encoded with base64.
# See also
@@ -85,4 +74,15 @@ def checksum_compare(checksum1, checksum2)
# One of the values was not properly encoded
false
end
end

private

def existing_files_sorted
@existing_files_sorted ||= files.sort_by { |file| file["filename"] }
end

def uploads
work.uploads
end
end
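
A self-contained sketch of the lookup pattern used in `snapshot_deletions` (plain Ruby, assuming an already sorted array of filename strings): in find-any mode `Array#bsearch` expects the block to return zero on a match and a negative or positive value to steer the search, which is exactly what `<=>` yields with the target on the left.

# Minimal illustration, not code from this commit.
s3_filenames_sorted = ["a.csv", "m.csv", "z.csv"]  # bsearch requires sorted input

def present_in?(sorted_names, target)
  # Block returns 0 on a match, < 0 to search lower, > 0 to search higher.
  !sorted_names.bsearch { |name| target <=> name }.nil?
end

present_in?(s3_filenames_sorted, "m.csv")    # => true
present_in?(s3_filenames_sorted, "missing")  # => false

Each lookup is O(log n) against the sorted array, versus O(n) for `include?`, which is what makes the difference for works with thousands of files.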
30 changes: 24 additions & 6 deletions app/models/work.rb
@@ -334,7 +334,9 @@ def artifact_uploads
# Returns the list of files for the work with some basic information about each of them.
# This method is much faster than `uploads` because it does not return the actual S3File
# objects to the client, instead it returns just a few selected data elements.
# rubocop:disable Metrics/MethodLength
def file_list
start = Time.zone.now
s3_files = approved? ? post_curation_uploads : pre_curation_uploads
files_info = s3_files.map do |s3_file|
{
@@ -349,17 +351,24 @@ def file_list
"is_folder": s3_file.is_folder
}
end
log_performance(start, "file_list called for #{id}")
files_info
end
# rubocop:enable Metrics/MethodLength

def total_file_size
@total_file_size ||= begin
total_size = 0
file_list.each do |file|
total_size += file[:size]
end
total_size
total_size = 0
file_list.each do |file|
total_size += file[:size]
end
total_size
end

# Calculates the total file size from a given list of files.
# This is so that we don't fetch the list twice from AWS, since that can be expensive
# when there are thousands of files in the work.
def total_file_size_from_list(files)
files.sum { |file| file[:size] }
end

# Fetches the data from S3 directly bypassing ActiveStorage
@@ -629,5 +638,14 @@ def embargo_date_as_json
embargo_date_iso8601.gsub(/\+.+$/, "Z")
end
end

def log_performance(start, message)
elapsed = Time.zone.now - start
if elapsed > 20
Rails.logger.warn("PERFORMANCE: #{message}. Elapsed: #{elapsed} seconds")
else
Rails.logger.info("PERFORMANCE: #{message}. Elapsed: #{elapsed} seconds")
end
end
end
# rubocop:enable Metrics/ClassLength
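
A short sketch of the intended usage of the two new helpers together (assumed, mirroring `file_list_ajax_response` in the controller): the S3 listing is fetched once and reused for the size total, instead of calling `total_file_size`, which would trigger a second `file_list`.

# Assumed usage, not taken verbatim from the commit.
files = work.file_list                           # single S3 listing
total = work.total_file_size_from_list(files)    # in-memory sum, no extra S3 calls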
26 changes: 15 additions & 11 deletions app/views/works/_s3_resources.html.erb
@@ -2,8 +2,9 @@
<div class="lux">
<div class="card">
<div class="files card-body">
<h3><%= "#{@work.upload_count} #{'File'.pluralize(@work.upload_count)} or #{'Directory'.pluralize(@work.upload_count)}"%> in
<%= @work.approved? ? "post-curation" : "pre-curation" %> storage
<h3>
<div id="total-file-count-spinner" class="spinner-border spinner-border-sm" role="status"></div>
<span id="total-file-count-in-table"></span> Files or Directories in <%= @work.approved? ? "post-curation" : "pre-curation" %> storage
</h3>
<table id="files-table" class="table">
<thead>
@@ -15,14 +16,6 @@
</tr>
</thead>
<tbody>
<% if @work.upload_count.zero? %>
<tr class="files">
<td><span><%= t('works.form.curation_uploads.empty') %></span></td>
<td><span></span></td>
<td><span></span></td>
<td><span></span></td>
</tr>
<% end %>
</tbody>
<tfoot></tfoot>
</table>
@@ -75,7 +68,18 @@
select: true,
ajax: {
url: fileListUrl,
dataSrc: ''
dataSrc: function(json) {
// The JSON payload includes a few extra properties (fetched via AJAX because
// they can be slow to calculate for large datasets). Here we pick up those properties
// and update the display with them.
$("#total-file-size").text(json.total_size_display);
$("#total-file-size-spinner").hide();
$("#total-file-count-in-table").text(json.total_file_count.toLocaleString("en-US"));
$("#total-file-count-spinner").hide();
// The JSON payload includes the file list in the `data` property
// and that's what we return to DataTables as the "source" data.
return json.data;
}
},
rowId: 'safe_id',
columns: [
5 changes: 3 additions & 2 deletions app/views/works/show.html.erb
@@ -274,7 +274,7 @@

<dt>Location</dt>
<dd>
<% if @work.upload_count > 0 %>
<% if @work.files_location_upload? %>
Amazon S3 Curation Storage
<% elsif @work.files_location_cluster? %>
PUL Research Cluster
@@ -291,7 +291,8 @@
<% end %>
<dt>Total Size</dt>
<dd>
<%= number_to_human_size(@work.total_file_size)%>
<div id="total-file-size-spinner" class="spinner-border spinner-border-sm" role="status"></div>
<span id="total-file-size"></span>
</dd>
</dl>

4 changes: 3 additions & 1 deletion spec/controllers/works_controller_spec.rb
@@ -597,8 +597,10 @@
allow(fake_s3_service).to receive(:client_s3_files).and_return(data)

get :file_list, params: { id: work.id }
file_list = JSON.parse(response.body)
json_response = JSON.parse(response.body)
file_list = json_response["data"]
expect(file_list.map { |f| f["filename"] }.sort).to eq(["10.34770/123-abc/#{work.id}/SCoData_combined_v1_2020-07_README.txt", "10.34770/123-abc/#{work.id}/something.jpg"])
expect(json_response["total_size"]).to be 21_518

# Check that we don't accidentally serialize the Work as part of the JSON file list.
# Including the work is bad for large records, because if for example a Work has 500 files
93 changes: 38 additions & 55 deletions spec/models/upload_snapshot_spec.rb
@@ -2,15 +2,15 @@
require "rails_helper"

RSpec.describe UploadSnapshot, type: :model do
subject(:upload_snapshot) { described_class.new(files: [{ filename:, checkSum: "aaabbb111222" }], url:, work:) }
subject(:upload_snapshot) { described_class.new(files: [{ filename:, checksum: "aaabbb111222" }], url:, work:) }

let(:filename) { "us_covid_2019.csv" }
let(:url) { "http://localhost.localdomain/us_covid_2019.csv" }
let(:work) { FactoryBot.create(:approved_work) }

describe "#files" do
it "lists files associated with the snapshot" do
expect(upload_snapshot.files).to eq([{ "filename" => filename, "checkSum" => "aaabbb111222" }])
expect(upload_snapshot.files).to eq([{ "filename" => filename, "checksum" => "aaabbb111222" }])
end
end

@@ -121,80 +121,63 @@
end
end

describe "#include?" do
subject(:upload_snapshot) { described_class.new(files: [{ filename: "fileone", checksum: "aaabbb111222" }, { filename: "filetwo", checksum: "aaabbb111222" }], url:, work:) }

let(:s3_file) { FactoryBot.build :s3_file, filename: "fileone" }
let(:other_file) { FactoryBot.build :s3_file, filename: "other" }
it "checks if a files is part of the snamshot via name" do
expect(upload_snapshot.include?(s3_file)).to be_truthy
expect(upload_snapshot.include?(other_file)).to be_falsey
end
end

describe "#index" do
subject(:upload_snapshot) { described_class.new(files: [{ filename: "fileone", checksum: "aaabbb111222" }, { filename: "filetwo", checksum: "aaabbb111222" }], url:, work:) }

let(:s3_file) { FactoryBot.build :s3_file, filename: "filetwo", checksum: "aaabbb111222" }
let(:other_file) { FactoryBot.build :s3_file, filename: "other" }
subject(:upload_snapshot) { described_class.new(files: [{ filename: "fileone", checksum: "aaabbb111222" }, { filename: "filetwo", checksum: "aaabbb111222" }], url:, work:) }

it "lists filenames associated with the snapshot" do
expect(upload_snapshot.index(s3_file)).to eq(1)
expect(upload_snapshot.index(other_file)).to be_nil
describe "#snapshot_deletions" do
it "detects deletions" do
work_changes = []
upload_snapshot.snapshot_deletions(work_changes, ["us_covid_other.csv"])
expect(work_changes.first[:action]).to eq "removed"
expect(work_changes.first[:filename]).to eq "us_covid_2019.csv"
end

it "checks both the filename and the checksum" do
s3_file.checksum = "otherchecksum"
expect(upload_snapshot.index(s3_file)).to be_nil
it "does not detect false positives" do
work_changes = []
upload_snapshot.snapshot_deletions(work_changes, ["us_covid_2019.csv"])
expect(work_changes.count).to be 0
end
end

describe "#match?" do
subject(:upload_snapshot) { described_class.new(files: [{ filename: "fileone", checksum: "aaabbb111222" }, { filename: "filetwo", checksum: "aaabbb111222" }], url:, work:) }

let(:s3_file) { FactoryBot.build :s3_file, filename: "filetwo", checksum: "aaabbb111222" }
let(:other_file) { FactoryBot.build :s3_file, filename: "other" }
subject(:upload_snapshot) { described_class.new(files: [{ filename: "fileone", checksum: "aaabbb111222" }, { filename: "filetwo", checksum: "aaabbb111222" }], url:, work:) }

it "lists filenames associated with the snapshot" do
expect(upload_snapshot.match?(s3_file)).to be_truthy
expect(upload_snapshot.match?(other_file)).to be_falsey
describe "#snapshot_modifications" do
let(:existing_file) { FactoryBot.build :s3_file, filename: "us_covid_2019.csv", checksum: "aaabbb111222" }
let(:s3_new_file) { FactoryBot.build :s3_file, filename: "one.txt", checksum: "111" }
let(:s3_modified_file) { FactoryBot.build :s3_file, filename: "us_covid_2019.csv", checksum: "zzzyyyy999888" }
it "detects additions and modifications" do
work_changes = []
upload_snapshot.snapshot_modifications(work_changes, [existing_file, s3_new_file, s3_modified_file])
expect(work_changes.find { |change| change[:action] == "added" && change[:filename] == "one.txt" }).to_not be nil
expect(work_changes.find { |change| change[:action] == "replaced" && change[:filename] == "us_covid_2019.csv" }).to_not be nil
end

it "checks both the filename and the checksum" do
s3_file.checksum = "otherchecksum"
expect(upload_snapshot.match?(s3_file)).to be_falsey
it "does not detect false positives" do
work_changes = []
upload_snapshot.snapshot_modifications(work_changes, [existing_file])
expect(work_changes.count).to be 0
end
end

describe "checksum compare" do
let(:file1_base64) { { filename: "fileone", checksum: "98691a716ece23a77735f37b5a421253" } }
let(:file2_md5) { { filename: "filetwo", checksum: "mGkacW7OI6d3NfN7WkISUw==" } }
subject(:upload_snapshot) { described_class.new(files: [file1_base64, file2_md5], url:, work:) }
describe "#compare_checksum" do
let(:checksum1) { "98691a716ece23a77735f37b5a421253" }
let(:checksum1_encoded) { "mGkacW7OI6d3NfN7WkISUw==" }
let(:checksum2) { "xx691a716ece23a77735f37b5a421253" }

it "matches identical checksums" do
s3_file = FactoryBot.build :s3_file, filename: "fileone", checksum: "98691a716ece23a77735f37b5a421253"
expect(upload_snapshot.match?(s3_file)).to be true
expect(described_class.checksum_compare(checksum1, checksum1)).to be true
end

it "detects differences" do
s3_file = FactoryBot.build :s3_file, filename: "fileone", checksum: "xx691a716ece23a77735f37b5a421253"
expect(upload_snapshot.match?(s3_file)).to be false
expect(described_class.checksum_compare(checksum1, checksum2)).to be false
end

it "matches encoded checksums" do
s3_file = FactoryBot.build :s3_file, filename: "fileone", checksum: "mGkacW7OI6d3NfN7WkISUw=="
expect(upload_snapshot.match?(s3_file)).to be true

s3_file = FactoryBot.build :s3_file, filename: "filetwo", checksum: "98691a716ece23a77735f37b5a421253"
expect(upload_snapshot.match?(s3_file)).to be true
expect(described_class.checksum_compare(checksum1, checksum1_encoded)).to be true
end

it "does not cause issues when the checksum is nil" do
s3_file = FactoryBot.build :s3_file, filename: "fileone"
s3_file.checksum = nil
expect(upload_snapshot.match?(s3_file)).to be false
expect(described_class.checksum_compare(nil, checksum1)).to be false
expect(described_class.checksum_compare(checksum1, nil)).to be false
end

it "return false if the encoding is wrong" do
expect(described_class.checksum_compare(checksum1, checksum1_encoded + "xxx")).to be false
end
end
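
As a standalone illustration of the hex-versus-base64 situation these specs cover (digest of the string 'hello'; not code from this commit), the two representations look different but encode the same 16 bytes, which is why `checksum_compare` treats them as equal:

require "digest"
require "base64"

hex_md5    = Digest::MD5.hexdigest("hello")                 # "5d41402abc4b2a76b9719d911017c592"
base64_md5 = Base64.strict_encode64([hex_md5].pack("H*"))   # "XUFAKrxLKna5cZ2REBfFkg=="

# Same underlying digest once the hex string is decoded to binary.
[hex_md5].pack("H*") == Base64.decode64(base64_md5)         # => true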

