Skip to content

Commit

Permalink
Use purl_fetcher-client to iterate documents from purl + purl-fetcher
Browse files Browse the repository at this point in the history
  • Loading branch information
cbeer committed Mar 5, 2021
1 parent 6dbbaca commit c3acb68
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 191 deletions.
8 changes: 3 additions & 5 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,15 @@ gem 'i18n'
gem 'manticore', platform: :jruby
gem 'rake'
gem 'ruby-kafka'
gem 'stanford-mods'
gem 'iso-639', '< 0.3' # v0.3+ requires ruby 2.6+ (and our jruby is at 2.5)
gem 'purl_fetcher-client', '~> 0.4'
gem 'iso-639'
gem 'whenever'
gem 'honeybadger'
gem 'retriable'
gem 'mods_display'
gem 'statsd-ruby'
gem 'debouncer'
gem 'dor-rights-auth'
gem 'sequel'
gem 'mysql2', '< 0.5.3', platform: :mri
gem 'mysql2', '~> 0.5.3', platform: :mri
gem 'jdbc-mysql', '~> 5.1.0', platform: :jruby
gem 'rexml' # required for ruby 3

Expand Down
27 changes: 16 additions & 11 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,10 @@ GEM
i18n
rake (>= 10.0.0)
sshkit (>= 1.9.0)
capistrano-bundle_audit (0.2.1)
capistrano-bundle_audit (0.4.0)
bundler-audit (~> 0.5)
capistrano (~> 3.0)
capistrano-bundler (>= 1.4)
capistrano-bundler (2.0.1)
capistrano (~> 3.1)
capistrano-one_time_key (0.1.0)
Expand All @@ -37,9 +38,9 @@ GEM
diff-lcs (1.4.4)
digest-crc (0.6.3)
rake (>= 12.0.0, < 14.0.0)
dlss-capistrano (3.5.0)
dlss-capistrano (3.11.1)
capistrano (~> 3.0)
capistrano-bundle_audit (>= 0.1.0)
capistrano-bundle_audit (>= 0.3.0)
capistrano-one_time_key
capistrano-shared_configs
docile (1.3.5)
Expand Down Expand Up @@ -70,7 +71,7 @@ GEM
httpclient (2.8.3)
i18n (1.8.9)
concurrent-ruby (~> 1.0)
iso-639 (0.2.10)
iso-639 (0.3.5)
jdbc-mysql (5.1.47)
manticore (0.7.0-java)
openssl_pkcs8_pure
Expand All @@ -91,7 +92,7 @@ GEM
mods_display (0.8.0)
i18n
stanford-mods (~> 2.1)
mysql2 (0.5.2)
mysql2 (0.5.3)
net-scp (3.0.0)
net-ssh (>= 2.6.5, < 7.0.0)
net-ssh (6.1.0)
Expand All @@ -108,6 +109,12 @@ GEM
nokogiri
openssl_pkcs8_pure (0.0.0.2)
public_suffix (4.0.6)
purl_fetcher-client (0.4.1)
dor-rights-auth
http
mods_display
nokogiri
stanford-mods
racc (1.5.2)
racc (1.5.2-java)
rake (13.0.3)
Expand All @@ -129,7 +136,7 @@ GEM
ruby-kafka (1.3.0)
digest-crc
scrub_rb (1.0.1)
sequel (5.41.0)
sequel (5.42.0)
simplecov (0.21.2)
docile (~> 1.1)
simplecov-html (~> 0.11)
Expand Down Expand Up @@ -184,23 +191,21 @@ DEPENDENCIES
capistrano-shared_configs
debouncer
dlss-capistrano
dor-rights-auth
honeybadger
http
i18n
iso-639 (< 0.3)
iso-639
jdbc-mysql (~> 5.1.0)
manticore
mods_display
mysql2 (< 0.5.3)
mysql2 (~> 0.5.3)
purl_fetcher-client (~> 0.4)
rake
retriable
rexml
rspec
ruby-kafka
sequel
simplecov
stanford-mods
statsd-ruby
traject (~> 3.0)
traject-marc4j_reader
Expand Down
179 changes: 4 additions & 175 deletions lib/sdr_stuff.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
require 'http'
require 'mods_display'
require 'dor/rights_auth'
require 'purl_fetcher/client'

class SdrReader
attr_reader :input_stream
Expand All @@ -19,183 +17,14 @@ def each(*args, &block)
end
end

class PublicXmlRecord
attr_reader :druid
include ModsDisplay::ModelExtension
include ModsDisplay::ControllerExtension

class PublicXmlRecord < PurlFetcher::Client::PublicXmlRecord
# Class-level declaration: the block receives a model and returns that model's
# MODS serialized to a String (`model.mods.to_s`).
# NOTE(review): `mods_xml_source` is not defined in this file — presumably it
# comes from the mods_display gem (directly or through the parent class); confirm.
mods_xml_source do |model|
model.mods.to_s
end
# Empty configuration block: no settings are supplied here.
# NOTE(review): `configure_mods_display` is likewise defined elsewhere
# (presumably mods_display); confirm whether an empty block is still required.
configure_mods_display do
end

# Issue a GET request for +url+ and hand back the response body.
# The HTTP client depends on the runtime: Manticore when JRUBY_VERSION is
# defined, the HTTP client otherwise.
#
# @param url [String] the address to request
# @return [Object, nil] the response body when the request succeeded (code 200
#   under JRuby, an "ok" status otherwise); nil for any other outcome
def self.fetch(url)
  unless defined?(JRUBY_VERSION)
    reply = HTTP.get(url)
    return reply.status.ok? ? reply.body : nil
  end

  reply = Manticore.get(url)
  reply.code == 200 ? reply.body : nil
end

attr_reader :purl_url

# Remember which object this record describes and which PURL host to use.
#
# @param druid [String] identifier of the object; stored in @druid
# @param purl_url [String] base URL of the PURL service; stored in @purl_url
#   (defaults to 'https://purl.stanford.edu')
def initialize(druid, purl_url: 'https://purl.stanford.edu')
  @druid, @purl_url = druid, purl_url
end

# Identifier for this record: the catkey when there is one, otherwise the druid.
#
# @return [String]
def searchworks_id
  key = catkey
  return druid if key.nil?

  key
end

# Looks up the otherId[@name='catkey'] element under identityMetadata in the
# public XML and passes its value through `presence`.
# NOTE(review): `presence` is not defined in this file — presumably
# ActiveSupport's, which maps blank strings to nil; confirm it is loaded.
#
# @return [String, nil] the catkey, or nil if there is no catkey
def catkey
  catkey_nodes = public_xml_doc.xpath("/publicObject/identityMetadata/otherId[@name='catkey']")
  get_value(catkey_nodes).presence
end

# Looks up the objectLabel element under identityMetadata in the public XML.
#
# @return [String, nil] the objectLabel value, or nil if there is no objectLabel
def label
  label_nodes = public_xml_doc.xpath('/publicObject/identityMetadata/objectLabel')
  get_value(label_nodes)
end

# Content of the first entry in a node list.
#
# @param node [#first, nil] a list of nodes (each responding to #content), or nil
# @return [Object, nil] `content` of the first entry; nil when +node+ is nil
#   or has no first entry
def get_value(node)
  leading = node && node.first
  leading ? leading.content : nil
end

# Memoized Stanford::Mods::Record built from this record's MODS.
# The record is created once, loaded via `from_str(mods.to_s)`, and cached in
# @smods_rec for later calls.
#
# @return [Stanford::Mods::Record]
def stanford_mods
  return @smods_rec if @smods_rec

  record = Stanford::Mods::Record.new
  record.from_str(mods.to_s)
  @smods_rec = record
end

# Memoized result of `render_mods_display(self)`; cached in @mods_display.
#
# @return [Object] whatever `render_mods_display` returns for this record
def mods_display
  return @mods_display if @mods_display

  @mods_display = render_mods_display(self)
end

# Whether `public_xml` returned anything.
#
# @return [Boolean] true when `public_xml` is truthy, false otherwise
def public_xml?
  public_xml ? true : false
end

# Raw public XML for this druid, fetched from "<purl_url>/<druid>.xml" through
# the class-level `fetch` and cached in @public_xml.
# A nil fetch result is not cached, so the next call tries again.
#
# @return [Object, nil] whatever `fetch` returned for the XML URL
def public_xml
  @public_xml ||= begin
    xml_location = "#{purl_url}/#{druid}.xml"
    self.class.fetch(xml_location)
  end
end

# Parsed form of `public_xml`, produced by Nokogiri::XML and cached in
# @public_xml_doc.
#
# @return [Object] the document returned by Nokogiri::XML
def public_xml_doc
  return @public_xml_doc if @public_xml_doc

  @public_xml_doc = Nokogiri::XML(public_xml)
end

# MODS for this object, cached in @mods.
#
# Uses the first mods:mods element embedded under /publicObject in the public
# XML when there is one; otherwise fetches "<purl_url>/<druid>.mods" and parses
# it with Nokogiri::XML.
#
# @return [Object] the embedded MODS node, or the parsed stand-alone MODS document
def mods
  @mods ||= begin
    # Run the namespaced query once; `first` is nil when nothing matched.
    embedded = public_xml_doc.xpath('/publicObject/mods:mods', mods: 'http://www.loc.gov/mods/v3').first
    embedded || Nokogiri::XML(self.class.fetch("#{purl_url}/#{druid}.mods"))
  end
end

# Rights object parsed from `rights_xml` by ::Dor::RightsAuth.parse; cached in
# @rights.
#
# @return [Object] the value returned by ::Dor::RightsAuth.parse
def rights
  return @rights if @rights

  @rights = ::Dor::RightsAuth.parse(rights_xml)
end

# Delegates to the parsed rights.
#
# @return [Object] the result of `rights.world_unrestricted?`
def public?
  access = rights
  access.world_unrestricted?
end

# Delegates to the parsed rights.
#
# @return [Object] the result of `rights.stanford_only_unrestricted?`
def stanford_only?
  access = rights
  access.stanford_only_unrestricted?
end

# String form of every rightsMetadata element found anywhere in the public XML
# (XPath '//rightsMetadata'); cached in @rights_xml.
#
# @return [String]
def rights_xml
  return @rights_xml if @rights_xml

  rights_nodes = public_xml_doc.xpath('//rightsMetadata')
  @rights_xml = rights_nodes.to_s
end

# Whether the public XML marks this object as a collection.
#
# Checks every objectType element in the document (XPath '//objectType') and
# compares its text, case-insensitively, against "collection" and "set".
# Earlier revisions returned the index of the first match (or nil) despite
# documenting a boolean; truthiness is unchanged, so `if is_collection` callers
# behave the same.
#
# @return [Boolean] true if any objectType is "collection" or "set", false otherwise
def is_collection
  public_xml_doc.xpath('//objectType').any? do |type_node|
    %w[collection set].include?(type_node.text.downcase)
  end
end

# Thumbnail reference handed to the SearchWorks UI for objects that need one.
# Collections never get a thumbnail; other objects get `encoded_thumb` only
# when their DOR content type is book, image, manuscript, map or webarchive-seed.
#
# @return [String, nil] the encoded thumbnail, or nil when none applies
def thumb
  return nil if is_collection

  thumb_types = %w(book image manuscript map webarchive-seed)
  thumb_types.include?(dor_content_type) ? encoded_thumb : nil
end

# Value of the type attribute on the DOR object's contentMetadata element.
# Background on the possible values:
# https://consul.stanford.edu/display/chimera/DOR+content+types%2C+resource+types+and+interpretive+metadata
# https://consul.stanford.edu/display/chimera/Summary+of+Content+Types%2C+Resource+Types+and+their+behaviors
#
# @return [String] text of the matching '//contentMetadata/@type' attributes
def dor_content_type
  type_attributes = public_xml_doc.xpath('//contentMetadata/@type')
  type_attributes.text
end

# Thumbnail named in the public XML, falling back to the first image when no
# thumb node exists.
#
# - exactly one thumb node: its content
# - no thumb node: the first entry of `parse_sw_image_ids` (nil if there are none)
# - more than one thumb node, or no document: nil
#
# @return [String, nil] thumb filename with druid prepended,
#   e.g. oo000oo0001/filename withspace.jp2
def parse_thumb
  doc = public_xml_doc
  return if doc.nil?

  thumb_nodes = doc.xpath('//thumb')
  case thumb_nodes.size
  when 1 then thumb_nodes.first.content
  # parse_sw_image_ids re-queries the document on every call, so ask once;
  # `first` already yields nil for an empty list.
  when 0 then parse_sw_image_ids.first
  end
end

# the druid and id attribute of resource/file and objectId and fileId of the
# resource/externalFile elements that match the image, page, or thumb resource type, including extension
# Also, prepends the corresponding druid and / specifically for Searchworks use
# @return [Array<String>] filenames
def parse_sw_image_ids
public_xml_doc.xpath('//resource[@type="page" or @type="image" or @type="thumb"]').map do |node|
node.xpath('./file[@mimetype="image/jp2"]/@id').map{ |x| "#{@druid.gsub('druid:','')}/" + x } << node.xpath('./externalFile[@mimetype="image/jp2"]').map do |y|
"#{y.attributes['objectId'].text.split(':').last}" + "/" + "#{y.attributes['fileId']}"
end
end.flatten
end

# Records for every druid this object points at through the
# 'isMemberOfCollection' predicate; built once and cached in @collections.
# Each record shares this record's purl_url.
#
# @return [Array<PublicXmlRecord>]
def collections
  return @collections if @collections

  parent_druids = predicate_druids('isMemberOfCollection')
  @collections = parent_druids.map { |parent| PublicXmlRecord.new(parent, purl_url: purl_url) }
end

# Records for every druid this object points at through the 'isConstituentOf'
# predicate; built once and cached in @constituents.
# Each record shares this record's purl_url.
#
# @return [Array<PublicXmlRecord>]
def constituents
  return @constituents if @constituents

  parent_druids = predicate_druids('isConstituentOf')
  @constituents = parent_druids.map { |parent| PublicXmlRecord.new(parent, purl_url: purl_url) }
end

# URI-encoded form of `parse_thumb`, with the slash between druid and filename
# encoded as well.
#
# @return [String, nil] thumb filename with druid prepended,
#   e.g. oo000oo0001%2Ffilename%20withspace.jp2; nil when there is no thumb
def encoded_thumb
  raw_thumb = parse_thumb
  return unless raw_thumb

  # the druid is whatever precedes the first slash
  druid_part = raw_thumb.split('/').first
  # the filename is everything after the druid and its trailing slash
  file_part = raw_thumb.split(/[a-zA-Z]{2}[0-9]{3}[a-zA-Z]{2}[0-9]{4}[\/]/).last
  [druid_part, ERB::Util.url_encode(file_part)].join('%2F')
end

# Druids referenced by a given predicate in the RDF section of the public XML.
#
# Reads the rdf:resource attribute of every matching predicate element under
# /publicObject/rdf:RDF/rdf:Description, drops empty values, and keeps the part
# of each value after 'druid:'.
#
# @param predicate [String] element name of the relationship, e.g. 'isMemberOfCollection'
# @param predicate_ns [String] namespace URI the predicate lives in
# @return [Array<String>] the druids (e.g. ww123yy1234); empty when nothing matches
def predicate_druids(predicate, predicate_ns = 'info:fedora/fedora-system:def/relations-external#')
  namespaces = { 'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', 'pred_ns' => predicate_ns }
  resource_path = "/publicObject/rdf:RDF/rdf:Description/pred_ns:#{predicate}/@rdf:resource"
  resource_values = public_xml_doc.xpath(resource_path, namespaces).map(&:value)
  resource_values.reject(&:empty?).map { |resource| resource.split('druid:').last }
end

def druid_tree
druid.match(/(..)(...)(..)(....)/).captures.join('/')
# PURL base URL for this record, read from the :purl_url entry of @options.
# NOTE(review): @options is not assigned in this file — presumably the parent
# class sets it; confirm.
#
# @return [Object, nil] the configured value, or nil when the key is absent
def purl_url
  configured = @options
  configured[:purl_url]
end
end

0 comments on commit c3acb68

Please sign in to comment.