Skip to content
This repository has been archived by the owner on Jan 8, 2022. It is now read-only.

Add an indexer to avoid going to lyberservices #461

Merged
merged 8 commits into from
Aug 13, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ group :development, :test do
gem 'rubocop', '~> 0.58.1'
# gem 'rubocop-rspec', '~> 1.5'
gem 'rails-controller-testing'
gem 'webmock'
end

group :development do
Expand Down
9 changes: 9 additions & 0 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,8 @@ GEM
config (2.2.1)
deep_merge (~> 1.2, >= 1.2.1)
dry-validation (~> 1.0, >= 1.0.0)
crack (0.4.3)
safe_yaml (~> 1.0.0)
crass (1.0.6)
daemons (1.3.1)
deep_merge (1.2.1)
Expand Down Expand Up @@ -278,6 +280,7 @@ GEM
haml (5.1.2)
temple (>= 0.8.0)
tilt
hashdiff (1.0.1)
honeybadger (4.7.0)
hooks (0.4.1)
uber (~> 0.0.14)
Expand Down Expand Up @@ -485,6 +488,7 @@ GEM
mime-types
nokogiri
rest-client
safe_yaml (1.0.5)
sass (3.7.4)
sass-listen (~> 4.0.0)
sass-listen (4.0.0)
Expand Down Expand Up @@ -553,6 +557,10 @@ GEM
i18n
warden (1.2.8)
rack (>= 2.0.6)
webmock (3.8.3)
addressable (>= 2.3.6)
crack (>= 0.3.2)
hashdiff (>= 0.4.0, < 2.0.0)
websocket-driver (0.7.3)
websocket-extensions (>= 0.1.0)
websocket-extensions (0.1.5)
Expand Down Expand Up @@ -615,6 +623,7 @@ DEPENDENCIES
sqlite3 (~> 1.3.13)
uglifier (>= 1.0.3)
validates_email_format_of
webmock
whenever (~> 0.9)

BUNDLED WITH
Expand Down
3 changes: 2 additions & 1 deletion app/controllers/hydrus_solr_controller.rb
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ def reindex
render(plain: msg)
elsif is_hydrus_object(obj)
# It's a Hydrus object: re-solrize it and render the SOLR document.
solr_doc = obj.to_solr
indexer = Indexer.for(obj)
solr_doc = indexer.to_solr
solr.add(solr_doc, add_attributes: { commitWithin: 5000 })
msg = "#{msg}: updated SOLR index: class=#{obj.class}"
index_logger.info(msg)
Expand Down
13 changes: 13 additions & 0 deletions app/indexers/administrative_metadata_datastream_indexer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# frozen_string_literal: true

# Produces the solr fields contributed by an object's administrativeMetadata
# datastream by asking that datastream to solrize itself.
class AdministrativeMetadataDatastreamIndexer
  attr_reader :resource

  # @param resource [Object] the object being indexed; must respond to #administrativeMetadata
  def initialize(resource:)
    @resource = resource
  end

  # @return [Hash] the partial solr document for administrativeMetadata
  def to_solr
    datastream = resource.administrativeMetadata
    datastream.to_solr
  end
end
50 changes: 50 additions & 0 deletions app/indexers/administrative_tag_indexer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# frozen_string_literal: true

# Index administrative tags for an object.
# NOTE: Most of this code was extracted from the dor-services gem:
# https://github.com/sul-dlss/dor-services/blob/v9.0.0/lib/dor/datastreams/identity_metadata_ds.rb#L196-L218
class AdministrativeTagIndexer
  TAG_PART_DELIMITER = ' : '
  TAGS_TO_INDEX = ['Project', 'Registered By'].freeze

  attr_reader :resource

  # @param resource [Object] the object being indexed; must respond to #pid
  def initialize(resource:)
    @resource = resource
  end

  # Builds the tag fields: every tag verbatim, every tag prefix ("exploded"),
  # and a dedicated "<prefix>_tag_ssim" field for the prefixes listed in TAGS_TO_INDEX.
  # @return [Hash] the partial solr document for administrative tags
  def to_solr
    administrative_tags.each_with_object({ 'tag_ssim' => [], 'exploded_tag_ssim' => [] }) do |tag, doc|
      doc['tag_ssim'] << tag
      doc['exploded_tag_ssim'].concat(exploded_tags_from(tag))

      head, tail = tag.split(TAG_PART_DELIMITER, 2)
      # Only whitelisted prefixes that actually carry a value get their own field.
      next unless TAGS_TO_INDEX.include?(head) && !tail.nil?

      field_name = "#{head.downcase.strip.gsub(/\s/, '_')}_tag_ssim"
      (doc[field_name] ||= []) << tail.strip
    end
  end

  private

  # Expands a tag into each of its leading prefixes, ending with the full tag.
  # e.g. "A : B : C" becomes ["A", "A : B", "A : B : C"].
  def exploded_tags_from(tag)
    parts = tag.split(TAG_PART_DELIMITER)
    (1..parts.size).map { |length| parts.first(length).join(TAG_PART_DELIMITER) }
  end

  # Fetches the tag list through Dor::Services::Client; a not-found response
  # is treated as "no tags".
  def administrative_tags
    Dor::Services::Client.object(resource.pid).administrative_tags.list
  rescue Dor::Services::Client::NotFoundResponse
    []
  end
end
26 changes: 26 additions & 0 deletions app/indexers/composite_indexer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# frozen_string_literal: true

# Borrowed from https://github.com/samvera/valkyrie/blob/master/lib/valkyrie/persistence/solr/composite_indexer.rb
# Bundles several indexer classes so they can be instantiated and run as one.
class CompositeIndexer
  attr_reader :indexers

  # @param indexers [Array<Class>] indexer classes that accept `resource:` and respond to #to_solr
  def initialize(*indexers)
    @indexers = indexers
  end

  # Instance-level #new so a configured composite can stand in where an indexer class is expected.
  # @return [CompositeIndexer::Instance]
  def new(resource:)
    Instance.new(indexers, resource: resource)
  end

  # Holds one instantiated sub-indexer per configured class for a single resource.
  class Instance
    attr_reader :indexers, :resource

    def initialize(indexers, resource:)
      @resource = resource
      @indexers = indexers.map { |indexer_class| indexer_class.new(resource: resource) }
    end

    # Later sub-indexers overwrite keys produced by earlier ones.
    # @return [Hash] the merged solr document for all the sub-indexers
    def to_solr
      indexers.each_with_object({}) do |indexer, merged|
        merged.merge!(indexer.to_solr)
      end
    end
  end
end
60 changes: 60 additions & 0 deletions app/indexers/content_metadata_datastream_indexer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# frozen_string_literal: true

# Summarises an object's contentMetadata XML (file counts, sizes, mime types,
# roles, resource types, first shelved JP2) as solr fields.
class ContentMetadataDatastreamIndexer
  attr_reader :resource

  # @param resource [Object] the object being indexed; its contentMetadata must respond to #ng_xml
  def initialize(resource:)
    @resource = resource
  end

  # @return [Hash] the partial solr document for contentMetadata; empty when
  #   the document has no root element or the root has no type attribute
  def to_solr
    root = doc.root
    # An empty document has no root element; treat it like a missing type attribute.
    return {} unless root && root['type']

    preserved_size = 0
    shelved_size = 0
    counts = Hash.new(0) # default count is zero
    resource_type_counts = Hash.new(0) # default count is zero
    file_roles = ::Set.new
    mime_types = ::Set.new
    first_shelved_image = nil

    # Named resource_node so the block parameter does not shadow the #resource reader.
    sequenced_resource_nodes.each do |resource_node|
      counts['resource'] += 1
      resource_type_counts[resource_node['type']] += 1 if resource_node['type']
      resource_node.xpath('file').each do |file|
        counts['content_file'] += 1
        preserved_size += file['size'].to_i if file['preserve'] == 'yes'
        if file['shelve'] == 'yes'
          shelved_size += file['size'].to_i
          counts['shelved_file'] += 1
          # A file element may lack an id attribute; &. keeps that from raising.
          first_shelved_image ||= file['id'] if file['id']&.end_with?('jp2')
        end
        mime_types << file['mimetype']
        file_roles << file['role'] if file['role']
      end
    end

    solr_doc = {
      'content_type_ssim' => root['type'],
      'content_file_mimetypes_ssim' => mime_types.to_a,
      'content_file_count_itsi' => counts['content_file'],
      'shelved_content_file_count_itsi' => counts['shelved_file'],
      'resource_count_itsi' => counts['resource'],
      'preserved_size_dbtsi' => preserved_size, # double (trie) to support very large sizes
      'shelved_size_dbtsi' => shelved_size # double (trie) to support very large sizes
    }
    solr_doc['resource_types_ssim'] = resource_type_counts.keys unless resource_type_counts.empty?
    solr_doc['content_file_roles_ssim'] = file_roles.to_a unless file_roles.empty?
    resource_type_counts.each do |key, count|
      solr_doc["#{key}_resource_count_itsi"] = count
    end
    # first_shelved_image is neither indexed nor multiple
    solr_doc['first_shelved_image_ss'] = first_shelved_image unless first_shelved_image.nil?
    solr_doc
  end

  private

  def doc
    @doc ||= resource.contentMetadata.ng_xml
  end

  # Resource elements ordered by their numeric sequence attribute. The original
  # position is the tie-breaker, so equal (or missing) sequences keep document order.
  def sequenced_resource_nodes
    doc.xpath('contentMetadata/resource')
       .each_with_index
       .sort_by { |node, position| [node['sequence'].to_i, position] }
       .map(&:first)
  end
end
30 changes: 30 additions & 0 deletions app/indexers/data_indexer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# frozen_string_literal: true

# Indexing provided by ActiveFedora
class DataIndexer
  include ActiveFedora::Indexing

  attr_reader :resource

  # @param resource [Object] the object being indexed; receives the delegated readers below
  def initialize(resource:)
    @resource = resource
  end

  # we need to override this until https://github.com/samvera/active_fedora/pull/1371
  # has been released
  #
  # Writes create/modified timestamps, object state and model into solr_doc,
  # sets the document id to the pid, then hands the document to
  # solrize_relationships and returns its result.
  # @param solr_doc [Hash] document to add fields to
  def to_solr(solr_doc = {})
    raw_created = create_date
    created = raw_created.is_a?(Time) ? raw_created : Time.parse(raw_created)
    raw_modified = modified_date
    modified = raw_modified.is_a?(Time) ? raw_modified : Time.parse(raw_modified)

    store_sortable = ->(field, value) { Solrizer.set_field(solr_doc, field, value, :stored_sortable) }
    store_sortable.call('system_create', created)
    store_sortable.call('system_modified', modified)
    store_sortable.call('object_state', state)
    store_sortable.call('active_fedora_model', has_model)

    solr_doc[SOLR_DOCUMENT_ID.to_sym] = pid
    solrize_relationships(solr_doc)
  end

  delegate :create_date, :modified_date, :state, :pid, :inner_object,
           :datastreams, :relationships, :has_model, to: :resource
end
13 changes: 13 additions & 0 deletions app/indexers/default_object_rights_datastream_indexer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# frozen_string_literal: true

# Produces the solr fields contributed by an object's defaultObjectRights
# datastream by asking that datastream to solrize itself.
class DefaultObjectRightsDatastreamIndexer
  attr_reader :resource

  # @param resource [Object] the object being indexed; must respond to #defaultObjectRights
  def initialize(resource:)
    @resource = resource
  end

  # @return [Hash] the partial solr document for defaultObjectRights
  def to_solr
    datastream = resource.defaultObjectRights
    datastream.to_solr
  end
end
54 changes: 54 additions & 0 deletions app/indexers/describable_indexer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# frozen_string_literal: true

# Builds the descriptive solr fields for an object from the values returned
# by its stanford_mods accessor methods.
class DescribableIndexer
  attr_reader :resource

  # @param resource [Object] the object being indexed; must respond to #stanford_mods
  def initialize(resource:)
    @resource = resource
  end

  # @return [Hash] the partial solr document for describable concerns
  def to_solr
    add_metadata_format_to_solr_doc.merge(add_mods_to_solr_doc)
  end

  # @return [Hash] the constant metadata format field
  def add_metadata_format_to_solr_doc
    { 'metadata_format_ssim' => 'mods' }
  end

  # rubocop:disable Style/SymbolArray
  # Calls each listed stanford_mods method and copies any non-empty result
  # into the solr fields mapped to it.
  # @return [Hash] solr fields derived from stanford_mods
  def add_mods_to_solr_doc
    solr_doc = {}
    # Keys are either a method name or a [method, *args] array to send to stanford_mods.
    mods_sources = {
      sw_title_display: %w[sw_display_title_tesim],
      main_author_w_date: %w[sw_author_ssim sw_author_tesim],
      sw_language_facet: %w[sw_language_ssim],
      sw_genre: %w[sw_genre_ssim],
      format_main: %w[sw_format_ssim],
      topic_facet: %w[sw_topic_ssim],
      era_facet: %w[sw_subject_temporal_ssim],
      geographic_facet: %w[sw_subject_geographic_ssim],
      %i[term_values typeOfResource] => %w[mods_typeOfResource_ssim],
      pub_year_sort_str: %w[sw_pub_date_sort_ssi],
      pub_year_display_str: %w[sw_pub_date_facet_ssi]
    }

    mods_sources.each_pair do |meth, solr_keys|
      # Array() handles both key shapes without mutating the array that serves as a hash key.
      vals = resource.stanford_mods.send(*Array(meth))

      next if vals.nil? || (vals.respond_to?(:empty?) && vals.empty?)

      solr_keys.each do |key|
        # splat to avoid multi-dimensional array: push values, not the array
        (solr_doc[key] ||= []).push(*vals)
      end
    end

    # convert multivalued fields to single value
    %w[sw_pub_date_sort_ssi sw_pub_date_facet_ssi].each do |key|
      solr_doc[key] = solr_doc[key].first unless solr_doc[key].nil?
    end
    solr_doc
  end
  # rubocop:enable Style/SymbolArray
end
13 changes: 13 additions & 0 deletions app/indexers/descriptive_metadata_datastream_indexer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# frozen_string_literal: true

# Produces the solr fields contributed by an object's descMetadata
# datastream by asking that datastream to solrize itself.
class DescriptiveMetadataDatastreamIndexer
  attr_reader :resource

  # @param resource [Object] the object being indexed; must respond to #descMetadata
  def initialize(resource:)
    @resource = resource
  end

  # @return [Hash] the partial solr document for descMetadata
  def to_solr
    datastream = resource.descMetadata
    datastream.to_solr
  end
end
28 changes: 28 additions & 0 deletions app/indexers/embargo_metadata_datastream_indexer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# frozen_string_literal: true

# Produces the embargo-related solr fields from an object's embargoMetadata datastream.
class EmbargoMetadataDatastreamIndexer
  attr_reader :resource

  # @param resource [Object] the object being indexed; must respond to #embargoMetadata
  def initialize(resource:)
    @resource = resource
  end

  # Status fields are always present; each release-date field is added only
  # when the corresponding date is present, formatted as a UTC timestamp.
  # @return [Hash] the partial solr document for embargoMetadata
  def to_solr
    solr_doc = {
      'embargo_status_ssim' => embargo_status,
      'twenty_pct_status_ssim' => Array(twenty_pct_status)
    }
    twenty_pct_date = twenty_pct_release_date
    if release_date.present?
      solr_doc['embargo_release_dtsim'] = Array(release_date.utc.strftime('%FT%TZ'))
    end
    if twenty_pct_date.present?
      solr_doc['twenty_pct_release_embargo_release_dtsim'] = Array(twenty_pct_date.utc.strftime('%FT%TZ'))
    end
    solr_doc
  end

  # rubocop:disable Lint/UselessAccessModifier
  private

  # rubocop:enable Lint/UselessAccessModifier

  delegate :embargoMetadata, to: :resource
  delegate :embargo_status, :twenty_pct_status, :twenty_pct_release_date, :release_date, to: :embargoMetadata
end
Loading