diff --git a/Gemfile b/Gemfile index 9a79b0530..53eacedb9 100644 --- a/Gemfile +++ b/Gemfile @@ -54,6 +54,7 @@ group :development, :test do gem 'rubocop', '~> 0.58.1' # gem 'rubocop-rspec', '~> 1.5' gem 'rails-controller-testing' + gem 'webmock' end group :development do diff --git a/Gemfile.lock b/Gemfile.lock index 44e12de48..b5f2658d3 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -154,6 +154,8 @@ GEM config (2.2.1) deep_merge (~> 1.2, >= 1.2.1) dry-validation (~> 1.0, >= 1.0.0) + crack (0.4.3) + safe_yaml (~> 1.0.0) crass (1.0.6) daemons (1.3.1) deep_merge (1.2.1) @@ -278,6 +280,7 @@ GEM haml (5.1.2) temple (>= 0.8.0) tilt + hashdiff (1.0.1) honeybadger (4.7.0) hooks (0.4.1) uber (~> 0.0.14) @@ -485,6 +488,7 @@ GEM mime-types nokogiri rest-client + safe_yaml (1.0.5) sass (3.7.4) sass-listen (~> 4.0.0) sass-listen (4.0.0) @@ -553,6 +557,10 @@ GEM i18n warden (1.2.8) rack (>= 2.0.6) + webmock (3.8.3) + addressable (>= 2.3.6) + crack (>= 0.3.2) + hashdiff (>= 0.4.0, < 2.0.0) websocket-driver (0.7.3) websocket-extensions (>= 0.1.0) websocket-extensions (0.1.5) @@ -615,6 +623,7 @@ DEPENDENCIES sqlite3 (~> 1.3.13) uglifier (>= 1.0.3) validates_email_format_of + webmock whenever (~> 0.9) BUNDLED WITH diff --git a/app/controllers/hydrus_solr_controller.rb b/app/controllers/hydrus_solr_controller.rb index e89e92452..04b1fa8fc 100644 --- a/app/controllers/hydrus_solr_controller.rb +++ b/app/controllers/hydrus_solr_controller.rb @@ -44,7 +44,8 @@ def reindex render(plain: msg) elsif is_hydrus_object(obj) # It's a Hydrus object: re-solrize it and render the SOLR document. - solr_doc = obj.to_solr + indexer = Indexer.for(obj) + solr_doc = indexer.to_solr solr.add(solr_doc, add_attributes: { commitWithin: 5000 }) msg = "#{msg}: updated SOLR index: class=#{obj.class}" index_logger.info(msg) diff --git a/app/indexers/administrative_metadata_datastream_indexer.rb b/app/indexers/administrative_metadata_datastream_indexer.rb new file mode 100644 index 000000000..fbbd17ec8 --- /dev/null +++ b/app/indexers/administrative_metadata_datastream_indexer.rb @@ -0,0 +1,13 @@ +# frozen_string_literal: true + +class AdministrativeMetadataDatastreamIndexer + attr_reader :resource + def initialize(resource:) + @resource = resource + end + + # @return [Hash] the partial solr document for administrativeMetadata + def to_solr + resource.administrativeMetadata.to_solr + end +end diff --git a/app/indexers/administrative_tag_indexer.rb b/app/indexers/administrative_tag_indexer.rb new file mode 100644 index 000000000..974c2cc2b --- /dev/null +++ b/app/indexers/administrative_tag_indexer.rb @@ -0,0 +1,50 @@ +# frozen_string_literal: true + +# Index administrative tags for an object. +# NOTE: Most of this code was extracted from the dor-services gem: +# https://github.com/sul-dlss/dor-services/blob/v9.0.0/lib/dor/datastreams/identity_metadata_ds.rb#L196-L218 +class AdministrativeTagIndexer + TAG_PART_DELIMITER = ' : ' + TAGS_TO_INDEX = ['Project', 'Registered By'].freeze + + attr_reader :resource + + def initialize(resource:) + @resource = resource + end + + # @return [Hash] the partial solr document for administrative tags + def to_solr + solr_doc = { 'tag_ssim' => [], 'exploded_tag_ssim' => [] } + administrative_tags.each do |tag| + solr_doc['tag_ssim'] << tag + solr_doc['exploded_tag_ssim'] += exploded_tags_from(tag) + + tag_prefix, rest = tag.split(TAG_PART_DELIMITER, 2) + next if !TAGS_TO_INDEX.include?(tag_prefix) || rest.nil? 
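+      # only tags whose prefix is in TAGS_TO_INDEX (e.g. "Registered By") also get a prefix-specific field below, such as "registered_by_tag_ssim"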
+ + prefix = tag_prefix.downcase.strip.gsub(/\s/, '_') + (solr_doc["#{prefix}_tag_ssim"] ||= []) << rest.strip + end + solr_doc + end + + private + + # solrize each possible prefix for the tag, inclusive of the full tag. + # e.g., for a tag such as "A : B : C", this will solrize to an _ssim field + # that contains ["A", "A : B", "A : B : C"]. + def exploded_tags_from(tag) + tag_parts = tag.split(TAG_PART_DELIMITER) + + 1.upto(tag_parts.count).map do |i| + tag_parts.take(i).join(TAG_PART_DELIMITER) + end + end + + def administrative_tags + Dor::Services::Client.object(resource.pid).administrative_tags.list + rescue Dor::Services::Client::NotFoundResponse + [] + end +end diff --git a/app/indexers/composite_indexer.rb b/app/indexers/composite_indexer.rb new file mode 100644 index 000000000..cfc380c7a --- /dev/null +++ b/app/indexers/composite_indexer.rb @@ -0,0 +1,26 @@ +# frozen_string_literal: true + +# Borrowed from https://github.com/samvera/valkyrie/blob/master/lib/valkyrie/persistence/solr/composite_indexer.rb +class CompositeIndexer + attr_reader :indexers + def initialize(*indexers) + @indexers = indexers + end + + def new(resource:) + Instance.new(indexers, resource: resource) + end + + class Instance + attr_reader :indexers, :resource + def initialize(indexers, resource:) + @resource = resource + @indexers = indexers.map { |i| i.new(resource: resource) } + end + + # @return [Hash] the merged solr document for all the sub-indexers + def to_solr + indexers.map(&:to_solr).inject({}, &:merge) + end + end +end diff --git a/app/indexers/content_metadata_datastream_indexer.rb b/app/indexers/content_metadata_datastream_indexer.rb new file mode 100644 index 000000000..7b05d74f4 --- /dev/null +++ b/app/indexers/content_metadata_datastream_indexer.rb @@ -0,0 +1,60 @@ +# frozen_string_literal: true + +class ContentMetadataDatastreamIndexer + attr_reader :resource + def initialize(resource:) + @resource = resource + end + + # @return [Hash] the partial solr document for contentMetadata + def to_solr + return {} unless doc.root['type'] + + preserved_size = 0 + shelved_size = 0 + counts = Hash.new(0) # default count is zero + resource_type_counts = Hash.new(0) # default count is zero + file_roles = ::Set.new + mime_types = ::Set.new + first_shelved_image = nil + + doc.xpath('contentMetadata/resource').sort { |a, b| a['sequence'].to_i <=> b['sequence'].to_i }.each do |resource| + counts['resource'] += 1 + resource_type_counts[resource['type']] += 1 if resource['type'] + resource.xpath('file').each do |file| + counts['content_file'] += 1 + preserved_size += file['size'].to_i if file['preserve'] == 'yes' + shelved_size += file['size'].to_i if file['shelve'] == 'yes' + if file['shelve'] == 'yes' + counts['shelved_file'] += 1 + first_shelved_image ||= file['id'] if file['id'].end_with?('jp2') + end + mime_types << file['mimetype'] + file_roles << file['role'] if file['role'] + end + end + solr_doc = { + 'content_type_ssim' => doc.root['type'], + 'content_file_mimetypes_ssim' => mime_types.to_a, + 'content_file_count_itsi' => counts['content_file'], + 'shelved_content_file_count_itsi' => counts['shelved_file'], + 'resource_count_itsi' => counts['resource'], + 'preserved_size_dbtsi' => preserved_size, # double (trie) to support very large sizes + 'shelved_size_dbtsi' => shelved_size # double (trie) to support very large sizes + } + solr_doc['resource_types_ssim'] = resource_type_counts.keys unless resource_type_counts.empty? + solr_doc['content_file_roles_ssim'] = file_roles.to_a unless file_roles.empty? 
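+    # one count field per resource type, e.g. an "image" resource yields "image_resource_count_itsi"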
+ resource_type_counts.each do |key, count| + solr_doc["#{key}_resource_count_itsi"] = count + end + # first_shelved_image is neither indexed nor multiple + solr_doc['first_shelved_image_ss'] = first_shelved_image unless first_shelved_image.nil? + solr_doc + end + + private + + def doc + @doc ||= resource.contentMetadata.ng_xml + end +end diff --git a/app/indexers/data_indexer.rb b/app/indexers/data_indexer.rb new file mode 100644 index 000000000..44645a3d1 --- /dev/null +++ b/app/indexers/data_indexer.rb @@ -0,0 +1,30 @@ +# frozen_string_literal: true + +# Indexing provided by ActiveFedora +class DataIndexer + include ActiveFedora::Indexing + + attr_reader :resource + def initialize(resource:) + @resource = resource + end + + # we need to override this until https://github.com/samvera/active_fedora/pull/1371 + # has been released + def to_solr(solr_doc = {}) + c_time = create_date + c_time = Time.parse(c_time) unless c_time.is_a?(Time) + m_time = modified_date + m_time = Time.parse(m_time) unless m_time.is_a?(Time) + Solrizer.set_field(solr_doc, 'system_create', c_time, :stored_sortable) + Solrizer.set_field(solr_doc, 'system_modified', m_time, :stored_sortable) + Solrizer.set_field(solr_doc, 'object_state', state, :stored_sortable) + Solrizer.set_field(solr_doc, 'active_fedora_model', has_model, :stored_sortable) + solr_doc[SOLR_DOCUMENT_ID.to_sym] = pid + solr_doc = solrize_relationships(solr_doc) + solr_doc + end + + delegate :create_date, :modified_date, :state, :pid, :inner_object, + :datastreams, :relationships, :has_model, to: :resource +end diff --git a/app/indexers/default_object_rights_datastream_indexer.rb b/app/indexers/default_object_rights_datastream_indexer.rb new file mode 100644 index 000000000..01df1adce --- /dev/null +++ b/app/indexers/default_object_rights_datastream_indexer.rb @@ -0,0 +1,13 @@ +# frozen_string_literal: true + +class DefaultObjectRightsDatastreamIndexer + attr_reader :resource + def initialize(resource:) + @resource = resource + end + + # @return [Hash] the partial solr document for defaultObjectRights + def to_solr + resource.defaultObjectRights.to_solr + end +end diff --git a/app/indexers/describable_indexer.rb b/app/indexers/describable_indexer.rb new file mode 100644 index 000000000..29c6655fd --- /dev/null +++ b/app/indexers/describable_indexer.rb @@ -0,0 +1,54 @@ +# frozen_string_literal: true + +class DescribableIndexer + attr_reader :resource + def initialize(resource:) + @resource = resource + end + + # @return [Hash] the partial solr document for describable concerns + def to_solr + add_metadata_format_to_solr_doc.merge(add_mods_to_solr_doc) + end + + def add_metadata_format_to_solr_doc + { 'metadata_format_ssim' => 'mods' } + end + + # rubocop:disable Style/SymbolArray + def add_mods_to_solr_doc + solr_doc = {} + mods_sources = { + sw_title_display: %w[sw_display_title_tesim], + main_author_w_date: %w[sw_author_ssim sw_author_tesim], + sw_language_facet: %w[sw_language_ssim], + sw_genre: %w[sw_genre_ssim], + format_main: %w[sw_format_ssim], + topic_facet: %w[sw_topic_ssim], + era_facet: %w[sw_subject_temporal_ssim], + geographic_facet: %w[sw_subject_geographic_ssim], + %i[term_values typeOfResource] => %w[mods_typeOfResource_ssim], + pub_year_sort_str: %w[sw_pub_date_sort_ssi], + pub_year_display_str: %w[sw_pub_date_facet_ssi] + } + + mods_sources.each_pair do |meth, solr_keys| + vals = meth.is_a?(Array) ? resource.stanford_mods.send(meth.shift, *meth) : resource.stanford_mods.send(meth) + + next if vals.nil? || (vals.respond_to?(:empty?) 
&& vals.empty?) + + solr_keys.each do |key| + solr_doc[key] ||= [] + solr_doc[key].push(*vals) + end + # asterisk to avoid multi-dimensional array: push values, not the array + end + + # convert multivalued fields to single value + %w[sw_pub_date_sort_ssi sw_pub_date_facet_ssi].each do |key| + solr_doc[key] = solr_doc[key].first unless solr_doc[key].nil? + end + solr_doc + end + # rubocop:enable Style/SymbolArray +end diff --git a/app/indexers/descriptive_metadata_datastream_indexer.rb b/app/indexers/descriptive_metadata_datastream_indexer.rb new file mode 100644 index 000000000..e96820e08 --- /dev/null +++ b/app/indexers/descriptive_metadata_datastream_indexer.rb @@ -0,0 +1,13 @@ +# frozen_string_literal: true + +class DescriptiveMetadataDatastreamIndexer + attr_reader :resource + def initialize(resource:) + @resource = resource + end + + # @return [Hash] the partial solr document for descMetadata + def to_solr + resource.descMetadata.to_solr + end +end diff --git a/app/indexers/embargo_metadata_datastream_indexer.rb b/app/indexers/embargo_metadata_datastream_indexer.rb new file mode 100644 index 000000000..1b27811c2 --- /dev/null +++ b/app/indexers/embargo_metadata_datastream_indexer.rb @@ -0,0 +1,28 @@ +# frozen_string_literal: true + +class EmbargoMetadataDatastreamIndexer + attr_reader :resource + def initialize(resource:) + @resource = resource + end + + # @return [Hash] the partial solr document for embargoMetadata + def to_solr + { + 'embargo_status_ssim' => embargo_status, + 'twenty_pct_status_ssim' => Array(twenty_pct_status) + }.tap do |solr_doc| + rd20 = twenty_pct_release_date + solr_doc['embargo_release_dtsim'] = Array(release_date.utc.strftime('%FT%TZ')) if release_date.present? + solr_doc['twenty_pct_release_embargo_release_dtsim'] = Array(rd20.utc.strftime('%FT%TZ')) if rd20.present? + end + end + + # rubocop:disable Lint/UselessAccessModifier + private + + # rubocop:enable Lint/UselessAccessModifier + + delegate :embargoMetadata, to: :resource + delegate :embargo_status, :twenty_pct_status, :twenty_pct_release_date, :release_date, to: :embargoMetadata +end diff --git a/app/indexers/identifiable_indexer.rb b/app/indexers/identifiable_indexer.rb new file mode 100644 index 000000000..dc0df76e0 --- /dev/null +++ b/app/indexers/identifiable_indexer.rb @@ -0,0 +1,125 @@ +# frozen_string_literal: true + +# rubocop:disable Style/ClassVars +class IdentifiableIndexer + include SolrDocHelper + + INDEX_VERSION_FIELD = 'dor_services_version_ssi' + NS_HASH = { 'hydra' => 'http://projecthydra.org/ns/relations#', + 'fedora' => 'info:fedora/fedora-system:def/relations-external#', + 'rdf' => 'http://www.w3.org/1999/02/22-rdf-syntax-ns#' }.freeze + + FIELDS = { + collection: { + hydrus: 'hydrus_collection_title', + non_hydrus: 'nonhydrus_collection_title', + union: 'collection_title' + }, + apo: { + hydrus: 'hydrus_apo_title', + non_hydrus: 'nonhydrus_apo_title', + union: 'apo_title' + } + }.freeze + attr_reader :resource + def initialize(resource:) + @resource = resource + end + + ## Module-level variables, shared between ALL mixin includers (and ALL *their* includers/extenders)! 
+ ## used for caching found values + @@collection_hash = {} + @@apo_hash = {} + + # @return [Hash] the partial solr document for identifiable concerns + def to_solr + solr_doc = {} + solr_doc[INDEX_VERSION_FIELD] = Dor::VERSION + solr_doc['indexer_host_ssi'] = Socket.gethostname + solr_doc['indexed_at_dtsi'] = Time.now.utc.xmlschema + + add_solr_value(solr_doc, 'title_sort', resource.label, :string, [:stored_sortable]) + + rels_doc = Nokogiri::XML(resource.datastreams['RELS-EXT'].content) + apos = rels_doc.search('//rdf:RDF/rdf:Description/hydra:isGovernedBy', NS_HASH) + collections = rels_doc.search('//rdf:RDF/rdf:Description/fedora:isMemberOfCollection', NS_HASH) + solrize_related_obj_titles(solr_doc, apos, @@apo_hash, :apo) + solrize_related_obj_titles(solr_doc, collections, @@collection_hash, :collection) + solr_doc['public_dc_relation_tesim'] ||= solr_doc['collection_title_tesim'] if solr_doc['collection_title_tesim'] + solr_doc['metadata_source_ssi'] = identity_metadata_source + # This used to be added to the index by https://github.com/sul-dlss/dor-services/commit/11b80d249d19326ef591411ffeb634900e75c2c3 + # and was called dc_identifier_druid_tesim + # It is used to search based on druid. + solr_doc['objectId_tesim'] = [resource.pid, resource.pid.split(':').last] + solr_doc + end + + # @return [String] calculated value for Solr index + def identity_metadata_source + if resource.identityMetadata.otherId('catkey').first || + resource.identityMetadata.otherId('barcode').first + 'Symphony' + else + 'DOR' + end + end + + # Clears out the cache of items. Used primarily in testing. + def self.reset_cache! + @@collection_hash = {} + @@apo_hash = {} + end + + private + + def related_object_tags(object) + return [] unless object + + Dor::Services::Client.object(object.pid).administrative_tags.list + end + + # @param [Hash] solr_doc + # @param [Array] relationships + # @param [Hash] title_hash a cache for titles + # @param [Symbol] type either :apo or :collection + def solrize_related_obj_titles(solr_doc, relationships, title_hash, type) + # TODO: if you wanted to get a little fancier, you could also solrize a 2 level hierarchy and display using hierarchial facets, like + # ["SOURCE", "SOURCE : TITLE"] (e.g. ["Hydrus", "Hydrus : Special Collections"], see (exploded) tags in IdentityMetadataDS#to_solr). 
+ title_type = :symbol # we'll get an _ssim because of the type + title_attrs = [:stored_searchable] # we'll also get a _tesim from this attr + relationships.each do |rel_node| + rel_druid = rel_node['rdf:resource'] + next unless rel_druid # TODO: warning here would also be useful + + rel_druid = rel_druid.gsub('info:fedora/', '') + + # populate cache if necessary + unless title_hash.key?(rel_druid) + begin + related_obj = Dor.find(rel_druid) + related_obj_title = related_obj_display_title(related_obj, rel_druid) + is_from_hydrus = related_object_tags(related_obj).include?('Project : Hydrus') + title_hash[rel_druid] = { 'related_obj_title' => related_obj_title, 'is_from_hydrus' => is_from_hydrus } + rescue ActiveFedora::ObjectNotFoundError + # This may happen if the given APO or Collection does not exist (bad data) + title_hash[rel_druid] = { 'related_obj_title' => rel_druid, 'is_from_hydrus' => false } + end + end + + # cache should definitely be populated, so just use that to write solr field + if title_hash[rel_druid]['is_from_hydrus'] + add_solr_value(solr_doc, FIELDS.dig(type, :hydrus), title_hash[rel_druid]['related_obj_title'], title_type, title_attrs) + else + add_solr_value(solr_doc, FIELDS.dig(type, :non_hydrus), title_hash[rel_druid]['related_obj_title'], title_type, title_attrs) + end + add_solr_value(solr_doc, FIELDS.dig(type, :union), title_hash[rel_druid]['related_obj_title'], title_type, title_attrs) + end + end + + def related_obj_display_title(related_obj, default_title) + return default_title unless related_obj + + related_obj.full_title || default_title + end +end +# rubocop:enable Style/ClassVars diff --git a/app/indexers/identity_metadata_datastream_indexer.rb b/app/indexers/identity_metadata_datastream_indexer.rb new file mode 100644 index 000000000..76b85fc1d --- /dev/null +++ b/app/indexers/identity_metadata_datastream_indexer.rb @@ -0,0 +1,46 @@ +# frozen_string_literal: true + +class IdentityMetadataDatastreamIndexer + include SolrDocHelper + + attr_reader :resource + def initialize(resource:) + @resource = resource + end + + # @return [Hash] the partial solr document for identityMetadata + def to_solr + solr_doc = {} + solr_doc['objectType_ssim'] = resource.identityMetadata.objectType + + plain_identifiers = [] + ns_identifiers = [] + if source_id.present? 
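+      # source_id is namespaced, e.g. "google:STANFORD_342837261527"; index both the bare value and the full namespaced form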
+ (name, id) = source_id.split(/:/, 2) + plain_identifiers << id + ns_identifiers << source_id + solr_doc['source_id_ssim'] = [source_id] + end + + resource.identityMetadata.otherId.compact.each do |qid| + # this section will solrize barcode and catkey, which live in otherId + (name, id) = qid.split(/:/, 2) + plain_identifiers << id + ns_identifiers << qid + next unless %w[barcode catkey].include?(name) + + solr_doc["#{name}_id_ssim"] = [id] + end + solr_doc['dor_id_tesim'] = plain_identifiers + solr_doc['identifier_tesim'] = ns_identifiers + solr_doc['identifier_ssim'] = ns_identifiers + + solr_doc + end + + private + + def source_id + @source_id ||= resource.identityMetadata.sourceId + end +end diff --git a/app/indexers/object_profile_indexer.rb b/app/indexers/object_profile_indexer.rb new file mode 100644 index 000000000..ae76e8c55 --- /dev/null +++ b/app/indexers/object_profile_indexer.rb @@ -0,0 +1,18 @@ +# frozen_string_literal: true + +class ObjectProfileIndexer + include SolrDocHelper + + attr_reader :resource + + def initialize(resource:) + @resource = resource + end + + # @return [Hash] the partial solr document for releasable concerns + def to_solr + {}.tap do |solr_doc| + add_solr_value(solr_doc, 'obj_label', resource.label, :symbol, [:stored_searchable]) + end + end +end diff --git a/app/indexers/process_indexer.rb b/app/indexers/process_indexer.rb new file mode 100644 index 000000000..96eb00c9d --- /dev/null +++ b/app/indexers/process_indexer.rb @@ -0,0 +1,56 @@ +# frozen_string_literal: true + +# Indexes the process for a workflow +class ProcessIndexer + ERROR_OMISSION = '... (continued)' + private_constant :ERROR_OMISSION + + # see https://lucene.apache.org/core/7_3_1/core/org/apache/lucene/util/BytesRefHash.MaxBytesLengthExceededException.html + MAX_ERROR_LENGTH = 32_768 - 2 - ERROR_OMISSION.length + private_constant :MAX_ERROR_LENGTH + + # @param [WorkflowSolrDocument] solr_doc + # @param [String] workflow_name + # @param [Dor::Workflow::Response::Process] process + def initialize(solr_doc:, workflow_name:, process:) + @solr_doc = solr_doc + @workflow_name = workflow_name + @process = process + end + + # @return [Hash] the partial solr document for the workflow document + def to_solr + return unless status + + # add a record of the robot having operated on this item, so we can track robot activity + solr_doc.add_process_time(workflow_name, name, Time.parse(process.datetime)) if time? + + index_error_message + + # workflow name, process status then process name + solr_doc.add_wsp("#{workflow_name}:#{status}", "#{workflow_name}:#{status}:#{name}") + + # workflow name, process name then process status + solr_doc.add_wps("#{workflow_name}:#{name}", "#{workflow_name}:#{name}:#{status}") + + # process status, workflowname then process name + solr_doc.add_swp(process.status.to_s, "#{status}:#{workflow_name}", "#{status}:#{workflow_name}:#{name}") + end + + private + + attr_reader :process, :workflow_name, :solr_doc + delegate :status, :name, :state, :error_message, :datetime, to: :process + + def time? 
+ datetime && (status == 'completed' || status == 'error') + end + + # index the error message without the druid so we hopefully get some overlap + # truncate to avoid org.apache.lucene.util.BytesRefHash$MaxBytesLengthExceededException + def index_error_message + return unless error_message + + solr_doc.error = "#{workflow_name}:#{name}:#{error_message}".truncate(MAX_ERROR_LENGTH, omission: ERROR_OMISSION) + end +end diff --git a/app/indexers/processable_indexer.rb b/app/indexers/processable_indexer.rb new file mode 100644 index 000000000..a56501c0b --- /dev/null +++ b/app/indexers/processable_indexer.rb @@ -0,0 +1,66 @@ +# frozen_string_literal: true + +class ProcessableIndexer + include SolrDocHelper + + attr_reader :resource + def initialize(resource:) + @resource = resource + end + + # @return [Hash] the partial solr document for processable concerns + def to_solr + {}.tap do |solr_doc| + solr_doc['current_version_isi'] = current_version.to_i # Argo Facet field "Version" + + add_sortable_milestones(solr_doc) + solr_doc['modified_latest_dttsi'] = resource.modified_date.to_datetime.utc.strftime('%FT%TZ') + add_solr_value(solr_doc, 'rights', resource.rights, :string, [:symbol]) if resource.respond_to? :rights + add_status(solr_doc) + end + end + + private + + def status_service + @status_service ||= WorkflowClientFactory.build.status(druid: resource.pid, version: resource.current_version) + end + + def current_version + @current_version ||= begin + resource.current_version + rescue StandardError + '1' + end + end + + def add_status(solr_doc) + solr_doc['status_ssi'] = status_service.display + return unless status_service.info[:status_code] + + # This is used for Argo's "Processing Status" facet + add_solr_value(solr_doc, 'processing_status_text', status_service.display_simplified, :string, [:stored_sortable]) + end + + def sortable_milestones + status_service.milestones.each_with_object({}) do |milestone, sortable| + sortable[milestone[:milestone]] ||= [] + sortable[milestone[:milestone]] << milestone[:at].utc.xmlschema + end + end + + def add_sortable_milestones(solr_doc) + sortable_milestones.each do |milestone, unordered_dates| + dates = unordered_dates.sort + # create the published_dttsi and published_day fields and the like + dates.each do |date| + solr_doc["#{milestone}_dttsim"] ||= [] + solr_doc["#{milestone}_dttsim"] << date unless solr_doc["#{milestone}_dttsim"].include?(date) + end + # fields for OAI havester to sort on: _dttsi is trie date +stored +indexed (single valued, i.e. 
sortable) + # TODO: we really only need accessioned_earliest and registered_earliest + solr_doc["#{milestone}_earliest_dttsi"] = dates.first + solr_doc["#{milestone}_latest_dttsi"] = dates.last + end + end +end diff --git a/app/indexers/provenance_metadata_datastream_indexer.rb b/app/indexers/provenance_metadata_datastream_indexer.rb new file mode 100644 index 000000000..163cc8b45 --- /dev/null +++ b/app/indexers/provenance_metadata_datastream_indexer.rb @@ -0,0 +1,13 @@ +# frozen_string_literal: true + +class ProvenanceMetadataDatastreamIndexer + attr_reader :resource + def initialize(resource:) + @resource = resource + end + + # @return [Hash] the partial solr document for provenanceMetadata + def to_solr + resource.provenanceMetadata.to_solr + end +end diff --git a/app/indexers/releasable_indexer.rb b/app/indexers/releasable_indexer.rb new file mode 100644 index 000000000..4c797bce3 --- /dev/null +++ b/app/indexers/releasable_indexer.rb @@ -0,0 +1,37 @@ +# frozen_string_literal: true + +class ReleasableIndexer + include SolrDocHelper + + attr_reader :resource + + def initialize(resource:) + @resource = resource + end + + # @return [Hash] the partial solr document for releasable concerns + def to_solr + solr_doc = {} + + # TODO: sort of worried about the performance impact in bulk reindex + # situations, since released_for recurses all parent collections. jmartin 2015-07-14 + released_for.each do |release_target, release_info| + add_solr_value(solr_doc, 'released_to', release_target, :symbol, []) if release_info['release'] + end + + # TODO: need to solrize whether item is released to purl? does released_for return that? + # logic is: "True when there is a published lifecycle and Access Rights is anything but Dark" + + solr_doc + end + + private + + def released_for + object_client.release_tags.list + end + + def object_client + Dor::Services::Client.object(resource.pid) + end +end diff --git a/app/indexers/rights_metadata_datastream_indexer.rb b/app/indexers/rights_metadata_datastream_indexer.rb new file mode 100644 index 000000000..b9cd45f16 --- /dev/null +++ b/app/indexers/rights_metadata_datastream_indexer.rb @@ -0,0 +1,87 @@ +# frozen_string_literal: true + +class RightsMetadataDatastreamIndexer + attr_reader :resource + def initialize(resource:) + @resource = resource + end + + # @return [Hash] the partial solr document for rightsMetadata + # rubocop:disable Metrics/CyclomaticComplexity + # rubocop:disable Metrics/PerceivedComplexity + def to_solr + solr_doc = { + 'copyright_ssim' => resource.rightsMetadata.copyright, + 'use_statement_ssim' => resource.rightsMetadata.use_statement + } + + dra = resource.rightsMetadata.dra_object + solr_doc['rights_primary_ssi'] = dra.index_elements[:primary] + solr_doc['rights_errors_ssim'] = dra.index_elements[:errors] unless dra.index_elements[:errors].empty? + solr_doc['rights_characteristics_ssim'] = dra.index_elements[:terms] unless dra.index_elements[:terms].empty? + + solr_doc['rights_descriptions_ssim'] = [ + dra.index_elements[:primary], + + (dra.index_elements[:obj_locations_qualified] || []).map do |rights_info| + rule_suffix = rights_info[:rule] ? " (#{rights_info[:rule]})" : '' + "location: #{rights_info[:location]}#{rule_suffix}" + end, + (dra.index_elements[:file_locations_qualified] || []).map do |rights_info| + rule_suffix = rights_info[:rule] ? 
" (#{rights_info[:rule]})" : '' + "location: #{rights_info[:location]} (file)#{rule_suffix}" + end, + + (dra.index_elements[:obj_agents_qualified] || []).map do |rights_info| + rule_suffix = rights_info[:rule] ? " (#{rights_info[:rule]})" : '' + "agent: #{rights_info[:agent]}#{rule_suffix}" + end, + (dra.index_elements[:file_agents_qualified] || []).map do |rights_info| + rule_suffix = rights_info[:rule] ? " (#{rights_info[:rule]})" : '' + "agent: #{rights_info[:agent]} (file)#{rule_suffix}" + end, + + (dra.index_elements[:obj_groups_qualified] || []).map do |rights_info| + rule_suffix = rights_info[:rule] ? " (#{rights_info[:rule]})" : '' + "#{rights_info[:group]}#{rule_suffix}" + end, + (dra.index_elements[:file_groups_qualified] || []).map do |rights_info| + rule_suffix = rights_info[:rule] ? " (#{rights_info[:rule]})" : '' + "#{rights_info[:group]} (file)#{rule_suffix}" + end, + + (dra.index_elements[:obj_world_qualified] || []).map do |rights_info| + rule_suffix = rights_info[:rule] ? " (#{rights_info[:rule]})" : '' + "world#{rule_suffix}" + end, + (dra.index_elements[:file_world_qualified] || []).map do |rights_info| + rule_suffix = rights_info[:rule] ? " (#{rights_info[:rule]})" : '' + "world (file)#{rule_suffix}" + end + ].flatten.uniq + + # these two values are returned by index_elements[:primary], but are just a less granular version of + # what the other more specific fields return, so discard them + solr_doc['rights_descriptions_ssim'] -= %w[access_restricted access_restricted_qualified world_qualified] + solr_doc['rights_descriptions_ssim'] += ['dark (file)'] if dra.index_elements[:terms].include? 'none_read_file' + + solr_doc['obj_rights_locations_ssim'] = dra.index_elements[:obj_locations] if dra.index_elements[:obj_locations].present? + solr_doc['file_rights_locations_ssim'] = dra.index_elements[:file_locations] if dra.index_elements[:file_locations].present? + solr_doc['obj_rights_agents_ssim'] = dra.index_elements[:obj_agents] if dra.index_elements[:obj_agents].present? + solr_doc['file_rights_agents_ssim'] = dra.index_elements[:file_agents] if dra.index_elements[:file_agents].present? + + # suppress empties + %w[use_statement_ssim copyright_ssim].each do |key| + solr_doc[key] = solr_doc[key].reject(&:blank?).flatten unless solr_doc[key].nil? 
+ end + + solr_doc['use_license_machine_ssi'] = resource.rightsMetadata.use_license.first + + # TODO: I don't think this is used in argo, and can be removed + solr_doc['use_licenses_machine_ssim'] = resource.rightsMetadata.use_license + + solr_doc + end + # rubocop:enable Metrics/CyclomaticComplexity + # rubocop:enable Metrics/PerceivedComplexity +end diff --git a/app/indexers/role_metadata_datastream_indexer.rb b/app/indexers/role_metadata_datastream_indexer.rb new file mode 100644 index 000000000..85feac1ac --- /dev/null +++ b/app/indexers/role_metadata_datastream_indexer.rb @@ -0,0 +1,25 @@ +# frozen_string_literal: true + +class RoleMetadataDatastreamIndexer + include SolrDocHelper + + attr_reader :resource + def initialize(resource:) + @resource = resource + end + + # @return [Hash] the partial solr document for roleMetadata + def to_solr + {}.tap do |solr_doc| + # rubocop:disable Style/SymbolArray + resource.roleMetadata.find_by_xpath('/roleMetadata/role/*').each do |actor| + role_type = actor.parent['type'] + val = [actor.at_xpath('identifier/@type'), actor.at_xpath('identifier/text()')].join ':' + add_solr_value(solr_doc, "apo_role_#{actor.name}_#{role_type}", val, :string, [:symbol]) + add_solr_value(solr_doc, "apo_role_#{role_type}", val, :string, [:symbol]) + add_solr_value(solr_doc, 'apo_register_permissions', val, :string, %i[symbol stored_searchable]) if %w[dor-apo-manager dor-apo-depositor].include? role_type + end + # rubocop:enable Style/SymbolArray + end + end +end diff --git a/app/indexers/solr_doc_helper.rb b/app/indexers/solr_doc_helper.rb new file mode 100644 index 000000000..826fd3723 --- /dev/null +++ b/app/indexers/solr_doc_helper.rb @@ -0,0 +1,11 @@ +# frozen_string_literal: true + +module SolrDocHelper + def add_solr_value(solr_doc, field_name, value, field_type = :default, index_types = [:searchable]) + case field_type + when :symbol + index_types << field_type + end + ::Solrizer.insert_field(solr_doc, field_name, value, *index_types) + end +end diff --git a/app/indexers/version_metadata_datastream_indexer.rb b/app/indexers/version_metadata_datastream_indexer.rb new file mode 100644 index 000000000..362449f33 --- /dev/null +++ b/app/indexers/version_metadata_datastream_indexer.rb @@ -0,0 +1,13 @@ +# frozen_string_literal: true + +class VersionMetadataDatastreamIndexer + attr_reader :resource + def initialize(resource:) + @resource = resource + end + + # @return [Hash] the partial solr document for versionMetadata + def to_solr + resource.versionMetadata.to_solr + end +end diff --git a/app/indexers/workflow_indexer.rb b/app/indexers/workflow_indexer.rb new file mode 100644 index 000000000..35551bd98 --- /dev/null +++ b/app/indexers/workflow_indexer.rb @@ -0,0 +1,45 @@ +# frozen_string_literal: true + +# Indexes the objects position in workflows +class WorkflowIndexer + # @param [Workflow::Response::Workflow] workflow the workflow document to index + def initialize(workflow:) + @workflow = workflow + end + + # @return [Hash] the partial solr document for the workflow document + def to_solr + WorkflowSolrDocument.new do |solr_doc| + solr_doc.name = workflow_name + + errors = 0 # The error count is used by the Report class in Argo + processes.each do |process| + ProcessIndexer.new(solr_doc: solr_doc, workflow_name: workflow_name, process: process).to_solr + errors += 1 if process.status == 'error' + end + solr_doc.status = [workflow_name, workflow_status, errors].join('|') + end + end + + private + + attr_reader :workflow + delegate :workflow_name, to: :workflow + + 
def definition_process_names + @definition_process_names ||= begin + definition = WorkflowClientFactory.build.workflow_template(workflow_name) + definition['processes'].map { |p| p['name'] } + end + end + + def processes + @processes ||= definition_process_names.map do |process_name| + workflow.process_for_recent_version(name: process_name) + end + end + + def workflow_status + workflow.complete? ? 'completed' : 'active' + end +end diff --git a/app/indexers/workflows_indexer.rb b/app/indexers/workflows_indexer.rb new file mode 100644 index 000000000..d2ddfc2eb --- /dev/null +++ b/app/indexers/workflows_indexer.rb @@ -0,0 +1,32 @@ +# frozen_string_literal: true + +# Indexes the objects position in workflows +class WorkflowsIndexer + attr_reader :resource + def initialize(resource:) + @resource = resource + end + + # @return [Hash] the partial solr document for workflow concerns + def to_solr + WorkflowSolrDocument.new do |combined_doc| + workflows.each do |wf| + doc = WorkflowIndexer.new(workflow: wf).to_solr + combined_doc.merge!(doc) + end + end.to_h + end + + private + + # @return [Array] + def workflows + all_workflows.workflows + end + + # TODO: remove Dor::Workflow::Document + # @return [Workflow::Response::Workflows] + def all_workflows + @all_workflows ||= WorkflowClientFactory.build.workflow_routes.all_workflows pid: resource.pid + end +end diff --git a/app/models/workflow_solr_document.rb b/app/models/workflow_solr_document.rb new file mode 100644 index 000000000..940904602 --- /dev/null +++ b/app/models/workflow_solr_document.rb @@ -0,0 +1,91 @@ +# frozen_string_literal: true + +# Represents that part of the solr document that holds workflow data +class WorkflowSolrDocument + WORKFLOW_SOLR = 'wf_ssim' + # field that indexes workflow name, process status then process name + WORKFLOW_WPS_SOLR = 'wf_wps_ssim' + # field that indexes workflow name, process name then process status + WORKFLOW_WSP_SOLR = 'wf_wsp_ssim' + # field that indexes process status, workflowname then process name + WORKFLOW_SWP_SOLR = 'wf_swp_ssim' + WORKFLOW_ERROR_SOLR = 'wf_error_ssim' + WORKFLOW_STATUS_SOLR = 'workflow_status_ssim' + + KEYS_TO_MERGE = [ + WORKFLOW_SOLR, + WORKFLOW_WPS_SOLR, + WORKFLOW_WSP_SOLR, + WORKFLOW_SWP_SOLR, + WORKFLOW_STATUS_SOLR, + WORKFLOW_ERROR_SOLR + ].freeze + + def initialize + @data = empty_document + yield self if block_given? + end + + def name=(wf_name) + data[WORKFLOW_SOLR] += [wf_name] + data[WORKFLOW_WPS_SOLR] += [wf_name] + data[WORKFLOW_WSP_SOLR] += [wf_name] + end + + def status=(status) + data[WORKFLOW_STATUS_SOLR] += [status] + end + + def error=(message) + data[WORKFLOW_ERROR_SOLR] += [message] + end + + # Add to the field that indexes workflow name, process status then process name + def add_wps(*messages) + data[WORKFLOW_WPS_SOLR] += messages + end + + # Add to the field that indexes workflow name, process name then process status + def add_wsp(*messages) + data[WORKFLOW_WSP_SOLR] += messages + end + + # Add to the field that indexes process status, workflow name then process name + def add_swp(*messages) + data[WORKFLOW_SWP_SOLR] += messages + end + + # Add the processes data_time attribute to the solr document + # @param [String] wf_name + # @param [String] process_name + # @param [Time] time + def add_process_time(wf_name, process_name, time) + data["wf_#{wf_name}_#{process_name}_dttsi"] = time.utc.iso8601 + end + + def to_h + KEYS_TO_MERGE.each { |k| data[k].uniq! 
} + data + end + + delegate :except, :[], to: :data + + # @param [WorkflowSolrDocument] doc + def merge!(doc) + # This is going to get the date fields, e.g. `wf_assemblyWF_jp2-create_dttsi' + @data.merge!(doc.except(*KEYS_TO_MERGE)) + + # Combine the non-unique fields together + KEYS_TO_MERGE.each do |k| + data[k] += doc[k] + end + end + + private + + attr_reader :data + + def empty_document + KEYS_TO_MERGE.each_with_object({}) { |k, obj| obj[k] = [] } + end +end diff --git a/app/services/indexer.rb b/app/services/indexer.rb new file mode 100644 index 000000000..10f433ae9 --- /dev/null +++ b/app/services/indexer.rb @@ -0,0 +1,84 @@ +# frozen_string_literal: true + +class Indexer + ADMIN_POLICY_INDEXER = CompositeIndexer.new( + AdministrativeTagIndexer, + DataIndexer, + RoleMetadataDatastreamIndexer, + AdministrativeMetadataDatastreamIndexer, + DefaultObjectRightsDatastreamIndexer, + ProvenanceMetadataDatastreamIndexer, + RightsMetadataDatastreamIndexer, + VersionMetadataDatastreamIndexer, + ObjectProfileIndexer, + IdentityMetadataDatastreamIndexer, + DescriptiveMetadataDatastreamIndexer, + DescribableIndexer, + IdentifiableIndexer, + ProcessableIndexer, + WorkflowsIndexer + ) + + COLLECTION_INDEXER = CompositeIndexer.new( + AdministrativeTagIndexer, + DataIndexer, + ProvenanceMetadataDatastreamIndexer, + RightsMetadataDatastreamIndexer, + VersionMetadataDatastreamIndexer, + ObjectProfileIndexer, + IdentityMetadataDatastreamIndexer, + DescriptiveMetadataDatastreamIndexer, + DescribableIndexer, + IdentifiableIndexer, + ProcessableIndexer, + ReleasableIndexer, + WorkflowsIndexer + ) + + ITEM_INDEXER = CompositeIndexer.new( + AdministrativeTagIndexer, + DataIndexer, + ProvenanceMetadataDatastreamIndexer, + RightsMetadataDatastreamIndexer, + VersionMetadataDatastreamIndexer, + ObjectProfileIndexer, + IdentityMetadataDatastreamIndexer, + DescriptiveMetadataDatastreamIndexer, + EmbargoMetadataDatastreamIndexer, + ContentMetadataDatastreamIndexer, + DescribableIndexer, + IdentifiableIndexer, + ProcessableIndexer, + ReleasableIndexer, + WorkflowsIndexer + ) + + SET_INDEXER = CompositeIndexer.new( + AdministrativeTagIndexer, + DataIndexer, + ProvenanceMetadataDatastreamIndexer, + RightsMetadataDatastreamIndexer, + VersionMetadataDatastreamIndexer, + ObjectProfileIndexer, + IdentityMetadataDatastreamIndexer, + DescriptiveMetadataDatastreamIndexer, + DescribableIndexer, + IdentifiableIndexer, + ProcessableIndexer, + WorkflowsIndexer + ) + + INDEXERS = { + Dor::Agreement => ITEM_INDEXER, # Agreement uses same indexer as Dor::Item + Dor::AdminPolicyObject => ADMIN_POLICY_INDEXER, + Dor::Collection => COLLECTION_INDEXER, + Hydrus::Item => ITEM_INDEXER, + Hydrus::AdminPolicyObject => ADMIN_POLICY_INDEXER, + Dor::Item => ITEM_INDEXER, + Dor::Set => SET_INDEXER + }.freeze + + def self.for(obj) + INDEXERS.fetch(obj.class).new(resource: obj) + end +end diff --git a/app/services/workflow_client_factory.rb b/app/services/workflow_client_factory.rb new file mode 100644 index 000000000..f222001c4 --- /dev/null +++ b/app/services/workflow_client_factory.rb @@ -0,0 +1,9 @@ +# frozen_string_literal: true + +# This initializes the workflow client with values from settings +class WorkflowClientFactory + def self.build + logger = Logger.new(Settings.workflow.logfile, Settings.workflow.shift_age) + Dor::Workflow::Client.new(url: Settings.workflow.url, logger: logger, timeout: Settings.workflow.timeout) + end +end diff --git a/spec/controllers/hydrus_solr_controller_spec.rb 
b/spec/controllers/hydrus_solr_controller_spec.rb index 948d0786a..58268a6bf 100644 --- a/spec/controllers/hydrus_solr_controller_spec.rb +++ b/spec/controllers/hydrus_solr_controller_spec.rb @@ -6,6 +6,14 @@ let(:pid) { 'druid:bc123df4567' } describe 'reindex' do + before do + allow(Indexer).to receive(:for).with(mock_hydrus_obj).and_return(mock_indexer) + end + + let(:mock_hydrus_obj) { instance_double(Hydrus::Item, to_solr: { id: 'x' }, pid: pid) } + let(:mock_indexer) { instance_double(CompositeIndexer::Instance, to_solr: mock_solr_doc) } + let(:mock_solr_doc) { { id: pid } } + context 'when an object is not found in Fedora' do it 'responds with 404' do allow(ActiveFedora::Base).to receive(:find).and_return(nil) @@ -43,8 +51,8 @@ it 'indexes the object' do allow(ActiveFedora::Base).to receive(:find) - .and_return(instance_double(Hydrus::Item, to_solr: { id: 'x' }, pid: pid)) - expect(ActiveFedora.solr.conn).to receive(:add).with({ id: 'x' }, add_attributes: { commitWithin: 5000 }).and_return(true) + .and_return(mock_hydrus_obj) + expect(ActiveFedora.solr.conn).to receive(:add).with({ id: pid }, add_attributes: { commitWithin: 5000 }).and_return(true) get :reindex, params: { id: 'druid:bc123df4567' } expect(response.status).to eq(200) end @@ -61,8 +69,8 @@ it 'indexes the object' do allow(ActiveFedora::Base).to receive(:find) - .and_return(instance_double(Hydrus::Item, to_solr: { id: 'x' }, pid: pid)) - expect(ActiveFedora.solr.conn).to receive(:add).with({ id: 'x' }, add_attributes: { commitWithin: 5000 }).and_return(true) + .and_return(mock_hydrus_obj) + expect(ActiveFedora.solr.conn).to receive(:add).with({ id: pid }, add_attributes: { commitWithin: 5000 }).and_return(true) get :reindex, params: { id: 'druid:bc123df4567' } expect(response.status).to eq(200) end diff --git a/spec/features/hydrus_solr_controller_spec.rb b/spec/features/hydrus_solr_controller_spec.rb index b05cf3ed2..b2c7bd2f8 100644 --- a/spec/features/hydrus_solr_controller_spec.rb +++ b/spec/features/hydrus_solr_controller_spec.rb @@ -14,11 +14,17 @@ let(:fake_tags_client) { instance_double(Dor::Services::Client::AdministrativeTags, list: tags) } let(:tags) { ['Project : Hydrus'] } + let(:mock_hydrus_obj) { instance_double(Hydrus::Item, to_solr: { id: 'x' }, pid: druid) } + let(:mock_indexer) { instance_double(CompositeIndexer::Instance, to_solr: mock_solr_doc) } + let(:mock_solr_doc) { { id: druid } } + before do allow(Dor::Services::Client).to receive(:object).with(druid).and_return(fake_object_client) + allow(Indexer).to receive(:for).with(mock_hydrus_obj).and_return(mock_indexer) end it 'indexes an item into solr' do + allow(ActiveFedora::Base).to receive(:find).and_return(mock_hydrus_obj) expect(ActiveFedora.solr.conn).to receive(:add).with(hash_including(id: druid), anything) visit "/hydrus_solr/reindex/#{druid}" end diff --git a/spec/indexers/administrative_tag_indexer_spec.rb b/spec/indexers/administrative_tag_indexer_spec.rb new file mode 100644 index 000000000..5faf23d35 --- /dev/null +++ b/spec/indexers/administrative_tag_indexer_spec.rb @@ -0,0 +1,59 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe AdministrativeTagIndexer do + describe '#to_solr' do + subject(:document) { indexer.to_solr } + + let(:indexer) { described_class.new(resource: object) } + let(:object) { Dor::Abstract.new(pid: 'druid:rt923jk234') } + let(:tags) do + [ + 'Google Books : Phase 1', + 'Google Books : Scan source STANFORD', + 'Project : Beautiful Books', + 'Registered By : blalbrit', + 'DPG : Beautiful 
Books : Octavo : newpri', + 'Remediated By : 4.15.4' + ] + end + + before do + # Don't actually hit the dor-services-app API endpoint + allow(indexer).to receive(:administrative_tags).and_return(tags) + end + + it 'indexes all administrative tags' do + expect(document).to include('tag_ssim' => tags) + end + + it 'indexes exploded tags' do + expect(document['exploded_tag_ssim']).to match_array( + [ + 'Google Books', + 'Google Books : Phase 1', + 'Google Books', + 'Google Books : Scan source STANFORD', + 'Project', + 'Project : Beautiful Books', + 'Registered By', + 'Registered By : blalbrit', + 'DPG', + 'DPG : Beautiful Books', + 'DPG : Beautiful Books : Octavo', + 'DPG : Beautiful Books : Octavo : newpri', + 'Remediated By', + 'Remediated By : 4.15.4' + ] + ) + end + + it 'indexes prefixed tags' do + expect(document).to include( + 'project_tag_ssim' => ['Beautiful Books'], + 'registered_by_tag_ssim' => ['blalbrit'] + ) + end + end +end diff --git a/spec/indexers/composite_indexer_spec.rb b/spec/indexers/composite_indexer_spec.rb new file mode 100644 index 000000000..b9b65a75b --- /dev/null +++ b/spec/indexers/composite_indexer_spec.rb @@ -0,0 +1,82 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe CompositeIndexer do + let(:model) { Dor::Abstract } + let(:mods) do + double('mods', sw_title_display: 'foo', sw_genre: ['test genre'], + main_author_w_date: '1999', + sw_sort_author: 'baz', + sw_language_facet: 'en', + format_main: 'foofmt', + topic_facet: 'topicbar', + era_facet: ['17th century', '18th century'], + geographic_facet: %w[Europe Europe], + term_values: 'huh?', + pub_year_sort_str: '1600', + pub_year_int: 1600, + pub_year_display_str: '1600') + end + let(:obj) do + instance_double(Dor::Item, + pid: 'druid:mx123ms3333', + stanford_mods: mods, + datastreams: datastreams, + label: 'obj label', + identityMetadata: identity_metadata, + versionMetadata: version_metadata, + current_version: '7', + modified_date: '1999-12-30') + end + let(:datastreams) do + { 'RELS-EXT' => double('datastream', datastream_spec_string: 'huh', new?: false, content: '') } + end + let(:identity_metadata) do + instance_double(Dor::IdentityMetadataDS, otherId: 'foo') + end + let(:version_metadata) do + instance_double(Dor::VersionMetadataDS, tag_for_version: 'tag7', description_for_version: 'desc7', current_version_id: '7') + end + + let(:indexer) do + described_class.new( + DescribableIndexer, + IdentifiableIndexer, + ProcessableIndexer + ) + end + + describe 'to_solr' do + let(:status) do + instance_double(Dor::Workflow::Client::Status, milestones: {}, info: {}, display: 'bad') + end + let(:workflow_client) { instance_double(Dor::Workflow::Client, status: status) } + let(:doc) { indexer.new(resource: obj).to_solr } + + before do + allow(Dor::Workflow::Client).to receive(:new).and_return(workflow_client) + end + + it 'searchworks date-fu: temporal periods and pub_dates' do + expect(doc).to match a_hash_including( + 'sw_subject_temporal_ssim' => a_collection_containing_exactly('18th century', '17th century'), + 'sw_pub_date_sort_ssi' => '1600', + 'sw_pub_date_facet_ssi' => '1600' + ) + end + + it 'subject geographic fields' do + expect(doc).to match a_hash_including( + 'sw_subject_geographic_ssim' => %w[Europe Europe] + ) + end + + it 'genre fields' do + genre_list = obj.stanford_mods.sw_genre + expect(doc).to match a_hash_including( + 'sw_genre_ssim' => genre_list + ) + end + end +end diff --git a/spec/indexers/content_metadata_datastream_indexer_spec.rb 
b/spec/indexers/content_metadata_datastream_indexer_spec.rb new file mode 100644 index 000000000..13dcba9d9 --- /dev/null +++ b/spec/indexers/content_metadata_datastream_indexer_spec.rb @@ -0,0 +1,59 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe ContentMetadataDatastreamIndexer do + let(:xml) do + <<~XML + + + + + + 3d3ff46d98f3d517d0bf086571e05c18 + ca1eb0edd09a21f9dd9e3a89abc790daf4d04916 + + + + 406d5d80fdd9ecc0352d339badb4a8fb + 61940d4fad097cba98a3e9dd9f12a90dde0be1ac + + + + 81ccd17bccf349581b779615e82a0366 + 12586b624540031bfa3d153299160c4885c3508c + + + + XML + end + + let(:obj) { Dor::Item.new } + + let(:indexer) do + described_class.new(resource: obj) + end + + before do + obj.contentMetadata.content = xml + end + + describe '#to_solr' do + subject(:doc) { indexer.to_solr } + + it 'has the fields used by argo' do + expect(doc).to include( + 'content_type_ssim' => 'map', + 'content_file_mimetypes_ssim' => ['image/jp2', 'image/gif', 'image/tiff'], + 'content_file_roles_ssim' => ['derivative'], + 'shelved_content_file_count_itsi' => 1, + 'resource_count_itsi' => 1, + 'content_file_count_itsi' => 3, + 'image_resource_count_itsi' => 1, + 'first_shelved_image_ss' => 'gw177fc7976_05_0001.jp2', + 'preserved_size_dbtsi' => 86_774_303, + 'shelved_size_dbtsi' => 5_143_883 + ) + end + end +end diff --git a/spec/indexers/data_indexer_spec.rb b/spec/indexers/data_indexer_spec.rb new file mode 100644 index 000000000..c97bb31d7 --- /dev/null +++ b/spec/indexers/data_indexer_spec.rb @@ -0,0 +1,26 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe DataIndexer do + let(:obj) do + Dor::AdminPolicyObject.new(pid: 'druid:999') + end + + let(:indexer) do + described_class.new(resource: obj) + end + + describe '#to_solr' do + let(:indexer) do + CompositeIndexer.new( + described_class + ).new(resource: obj) + end + let(:doc) { indexer.to_solr } + + it 'makes a solr doc' do + expect(doc).to match a_hash_including(id: 'druid:999') + end + end +end diff --git a/spec/indexers/default_object_rights_datastream_indexer_spec.rb b/spec/indexers/default_object_rights_datastream_indexer_spec.rb new file mode 100644 index 000000000..65aa2c51d --- /dev/null +++ b/spec/indexers/default_object_rights_datastream_indexer_spec.rb @@ -0,0 +1,33 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe DefaultObjectRightsDatastreamIndexer do + let(:obj) do + Dor::AdminPolicyObject.new + end + + let(:indexer) do + described_class.new(resource: obj) + end + + describe '#to_solr' do + let(:indexer) do + CompositeIndexer.new( + described_class + ).new(resource: obj) + end + let(:doc) { indexer.to_solr } + + before do + obj.use_statement = 'Rights are owned by Stanford University Libraries.' 
+ obj.copyright_statement = 'Additional copyright info' + end + + it 'makes a solr doc' do + expect(doc).to match a_hash_including('use_statement_ssim' => + ['Rights are owned by Stanford University Libraries.']) + expect(doc).to match a_hash_including('copyright_ssim' => ['Additional copyright info']) + end + end +end diff --git a/spec/indexers/describable_indexer_spec.rb b/spec/indexers/describable_indexer_spec.rb new file mode 100644 index 000000000..9736ba2f5 --- /dev/null +++ b/spec/indexers/describable_indexer_spec.rb @@ -0,0 +1,122 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe DescribableIndexer do + let(:xml) do + <<~XML + + + + The + complete works of Henry George + + + George, Henry + + 1839-1897 + + creator + + + + George, Henry + + 1862-1916 + + text + + + xx + + + + Garden City, N. Y + + Doubleday, Page + 1911 + 1911 + [Library ed.] + + monographic + + + eng + + + + print + + 10 v. fronts (v. 1-9) ports. 21 cm. + + + YNG + 731210 + 19900625062034.0 + 68184 + + 757655 + + + + electronic + preservation + reformatted digital + + + I. Progress and poverty.--II. Social problems.--III. The land question. Property in land. The condition of labor.--IV. Protection or free trade.--V. A perplexed philosopher [Herbert Spencer]--VI. The science of political economy, books I and II.--VII. The science of political economy, books III to V. "Moses": a lecture.--VIII. Our land and land policy.--IX-X. The life of Henry George, by his son Henry George, jr. + On cover: Complete works of Henry George. Fels fund. Library edition. + + Economics + 1800-1900 + + + + DOR_MARC2MODS3-3.xsl Revision 1.1 + 2011-02-25T18:20:23.132-08:00 + 36105010700545 + + druid:pz263ny9658 + + Stanford University Libraries + + http://purl.stanford.edu/pz263ny9658 + + + XML + end + let(:obj) { Dor::Abstract.new } + + let(:indexer) do + described_class.new(resource: obj) + end + + describe '#to_solr' do + let(:doc) { indexer.to_solr } + + before do + obj.datastreams['descMetadata'].content = xml + end + + it 'includes values from stanford_mods' do + expect(doc).to match a_hash_including( + 'sw_language_ssim' => ['English'], + 'sw_format_ssim' => ['Book'], + 'sw_subject_temporal_ssim' => ['1800-1900'], + 'sw_pub_date_sort_ssi' => '1911', + 'sw_pub_date_facet_ssi' => '1911' + ) + end + + it 'does not include empty values' do + doc.keys.sort_by(&:to_s).each do |k| + expect(doc).to include(k) + expect(doc).to match hash_excluding(k => nil) + expect(doc).to match hash_excluding(k => []) + end + end + end +end diff --git a/spec/indexers/embargo_metadata_datastream_indexer_spec.rb b/spec/indexers/embargo_metadata_datastream_indexer_spec.rb new file mode 100644 index 000000000..5c7bd7d28 --- /dev/null +++ b/spec/indexers/embargo_metadata_datastream_indexer_spec.rb @@ -0,0 +1,50 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe EmbargoMetadataDatastreamIndexer do + let(:xml) do + <<~XML + + + embargoed + 2011-10-12T15:47:52-07:00 + released + 2016-10-12T15:47:52-07:00 + + + + + + + + + + + + + + XML + end + + let(:obj) { Hydrus::Item.new } + + let(:indexer) do + described_class.new(resource: obj) + end + + before do + obj.embargoMetadata.content = xml + end + + describe '#to_solr' do + subject(:doc) { indexer.to_solr } + + it 'has the fields used by argo' do + expect(doc).to eq('embargo_release_dtsim' => ['2011-10-12T22:47:52Z'], + 'embargo_status_ssim' => ['embargoed'], + 'twenty_pct_status_ssim' => ['released'], + 'twenty_pct_release_embargo_release_dtsim' => 
['2016-10-12T22:47:52Z']) + end + end +end diff --git a/spec/indexers/identifiable_indexer_spec.rb b/spec/indexers/identifiable_indexer_spec.rb new file mode 100644 index 000000000..dc77bd96f --- /dev/null +++ b/spec/indexers/identifiable_indexer_spec.rb @@ -0,0 +1,169 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe IdentifiableIndexer do + let(:xml) do + <<~XML + + druid:rt923jk342 + item + google download barcode 36105049267078 + DOR + Squirrels of North America + Eder, Tamara, 1974- + STANFORD_342837261527 + 36105049267078 + 129483625 + 7f3da130-7b02-11de-8a39-0800200c9a66 + Google Books : Phase 1 + Google Books : Scan source STANFORD + Project : Beautiful Books + Registered By : blalbrit + DPG : Beautiful Books : Octavo : newpri + Remediated By : 4.15.4 + true + true + + XML + end + + let(:obj) { Dor::Abstract.new(pid: 'druid:rt923jk342') } + + let(:indexer) do + described_class.new(resource: obj) + end + + before do + obj.identityMetadata.content = xml + described_class.reset_cache! + end + + describe '#identity_metadata_source' do + it 'depends on remove_other_Id' do + obj.identityMetadata.remove_other_Id('catkey', '129483625') + obj.identityMetadata.remove_other_Id('barcode', '36105049267078') + obj.identityMetadata.add_other_Id('catkey', '129483625') + expect(indexer.identity_metadata_source).to eq 'Symphony' + obj.identityMetadata.remove_other_Id('catkey', '129483625') + obj.identityMetadata.add_other_Id('barcode', '36105049267078') + expect(indexer.identity_metadata_source).to eq 'Symphony' + obj.identityMetadata.remove_other_Id('barcode', '36105049267078') + expect(indexer.identity_metadata_source).to eq 'DOR' + obj.identityMetadata.remove_other_Id('foo', 'bar') + expect(indexer.identity_metadata_source).to eq 'DOR' + end + + it 'indexes metadata source' do + expect(indexer.identity_metadata_source).to eq 'Symphony' + end + end + + describe '#to_solr' do + let(:doc) { indexer.to_solr } + + context 'with related objects' do + let(:mock_rel_druid) { 'druid:does_not_exist' } + let(:mock_rels_ext_xml) do + %( + + + + + + ) + end + + before do + allow(obj.datastreams['RELS-EXT']).to receive(:content).and_return(mock_rels_ext_xml) + end + + context 'when related collection and APOs are not found' do + before do + allow(Dor).to receive(:find).with(mock_rel_druid).and_raise(ActiveFedora::ObjectNotFoundError) + end + + it 'generate collections and apo title fields' do + expect(doc[Solrizer.solr_name('collection_title', :symbol)].first).to eq mock_rel_druid + expect(doc[Solrizer.solr_name('collection_title', :stored_searchable)].first).to eq mock_rel_druid + expect(doc[Solrizer.solr_name('apo_title', :symbol)].first).to eq mock_rel_druid + expect(doc[Solrizer.solr_name('apo_title', :stored_searchable)].first).to eq mock_rel_druid + expect(doc[Solrizer.solr_name('nonhydrus_apo_title', :symbol)].first).to eq mock_rel_druid + expect(doc[Solrizer.solr_name('nonhydrus_apo_title', :stored_searchable)].first).to eq mock_rel_druid + end + end + + context 'when related collection and APOs are found' do + let(:mock_obj) { instance_double(Dor::Item, full_title: 'Test object') } + + before do + allow(Dor).to receive(:find).with(mock_rel_druid).and_return(mock_obj) + allow(indexer).to receive(:related_object_tags).and_return([]) + end + + it 'generate collections and apo title fields' do + expect(doc[Solrizer.solr_name('collection_title', :symbol)].first).to eq 'Test object' + expect(doc[Solrizer.solr_name('collection_title', :stored_searchable)].first).to eq 'Test object' 
+ expect(doc[Solrizer.solr_name('apo_title', :symbol)].first).to eq 'Test object' + expect(doc[Solrizer.solr_name('apo_title', :stored_searchable)].first).to eq 'Test object' + expect(doc[Solrizer.solr_name('nonhydrus_apo_title', :symbol)].first).to eq 'Test object' + expect(doc[Solrizer.solr_name('nonhydrus_apo_title', :stored_searchable)].first).to eq 'Test object' + end + end + end + + it 'indexes metadata source' do + expect(doc).to match a_hash_including('metadata_source_ssi' => 'Symphony') + end + end + + describe '#related_object_tags' do + context 'with a nil' do + let(:object) { nil } + + it 'returns an empty array' do + expect(indexer.send(:related_object_tags, object)).to eq([]) + end + end + + context 'with an object that responds to #pid' do + before do + allow(Dor::Services::Client).to receive(:object).with(object.pid).and_return(fake_object_client) + end + + let(:fake_object_client) { instance_double(Dor::Services::Client::Object, administrative_tags: fake_tags_client) } + let(:fake_tags_client) { instance_double(Dor::Services::Client::AdministrativeTags, list: nil) } + let(:object) { obj } + + it 'makes a dor-services-client call' do + indexer.send(:related_object_tags, object) + expect(fake_tags_client).to have_received(:list).once + end + end + end + + describe '#related_obj_display_title' do + subject { indexer.send(:related_obj_display_title, mock_apo_obj, mock_default_title) } + + let(:mock_default_title) { 'druid:zy098xw7654' } + + context 'when the main title is available' do + let(:mock_apo_obj) { instance_double(Dor::AdminPolicyObject, full_title: 'apo title') } + + it { is_expected.to eq 'apo title' } + end + + context 'when the first descMetadata main title entry is empty string' do + let(:mock_apo_obj) { instance_double(Dor::AdminPolicyObject, full_title: nil) } + + it { is_expected.to eq mock_default_title } + end + + context 'when the related object is nil' do + let(:mock_apo_obj) { nil } + + it { is_expected.to eq mock_default_title } + end + end +end diff --git a/spec/indexers/identity_metadata_datastream_indexer_spec.rb b/spec/indexers/identity_metadata_datastream_indexer_spec.rb new file mode 100644 index 000000000..69d1647e2 --- /dev/null +++ b/spec/indexers/identity_metadata_datastream_indexer_spec.rb @@ -0,0 +1,59 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe IdentityMetadataDatastreamIndexer do + let(:xml) do + <<~XML + + druid:rt923jk342 + item + google download barcode 36105049267078 + DOR + Squirrels of North America + Eder, Tamara, 1974- + STANFORD_342837261527 + 36105049267078 + 129483625 + 7f3da130-7b02-11de-8a39-0800200c9a66 + Google Books : Phase 1 + Google Books : Scan source STANFORD + Project : Beautiful Books + Registered By : blalbrit + DPG : Beautiful Books : Octavo : newpri + Remediated By : 4.15.4 + true + true + + XML + end + + let(:obj) { Dor::Item.new(pid: 'druid:rt923jk342') } + + let(:indexer) do + described_class.new(resource: obj) + end + + before do + obj.identityMetadata.content = xml + end + + describe '#to_solr' do + subject(:doc) { indexer.to_solr } + + it 'has the fields used by argo' do + expect(doc).to include( + 'barcode_id_ssim' => ['36105049267078'], + 'catkey_id_ssim' => ['129483625'], + 'dor_id_tesim' => %w[STANFORD_342837261527 36105049267078 129483625 + 7f3da130-7b02-11de-8a39-0800200c9a66], + 'identifier_ssim' => ['google:STANFORD_342837261527', 'barcode:36105049267078', + 'catkey:129483625', 'uuid:7f3da130-7b02-11de-8a39-0800200c9a66'], + 'identifier_tesim' => 
['google:STANFORD_342837261527', 'barcode:36105049267078', + 'catkey:129483625', 'uuid:7f3da130-7b02-11de-8a39-0800200c9a66'], + 'objectType_ssim' => ['item'], + 'source_id_ssim' => ['google:STANFORD_342837261527'] + ) + end + end +end diff --git a/spec/indexers/object_profile_indexer_spec.rb b/spec/indexers/object_profile_indexer_spec.rb new file mode 100644 index 000000000..13bb31eb7 --- /dev/null +++ b/spec/indexers/object_profile_indexer_spec.rb @@ -0,0 +1,29 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe ObjectProfileIndexer do + let(:obj) do + Dor::Item.new(label: 'test label') + end + + let(:indexer) do + described_class.new(resource: obj) + end + + describe '#to_solr' do + let(:indexer) do + CompositeIndexer.new( + described_class + ).new(resource: obj) + end + let(:doc) { indexer.to_solr } + + it 'makes a solr doc' do + expect(doc).to match a_hash_including( + 'obj_label_tesim' => ['test label'], + 'obj_label_ssim' => ['test label'] + ) + end + end +end diff --git a/spec/indexers/processable_indexer_spec.rb b/spec/indexers/processable_indexer_spec.rb new file mode 100644 index 000000000..00feb7e90 --- /dev/null +++ b/spec/indexers/processable_indexer_spec.rb @@ -0,0 +1,144 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe ProcessableIndexer do + let(:indexer) { described_class.new(resource: obj) } + + describe '#to_solr' do + let(:obj) do + instance_double(Dor::Item, + current_version: '4', + pid: '99', + modified_date: '1999-12-20') + end + + let(:solr_doc) { indexer.to_solr } + + context 'with rights set' do + let(:obj) do + instance_double(Dor::Item, + pid: '99', + rights: 'World', + modified_date: '1999-12-20', + current_version: '7') + end + + describe '#to_solr' do + let(:indexer) do + CompositeIndexer.new( + described_class + ).new(resource: obj) + end + + let(:status) do + instance_double(Dor::Workflow::Client::Status, + milestones: {}, + info: { status_code: 0 }, + display: 'v1 blah (parenthetical)', + display_simplified: 'blah') + end + + let(:workflow_client) { instance_double(Dor::Workflow::Client, status: status) } + + before do + allow(Dor::Workflow::Client).to receive(:new).and_return(workflow_client) + end + + it 'includes a rights facet' do + expect(solr_doc).to match a_hash_including('rights_ssim' => ['World']) + end + + it 'does not error if there is nothing in the datastream' do + allow(obj).to receive(:rightsMetadata).and_return(Dor::RightsMetadataDS.new) + expect { solr_doc }.not_to raise_error + end + end + end + + context 'with milestones' do + let(:dsxml) do + ' + + + Initial version + + + Replacing main PDF + + + Fixed title typo + + + Another typo + + + ' + end + + let(:milestones) do + [ + { milestone: 'published', at: Time.zone.parse('2012-01-26 21:06:54 -0800'), version: '2' }, + { milestone: 'opened', at: Time.zone.parse('2012-10-29 16:30:07 -0700'), version: '2' }, + { milestone: 'submitted', at: Time.zone.parse('2012-11-06 16:18:24 -0800'), version: '2' }, + { milestone: 'published', at: Time.zone.parse('2012-11-06 16:19:07 -0800'), version: '2' }, + { milestone: 'accessioned', at: Time.zone.parse('2012-11-06 16:19:10 -0800'), version: '2' }, + { milestone: 'described', at: Time.zone.parse('2012-11-06 16:19:15 -0800'), version: '2' }, + { milestone: 'opened', at: Time.zone.parse('2012-11-06 16:21:02 -0800'), version: nil }, + { milestone: 'submitted', at: Time.zone.parse('2012-11-06 16:30:03 -0800'), version: nil }, + { milestone: 'described', at: Time.zone.parse('2012-11-06 16:35:00 -0800'), 
version: nil },
+          { milestone: 'published', at: Time.zone.parse('2012-11-06 16:59:39 -0800'), version: '3' },
+          { milestone: 'published', at: Time.zone.parse('2012-11-06 16:59:39 -0800'), version: nil }
+        ]
+      end
+      let(:version_metadata) { Dor::VersionMetadataDS.from_xml(dsxml) }
+
+      let(:status) do
+        instance_double(Dor::Workflow::Client::Status,
+                        milestones: milestones,
+                        info: { status_code: 4 },
+                        display: 'v4 In accessioning (described, published)',
+                        display_simplified: 'In accessioning')
+      end
+
+      let(:workflow_client) { instance_double(Dor::Workflow::Client, status: status) }
+
+      before do
+        allow(Dor::Workflow::Client).to receive(:new).and_return(workflow_client)
+        allow(obj).to receive(:versionMetadata).and_return(version_metadata)
+      end
+
+      it 'indexes the status, processing status text, and earliest/latest opened and published dates' do
+        # published_dttsim should start with the earliest published date
+        expect(solr_doc['status_ssi']).to eq 'v4 In accessioning (described, published)'
+        expect(solr_doc['processing_status_text_ssi']).to eq 'In accessioning'
+        expect(solr_doc).to match a_hash_including('opened_dttsim' => including('2012-11-07T00:21:02Z'))
+        expect(solr_doc['published_earliest_dttsi']).to eq('2012-01-27T05:06:54Z')
+        expect(solr_doc['published_latest_dttsi']).to eq('2012-11-07T00:59:39Z')
+        expect(solr_doc['published_dttsim'].first).to eq(solr_doc['published_earliest_dttsi'])
+        expect(solr_doc['published_dttsim'].last).to eq(solr_doc['published_latest_dttsi'])
+        expect(solr_doc['published_dttsim'].size).to eq(3) # not 4: one duplicate value is removed
+        expect(solr_doc['opened_earliest_dttsi']).to eq('2012-10-29T23:30:07Z') # 2012-10-29T16:30:07-0700
+        expect(solr_doc['opened_latest_dttsi']).to eq('2012-11-07T00:21:02Z') # 2012-11-06T16:21:02-0800
+      end
+
+      context 'when a new version has not been opened' do
+        let(:milestones) do
+          [{ milestone: 'submitted', at: Time.zone.parse('2012-11-06 16:30:03 -0800'), version: nil },
+           { milestone: 'described', at: Time.zone.parse('2012-11-06 16:35:00 -0800'), version: nil },
+           { milestone: 'published', at: Time.zone.parse('2012-11-06 16:59:39 -0800'), version: '3' },
+           { milestone: 'published', at: Time.zone.parse('2012-11-06 16:59:39 -0800'), version: nil }]
+        end
+
+        it 'does not index opened dates' do
+          expect(solr_doc['opened_dttsim']).to be_nil
+        end
+      end
+
+      it 'creates a modified_latest date field' do
+        # the facet field should have a date in it.
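+        # i.e. a full ISO 8601 UTC timestamp (YYYY-MM-DDThh:mm:ssZ), presumably derived from the stubbed modified_date of '1999-12-20'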
+ expect(solr_doc['modified_latest_dttsi']).to match(/^\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\dZ$/) + end + end + end +end diff --git a/spec/indexers/releasable_indexer_spec.rb b/spec/indexers/releasable_indexer_spec.rb new file mode 100644 index 000000000..3d3681214 --- /dev/null +++ b/spec/indexers/releasable_indexer_spec.rb @@ -0,0 +1,29 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe ReleasableIndexer do + let(:obj) { instance_double(Dor::Abstract, pid: 'druid:pz263ny9658') } + + describe 'to_solr' do + let(:doc) { described_class.new(resource: obj).to_solr } + let(:released_for_info) do + { + 'Project' => { 'release' => true }, + 'test_target' => { 'release' => true }, + 'test_nontarget' => { 'release' => false } + } + end + let(:released_to_field_name) { Solrizer.solr_name('released_to', :symbol) } + let(:object_client) { instance_double(Dor::Services::Client::Object, release_tags: tags_client) } + let(:tags_client) { instance_double(Dor::Services::Client::ReleaseTags, list: released_for_info) } + + before do + allow(Dor::Services::Client).to receive(:object).and_return(object_client) + end + + it 'indexes release tags' do + expect(doc).to eq(released_to_field_name => %w[Project test_target]) + end + end +end diff --git a/spec/indexers/rights_metadata_datastream_indexer_spec.rb b/spec/indexers/rights_metadata_datastream_indexer_spec.rb new file mode 100644 index 000000000..a514ed8a2 --- /dev/null +++ b/spec/indexers/rights_metadata_datastream_indexer_spec.rb @@ -0,0 +1,123 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe RightsMetadataDatastreamIndexer do + let(:xml) do + <<~XML + + + + + + + + + + + + + + Official WTO documents are free for public use. + + by-nc-nd + + + Copyright © World Trade Organization + + + XML + end + + let(:obj) { Dor::Item.new(pid: 'druid:rt923jk342') } + let(:rights_md_ds) { obj.rightsMetadata } + + let(:indexer) do + described_class.new(resource: obj) + end + + before do + rights_md_ds.content = xml + end + + describe '#to_solr' do + subject(:doc) { indexer.to_solr } + + it 'has the fields used by argo' do + expect(doc).to include( + 'copyright_ssim' => ['Copyright © World Trade Organization'], + 'use_statement_ssim' => ['Official WTO documents are free for public use.'], + 'use_license_machine_ssi' => 'by-nc-nd', + 'rights_descriptions_ssim' => ['world'] + ) + end + + describe 'legacy tests to_solr' do + let(:mock_dra_obj) { instance_double(Dor::RightsAuth, index_elements: index_elements) } + + before do + allow(rights_md_ds).to receive(:dra_object).and_return(mock_dra_obj) + end + + context 'when access is restricted' do + let(:index_elements) do + { + primary: 'access_restricted', + errors: [], + terms: [], + obj_locations_qualified: [{ location: 'someplace', rule: 'somerule' }], + file_groups_qualified: [{ group: 'somegroup', rule: 'someotherrule' }] + } + end + + it 'filters access_restricted from what gets aggregated into rights_descriptions_ssim' do + expect(doc).to match a_hash_including( + 'rights_primary_ssi' => 'access_restricted', + 'rights_descriptions_ssim' => ['location: someplace (somerule)', 'somegroup (file) (someotherrule)'] + ) + end + end + + context 'when it is world qualified' do + let(:index_elements) do + { + primary: 'world_qualified', + errors: [], + terms: [], + obj_world_qualified: [{ rule: 'somerule' }] + } + end + + it 'filters world_qualified from what gets aggregated into rights_descriptions_ssim' do + expect(doc).to match a_hash_including( + 'rights_primary_ssi' => 
'world_qualified', + 'rights_descriptions_ssim' => ['world (somerule)'] + ) + end + end + + context 'with file_rights' do + let(:index_elements) do + { + primary: 'access_restricted', + errors: [], + terms: [], + obj_locations: ['location'], + file_locations: ['file_specific_location'], + obj_agents: ['agent'], + file_agents: ['file_specific_agent'] + } + end + + it 'includes the simple fields that are present' do + expect(doc).to match a_hash_including( + 'obj_rights_locations_ssim' => ['location'], + 'file_rights_locations_ssim' => ['file_specific_location'], + 'obj_rights_agents_ssim' => ['agent'], + 'file_rights_agents_ssim' => ['file_specific_agent'] + ) + end + end + end + end +end diff --git a/spec/indexers/role_metadata_datastream_indexer_spec.rb b/spec/indexers/role_metadata_datastream_indexer_spec.rb new file mode 100644 index 000000000..dfab5148f --- /dev/null +++ b/spec/indexers/role_metadata_datastream_indexer_spec.rb @@ -0,0 +1,58 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe RoleMetadataDatastreamIndexer do + let(:obj) { Dor::AdminPolicyObject.new } + + let(:indexer) do + described_class.new(resource: obj) + end + + before do + obj.roleMetadata.content = xml + end + + describe '#to_solr' do + subject(:doc) { indexer.to_solr } + + context 'when there are non-Hydrus roles' do + let(:xml) do + <<~XML + + + + + dlss:dor-admin + + + + XML + end + + it 'has the fields used by argo' do + expect(doc['apo_register_permissions_ssim']).to eq ['workgroup:dlss:dor-admin'] + expect(doc['apo_register_permissions_tesim']).to eq ['workgroup:dlss:dor-admin'] + end + end + + context 'when there are hydrus roles' do + let(:xml) do + <<~XML + + + + dlss:dor-admin + + + + XML + end + + it 'does not index apo_register_permissions' do + expect(doc).not_to have_key('apo_register_permissions_ssim') + expect(doc).not_to have_key('apo_register_permissions_tesim') + end + end + end +end diff --git a/spec/indexers/workflow_indexer_spec.rb b/spec/indexers/workflow_indexer_spec.rb new file mode 100644 index 000000000..344d458b8 --- /dev/null +++ b/spec/indexers/workflow_indexer_spec.rb @@ -0,0 +1,179 @@ +# frozen_string_literal: true + +require 'spec_helper' +# require 'webmock/rspec' + +RSpec.describe WorkflowIndexer do + before do + stub_request(:get, 'https://localhost/workflow_templates/accessionWF') + .to_return(status: 200, body: workflow_template_json) + end + + let(:document) { Dor::Workflow::Response::Workflow.new(xml: xml) } + let(:indexer) { described_class.new(workflow: document) } + + let(:workflow_template_json) do + '{"processes":[{"name":"hello"},{"name":"goodbye"},{"name":"technical-metadata"},{"name":"some-other-step"}]}' + end + + let(:step1) { 'hello' } + let(:step2) { 'goodbye' } + let(:step3) { 'technical-metadata' } + let(:step4) { 'some-other-step' } + + describe '#to_solr' do + subject(:solr_doc) { indexer.to_solr.to_h } + + context 'when not all of the steps are completed' do + let(:xml) do + <<-XML + + + + + + XML + end + + it 'creates the workflow_status field with the workflow repository included, and indicates that the workflow is still active' do + expect(solr_doc[Solrizer.solr_name('workflow_status', :symbol)].first).to eq('accessionWF|active|0') + end + end + + context 'when the template has been changed to have new steps, but the workflow service indicates all steps are completed' do + let(:workflow_template_json) do + '{"processes":[{"name":"hello"},{"name":"goodbye"},{"name":"technical-metadata"},{"name":"some-other-step"}]}' + end + + 
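+      # Scenario: the stubbed template above lists four steps, but (per this context) the workflow
+      # service reports every step it knows about as completed, so the status should still roll up
+      # to completed rather than active.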
let(:xml) do + <<-XML + + + + + + XML + end + + it 'indicates that the workflow is complete' do + expect(solr_doc[Solrizer.solr_name('workflow_status', :symbol)].first).to eq('accessionWF|completed|0') + end + end + + context 'when all steps are completed or skipped' do + let(:xml) do + <<-XML + + + + + + + + XML + end + + it 'indexes the right workflow status (completed)' do + expect(solr_doc).to match a_hash_including('workflow_status_ssim' => ['accessionWF|completed|0']) + end + end + + context 'when the xml has dates for completed and errored steps' do + let(:xml) do + <<-XML + + + + + + + + XML + end + + it 'indexes the iso8601 UTC dates' do + expect(solr_doc).to match a_hash_including('wf_accessionWF_start-accession_dttsi' => '2012-11-07T00:18:24Z') + expect(solr_doc).to match a_hash_including('wf_accessionWF_technical-metadata_dttsi' => '2012-11-07T00:18:58Z') + end + end + + context 'when the xml does not have dates for completed and errored steps' do + let(:xml) do + <<-XML + + + + + + + + XML + end + + it 'only indexes the dates on steps that include a date' do + expect(solr_doc).to match a_hash_including('wf_accessionWF_technical-metadata_dttsi') + expect(solr_doc).not_to match a_hash_including('wf_accessionWF_start_dttsi') + expect(solr_doc).not_to match a_hash_including('wf_accessionWF_goodbye_dttsi') + end + end + + context 'when there are error messages' do + let(:xml) do + <<-XML + + + + + + XML + end + + let(:wf_error) { solr_doc[Solrizer.solr_name('wf_error', :symbol)] } + + it 'indexes the error messages' do + expect(wf_error).to eq ['accessionWF:technical-metadata:druid:gv054hp4128 - Item error; caused by 413 Request Entity Too Large:'] + end + end + + context 'when the error messages are crazy long' do + let(:error_length) { 40_000 } + let(:error) { (0...error_length).map { rand(65..90).chr }.join } + let(:xml) do + <<-XML + + + + + + XML + end + + let(:wf_error) { solr_doc[Solrizer.solr_name('wf_error', :symbol)] } + + it "truncates the error messages to below Solr's limit" do + # 31 is the leader + expect(wf_error.first.length).to be < 32_766 + end + end + end +end diff --git a/spec/indexers/workflows_indexer_spec.rb b/spec/indexers/workflows_indexer_spec.rb new file mode 100644 index 000000000..f98333c77 --- /dev/null +++ b/spec/indexers/workflows_indexer_spec.rb @@ -0,0 +1,138 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe WorkflowsIndexer do + let(:obj) { instance_double(Dor::Item, pid: 'druid:ab123cd4567') } + + let(:indexer) { described_class.new(resource: obj) } + + describe '#to_solr' do + let(:solr_doc) { indexer.to_solr } + let(:xml) do + <<~XML + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + XML + end + + let(:accession_json) do + { 'processes' => [ + { 'name' => 'start-accession' }, + { 'name' => 'descriptive-metadata' }, + { 'name' => 'rights-metadata' }, + { 'name' => 'content-metadata' }, + { 'name' => 'technical-metadata' }, + { 'name' => 'remediate-object' }, + { 'name' => 'shelve' }, + { 'name' => 'publish' }, + { 'name' => 'provenance-metadata' }, + { 'name' => 'sdr-ingest-transfer' }, + { 'name' => 'sdr-ingest-received' }, + { 'name' => 'reset-workspace' }, + { 'name' => 'end-accession' } + ] } + end + + let(:assembly_json) do + { 'processes' => [ + { 'name' => 'start-assembly' }, + { 'name' => 'content-metadata-create' }, + { 'name' => 'jp2-create' }, + { 'name' => 'checksum-compute' }, + { 'name' => 'exif-collect' }, + { 'name' => 'accessioning-initiate' } + ] } + end + 
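+    # Each workflow named in the all_workflows XML has its template stubbed on the workflow client
+    # (see the before block below), so a workflow_status_ssim entry can be computed for accessionWF,
+    # assemblyWF, disseminationWF, hydrusAssemblyWF, and versioningWF without calling the live workflow service.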
+    let(:dissemination_json) do
+      {
+        'processes' => [
+          { 'name' => 'cleanup' }
+        ]
+      }
+    end
+
+    let(:hydrus_json) do
+      { 'processes' => [
+        { 'name' => 'start-deposit' },
+        { 'name' => 'submit' },
+        { 'name' => 'approve' },
+        { 'name' => 'start-assembly' }
+      ] }
+    end
+
+    let(:versioning_json) do
+      { 'processes' => [
+        { 'name' => 'start-version' },
+        { 'name' => 'submit-version' },
+        { 'name' => 'start-accession' }
+      ] }
+    end
+    let(:workflow_client) { instance_double(Dor::Workflow::Client, workflow_routes: workflow_routes) }
+    let(:workflow_routes) do
+      instance_double(Dor::Workflow::Client::WorkflowRoutes, all_workflows: Dor::Workflow::Response::Workflows.new(xml: xml))
+    end
+
+    before do
+      allow(Dor::Workflow::Client).to receive(:new).and_return(workflow_client)
+
+      allow(workflow_client).to receive(:workflow_template).with('accessionWF').and_return(accession_json)
+      allow(workflow_client).to receive(:workflow_template).with('assemblyWF').and_return(assembly_json)
+      allow(workflow_client).to receive(:workflow_template).with('disseminationWF').and_return(dissemination_json)
+      allow(workflow_client).to receive(:workflow_template).with('hydrusAssemblyWF').and_return(hydrus_json)
+      allow(workflow_client).to receive(:workflow_template).with('versioningWF').and_return(versioning_json)
+    end
+
+    describe 'workflow_status_ssim' do
+      subject { solr_doc['workflow_status_ssim'] }
+
+      it { is_expected.to eq ['accessionWF|completed|0', 'assemblyWF|active|1', 'disseminationWF|completed|0', 'hydrusAssemblyWF|completed|0', 'versioningWF|completed|0'] }
+    end
+  end
+end
diff --git a/spec/spec_helper.rb b/spec/spec_helper.rb
index 1ab37d282..ac1149f61 100644
--- a/spec/spec_helper.rb
+++ b/spec/spec_helper.rb
@@ -15,6 +15,8 @@
 require 'equivalent-xml/rspec_matchers'
 require 'factory_bot'
 require 'hydrus'
+require 'webmock/rspec'
+WebMock.disable_net_connect!(allow_localhost: true, allow: ['https://purl.stanford.edu'])
 # Requires supporting ruby files with custom matchers and macros, etc, in
 # spec/support/ and its subdirectories. Files matching `spec/**/*_spec.rb` are