From 6cb769f9a2d51a9ea5623949146c3397a61324bc Mon Sep 17 00:00:00 2001 From: Justin Coyne Date: Mon, 14 Dec 2020 13:46:40 -0600 Subject: [PATCH 01/24] Add a script to export collections --- bin/export-collections | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100755 bin/export-collections diff --git a/bin/export-collections b/bin/export-collections new file mode 100755 index 000000000..fcf6f37a8 --- /dev/null +++ b/bin/export-collections @@ -0,0 +1,28 @@ +#!/usr/bin/env ruby +# frozen_string_literal: true + +require_relative '../config/environment' +list = Hydrus::Collection.where(pid: 'druid:vq434wh5503') # Hydrus::Collection.all + +def serialize(coll) + creator = coll.events.ng_xml.xpath('//event[text()="Collection created"]/@who').to_s + apo = coll.apo + { + druid: coll.id, + creator: { sunetid: creator }, + name: coll.title, + visibility_option_value: coll.visibility_option_value, + embargo_option: coll.embargo_option, + embargo_terms: coll.embargo_terms, + requires_human_approval: coll.requires_human_approval, + license_option: coll.license_option, + object_status: coll.object_status, + managers: apo.persons_with_role('hydrus-collection-manager'), + depositors: apo.persons_with_role('hydrus-collection-item-depositor') + apo.persons_with_role('hydrus-collection-depositor'), + reviewers: apo.persons_with_role('hydrus-collection-reviewer') + } +end + +warn "Exporting #{list.count} collections" + +list.map { |collection| serialize(collection) } From 4841e193374f624eac3cdfbf17bd4094a7c2e967 Mon Sep 17 00:00:00 2001 From: Justin Coyne Date: Mon, 14 Dec 2020 13:55:59 -0600 Subject: [PATCH 02/24] Export all the collections --- bin/export-collections | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/bin/export-collections b/bin/export-collections index fcf6f37a8..4b5004673 100755 --- a/bin/export-collections +++ b/bin/export-collections @@ -1,8 +1,11 @@ #!/usr/bin/env ruby # frozen_string_literal: true +# Usage: +# 
RAILS_ENV=production bin/export-collections > collections.jsonl + require_relative '../config/environment' -list = Hydrus::Collection.where(pid: 'druid:vq434wh5503') # Hydrus::Collection.all +list = Hydrus::Collection.all def serialize(coll) creator = coll.events.ng_xml.xpath('//event[text()="Collection created"]/@who').to_s @@ -25,4 +28,4 @@ end warn "Exporting #{list.count} collections" -list.map { |collection| serialize(collection) } +puts list.map { |collection| serialize(collection).to_json } From f42d2ea2ca0d473ef4ed5cdcb43c04abfb2eda32 Mon Sep 17 00:00:00 2001 From: Justin Coyne Date: Mon, 14 Dec 2020 14:18:48 -0600 Subject: [PATCH 03/24] Add required columns --- bin/export-collections | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bin/export-collections b/bin/export-collections index 4b5004673..da29c3d8b 100755 --- a/bin/export-collections +++ b/bin/export-collections @@ -14,6 +14,8 @@ def serialize(coll) druid: coll.id, creator: { sunetid: creator }, name: coll.title, + description: coll.abstract, + contact_email: coll.contact, visibility_option_value: coll.visibility_option_value, embargo_option: coll.embargo_option, embargo_terms: coll.embargo_terms, From c1cb30d85f4ba784a4cd60eed84fe30b9958dbcb Mon Sep 17 00:00:00 2001 From: Justin Coyne Date: Mon, 14 Dec 2020 14:25:38 -0600 Subject: [PATCH 04/24] Export the version --- bin/export-collections | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/export-collections b/bin/export-collections index da29c3d8b..6cc4a9fef 100755 --- a/bin/export-collections +++ b/bin/export-collections @@ -12,6 +12,7 @@ def serialize(coll) apo = coll.apo { druid: coll.id, + version: coll.current_version.to_i, creator: { sunetid: creator }, name: coll.title, description: coll.abstract, From 45dd09c3ebfe9a7651d6ca80c4e89c6b3c625d2c Mon Sep 17 00:00:00 2001 From: Justin Coyne Date: Mon, 14 Dec 2020 15:03:18 -0600 Subject: [PATCH 05/24] Warn on invalid APOs --- bin/export-collections | 6 ++++++ 1 file changed, 6 insertions(+) 
diff --git a/bin/export-collections b/bin/export-collections index 6cc4a9fef..ee7646be4 100755 --- a/bin/export-collections +++ b/bin/export-collections @@ -10,6 +10,12 @@ list = Hydrus::Collection.all def serialize(coll) creator = coll.events.ng_xml.xpath('//event[text()="Collection created"]/@who').to_s apo = coll.apo + + unless apo.class == Hydrus::AdminPolicyObject + warn "The APO (#{apo.id}) for collection #{coll.id} is not a Hydrus::APO" + return {} + end + { druid: coll.id, version: coll.current_version.to_i, From d4485fbe2c3ce13acda894f0b31a5829fe0d0ef4 Mon Sep 17 00:00:00 2001 From: Justin Coyne Date: Mon, 14 Dec 2020 16:32:08 -0600 Subject: [PATCH 06/24] Add related items to the export list --- bin/export-collections | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/bin/export-collections b/bin/export-collections index ee7646be4..7df9686d6 100755 --- a/bin/export-collections +++ b/bin/export-collections @@ -4,18 +4,23 @@ # Usage: # RAILS_ENV=production bin/export-collections > collections.jsonl +GRAVEYARD_APO = 'druid:kg712km1576' +UNIVERSITY_ARCHIVES_APO = 'druid:yf767bj4831' + require_relative '../config/environment' list = Hydrus::Collection.all def serialize(coll) creator = coll.events.ng_xml.xpath('//event[text()="Collection created"]/@who').to_s - apo = coll.apo - - unless apo.class == Hydrus::AdminPolicyObject - warn "The APO (#{apo.id}) for collection #{coll.id} is not a Hydrus::APO" - return {} + return if coll.apo_id == GRAVEYARD_APO + if coll.apo_id == UNIVERSITY_ARCHIVES_APO + warn "Collection #{collection.id} is in the Univrsity Archives APO, which is not a Hydrus::APO" + return end + apo = coll.apo + related_items = coll.related_items.map { |rel| { link_title: rel.title, url: rel.url } } + { druid: coll.id, version: coll.current_version.to_i, @@ -31,10 +36,11 @@ def serialize(coll) object_status: coll.object_status, managers: apo.persons_with_role('hydrus-collection-manager'), depositors: 
apo.persons_with_role('hydrus-collection-item-depositor') + apo.persons_with_role('hydrus-collection-depositor'), - reviewers: apo.persons_with_role('hydrus-collection-reviewer') + reviewers: apo.persons_with_role('hydrus-collection-reviewer'), + related_items: related_items } end warn "Exporting #{list.count} collections" -puts list.map { |collection| serialize(collection).to_json } +puts list.map { |collection| serialize(collection).to_json }.compact From 7b6314b20ea01b091780f502184f56a41290e58f Mon Sep 17 00:00:00 2001 From: Justin Coyne Date: Mon, 14 Dec 2020 16:58:39 -0600 Subject: [PATCH 07/24] Fix apo identifier method --- bin/export-collections | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/export-collections b/bin/export-collections index 7df9686d6..20de76ac7 100755 --- a/bin/export-collections +++ b/bin/export-collections @@ -12,9 +12,9 @@ list = Hydrus::Collection.all def serialize(coll) creator = coll.events.ng_xml.xpath('//event[text()="Collection created"]/@who').to_s - return if coll.apo_id == GRAVEYARD_APO - if coll.apo_id == UNIVERSITY_ARCHIVES_APO - warn "Collection #{collection.id} is in the Univrsity Archives APO, which is not a Hydrus::APO" + return if coll.admin_policy_object_id == GRAVEYARD_APO + if coll.admin_policy_object_id == UNIVERSITY_ARCHIVES_APO + warn "Collection #{coll.id} is in the University Archives APO, which is not a Hydrus::APO" return end From 8d9d3d85292b96e490b5d6bb31f4709976f8beda Mon Sep 17 00:00:00 2001 From: Justin Coyne Date: Tue, 15 Dec 2020 08:29:37 -0600 Subject: [PATCH 08/24] Don't export empty related items --- bin/export-collections | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/export-collections b/bin/export-collections index 20de76ac7..1f99ed8ff 100755 --- a/bin/export-collections +++ b/bin/export-collections @@ -19,7 +19,7 @@ def serialize(coll) end apo = coll.apo - related_items = coll.related_items.map { |rel| { link_title: rel.title, url: rel.url } } + 
related_items = coll.related_items.map { |rel| { link_title: rel.title.presence, url: rel.url.presence }.compact }.reject(&:blank?) { druid: coll.id, From 34afe25660b922b4a3886c280cff1b4470b08e49 Mon Sep 17 00:00:00 2001 From: Justin Coyne Date: Tue, 15 Dec 2020 09:46:55 -0600 Subject: [PATCH 09/24] Export more collection fields --- bin/export-collections | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bin/export-collections b/bin/export-collections index 1f99ed8ff..e7599a163 100755 --- a/bin/export-collections +++ b/bin/export-collections @@ -28,11 +28,13 @@ def serialize(coll) name: coll.title, description: coll.abstract, contact_email: coll.contact, - visibility_option_value: coll.visibility_option_value, + visibility_option: coll.visibility_option, + visibility: coll.visibility, embargo_option: coll.embargo_option, embargo_terms: coll.embargo_terms, requires_human_approval: coll.requires_human_approval, license_option: coll.license_option, + license: coll.license, object_status: coll.object_status, managers: apo.persons_with_role('hydrus-collection-manager'), depositors: apo.persons_with_role('hydrus-collection-item-depositor') + apo.persons_with_role('hydrus-collection-depositor'), From 9084e88b7322cc11a148c88a6428570bd4d2181e Mon Sep 17 00:00:00 2001 From: Justin Coyne Date: Tue, 15 Dec 2020 13:19:26 -0600 Subject: [PATCH 10/24] Add a script for exporting items --- bin/export-items | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100755 bin/export-items diff --git a/bin/export-items b/bin/export-items new file mode 100755 index 000000000..54bf8aa0d --- /dev/null +++ b/bin/export-items @@ -0,0 +1,35 @@ +#!/usr/bin/env ruby +# frozen_string_literal: true + +# Usage: +# RAILS_ENV=production bin/export-items > items.jsonl + +require_relative '../config/environment' +list = Hydrus::Item.all + +def serialize(item) + creator = item.roleMetadata.ng_xml.xpath('//role/person/identifier').text + related_items = 
item.related_items.map { |rel| { link_title: rel.title.presence, url: rel.url.presence }.compact }.reject(&:blank?) + + { + druid: item.id, + version: item.current_version.to_i, + creator: { sunetid: creator }, + title: item.title, + abstract: item.abstract, + contact_email: item.contact, + collection: item.collection_id, + visibility: item.visibility.first, + license: item.license, + embargo_release_date: item.rmd_embargo_release_date, + date_created: item.date_created, + object_status: item.object_status, + item_type: item.item_type, + citation: item.preferred_citation, + related_items: related_items + } +end + +warn "Exporting #{list.count} items" + +puts list.map { |item| serialize(item).to_json }.compact From cde889af5f813cfa312bb48e140a43ded410449e Mon Sep 17 00:00:00 2001 From: Justin Coyne Date: Tue, 15 Dec 2020 14:56:18 -0600 Subject: [PATCH 11/24] Stream output and handle errors --- bin/export-items | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/bin/export-items b/bin/export-items index 54bf8aa0d..86780804d 100755 --- a/bin/export-items +++ b/bin/export-items @@ -31,5 +31,14 @@ def serialize(item) end warn "Exporting #{list.count} items" - -puts list.map { |item| serialize(item).to_json }.compact +count = 0 +list.find_each do |item| + count += 1 + warn count + begin + attributes = serialize(item) + puts attributes.compact.to_json + rescue => e + warn "Error with #{item.pid}. 
#{e.message}" + end +end From f5ec36f9d03dbee80936ed95928add41010ec625 Mon Sep 17 00:00:00 2001 From: Justin Coyne Date: Tue, 15 Dec 2020 16:45:52 -0600 Subject: [PATCH 12/24] Export contributors --- bin/export-items | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bin/export-items b/bin/export-items index 86780804d..dfbf72ec0 100755 --- a/bin/export-items +++ b/bin/export-items @@ -10,6 +10,9 @@ list = Hydrus::Item.all def serialize(item) creator = item.roleMetadata.ng_xml.xpath('//role/person/identifier').text related_items = item.related_items.map { |rel| { link_title: rel.title.presence, url: rel.url.presence }.compact }.reject(&:blank?) + contributors = item.contributors. + map { |contrib| { full_name: contrib.name, role: contrib.role, name_type: contrib.name_type }}. + reject { |contrib| contrib[:full_name].blank? } { druid: item.id, @@ -26,7 +29,8 @@ def serialize(item) object_status: item.object_status, item_type: item.item_type, citation: item.preferred_citation, - related_items: related_items + related_items: related_items, + contributors: contributors } end From c753a21013c7fa36de66c2a0d732c398ff13b2d5 Mon Sep 17 00:00:00 2001 From: Justin Coyne Date: Wed, 16 Dec 2020 12:57:37 -0600 Subject: [PATCH 13/24] Export timestamps and files --- bin/export-collections | 4 +++- bin/export-items | 26 +++++++++++++++++++++----- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/bin/export-collections b/bin/export-collections index e7599a163..dc8fb64ae 100755 --- a/bin/export-collections +++ b/bin/export-collections @@ -39,7 +39,9 @@ def serialize(coll) managers: apo.persons_with_role('hydrus-collection-manager'), depositors: apo.persons_with_role('hydrus-collection-item-depositor') + apo.persons_with_role('hydrus-collection-depositor'), reviewers: apo.persons_with_role('hydrus-collection-reviewer'), - related_items: related_items + related_items: related_items, + created_at: item.create_date, + updated_at: item.modified_date } end 
diff --git a/bin/export-items b/bin/export-items index dfbf72ec0..47d20a8d3 100755 --- a/bin/export-items +++ b/bin/export-items @@ -7,13 +7,26 @@ require_relative '../config/environment' list = Hydrus::Item.all +def contributors(item) + item.contributors. + map { |contrib| { full_name: contrib.name, role: contrib.role, name_type: contrib.name_type }}. + reject { |contrib| contrib[:full_name].blank? } +end + +def files(item) + item.files. + map do |object_file| + { + path: File.realdirpath(object_file.current_path), + label: object_file.label + } + end +end + + def serialize(item) creator = item.roleMetadata.ng_xml.xpath('//role/person/identifier').text related_items = item.related_items.map { |rel| { link_title: rel.title.presence, url: rel.url.presence }.compact }.reject(&:blank?) - contributors = item.contributors. - map { |contrib| { full_name: contrib.name, role: contrib.role, name_type: contrib.name_type }}. - reject { |contrib| contrib[:full_name].blank? } - { druid: item.id, version: item.current_version.to_i, @@ -30,7 +43,10 @@ def serialize(item) item_type: item.item_type, citation: item.preferred_citation, related_items: related_items, - contributors: contributors + contributors: contributors(item), + files: files(item), + created_at: item.create_date, + updated_at: item.modified_date } end From 0e90294d03e8e759f96369b162821310b967575f Mon Sep 17 00:00:00 2001 From: Justin Coyne Date: Wed, 16 Dec 2020 15:21:57 -0600 Subject: [PATCH 14/24] Export keywords --- bin/export-items | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/export-items b/bin/export-items index 47d20a8d3..b7f4fe6c8 100755 --- a/bin/export-items +++ b/bin/export-items @@ -44,6 +44,7 @@ def serialize(item) citation: item.preferred_citation, related_items: related_items, contributors: contributors(item), + keywords: item.keywords, files: files(item), created_at: item.create_date, updated_at: item.modified_date From 8dad1b752e56a509dea1a5cb2eee1ac073ddeb11 Mon Sep 17 00:00:00 2001 From: 
Justin Coyne Date: Wed, 16 Dec 2020 15:27:48 -0600 Subject: [PATCH 15/24] Export related citations --- bin/export-items | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/export-items b/bin/export-items index b7f4fe6c8..32d6b1e4b 100755 --- a/bin/export-items +++ b/bin/export-items @@ -43,6 +43,7 @@ def serialize(item) item_type: item.item_type, citation: item.preferred_citation, related_items: related_items, + related_citations: item.related_citation, contributors: contributors(item), keywords: item.keywords, files: files(item), From c8aefde5de868f50d5fbdf6e5fba2a97f841a751 Mon Sep 17 00:00:00 2001 From: "Michael J. Giarlo" Date: Wed, 17 Feb 2021 13:39:07 -0800 Subject: [PATCH 16/24] Allow for filtering which collections and items are exported By convention, use a file called "collection_druids.txt", which is a plain-text file containing a number of bare druids, one per line. This file is produced by copying a column of druids out of a spreadsheet maintained by the PO. This will filter both the collections and the items that are exported. --- bin/export-collections | 10 +++++++++- bin/export-items | 11 ++++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/bin/export-collections b/bin/export-collections index dc8fb64ae..a4ff80910 100755 --- a/bin/export-collections +++ b/bin/export-collections @@ -4,11 +4,19 @@ # Usage: # RAILS_ENV=production bin/export-collections > collections.jsonl +COLLECTION_DRUIDS_LIST = 'collection_druids.txt' GRAVEYARD_APO = 'druid:kg712km1576' UNIVERSITY_ARCHIVES_APO = 'druid:yf767bj4831' require_relative '../config/environment' -list = Hydrus::Collection.all + +collection_druids = File.exist?(COLLECTION_DRUIDS_LIST) ? + File.read(COLLECTION_DRUIDS_LIST).split.map { |bare_druid| "druid:#{bare_druid}" } : + [] + +list = collection_druids.any? ? 
+ Hydrus::Collection.find(collection_druids) : + Hydrus::Collection.all def serialize(coll) creator = coll.events.ng_xml.xpath('//event[text()="Collection created"]/@who').to_s diff --git a/bin/export-items b/bin/export-items index 32d6b1e4b..9f5148c98 100755 --- a/bin/export-items +++ b/bin/export-items @@ -4,8 +4,17 @@ # Usage: # RAILS_ENV=production bin/export-items > items.jsonl +COLLECTION_DRUIDS_LIST = 'collection_druids.txt' + require_relative '../config/environment' -list = Hydrus::Item.all + +collection_druids = File.exist?(COLLECTION_DRUIDS_LIST) ? + File.read(COLLECTION_DRUIDS_LIST).split.map { |bare_druid| "druid:#{bare_druid}" } : + [] + +list = collection_druids.any? ? + Hydrus::Collection.find(collection_druids).flat_map(&:items) : + Hydrus::Item.all def contributors(item) item.contributors. From 366dbb1c5928f6c1c3916d9606cd887a81791e37 Mon Sep 17 00:00:00 2001 From: "Michael J. Giarlo" Date: Wed, 17 Feb 2021 16:37:01 -0800 Subject: [PATCH 17/24] Fix bug in collection exporter (referencing an item) --- bin/export-collections | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/export-collections b/bin/export-collections index a4ff80910..e7cb7b41a 100755 --- a/bin/export-collections +++ b/bin/export-collections @@ -48,8 +48,8 @@ def serialize(coll) depositors: apo.persons_with_role('hydrus-collection-item-depositor') + apo.persons_with_role('hydrus-collection-depositor'), reviewers: apo.persons_with_role('hydrus-collection-reviewer'), related_items: related_items, - created_at: item.create_date, - updated_at: item.modified_date + created_at: coll.create_date, + updated_at: coll.modified_date } end From c47a429eb1d434a2f980c43b27be6f9974d68638 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Giarlo" Date: Wed, 17 Feb 2021 16:38:10 -0800 Subject: [PATCH 18/24] Fix bug with item exporter such that the list is always an AF::Relation Prior to this, it could be an array which does not respond to #find_each and winds up loading tons of data into memory. Monkeypatch AF to allow POSTing to Solr. Confirmed working. And prefer `#each` over `#find_each` which, when chained with `#where`, has unpredictable results. (Instead of operating on the ~4K items in the relation returned by `#where`, `#find_each` operated on all instances of the model.) To support this, add another AF monkeypatch to use HTTP POST. --- bin/export-items | 46 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/bin/export-items b/bin/export-items index 9f5148c98..1b77ca333 100755 --- a/bin/export-items +++ b/bin/export-items @@ -8,13 +8,53 @@ COLLECTION_DRUIDS_LIST = 'collection_druids.txt' require_relative '../config/environment' +# Monkey-patch AF to allow using HTTP POST (for querying items by their collection) +module ActiveFedora + class SolrService + def self.query(query, args={}) + raw = args.delete(:raw) + args = args.merge(:q=>query, :qt=>'standard') + result = SolrService.instance.conn.post('select', :data=>args) + return result if raw + result['response']['docs'] + end + end + + module FinderMethods + def find_in_batches conditions, opts={} + data = { :q => create_query(conditions) } + opts[:qt] = @klass.solr_query_handler + #set default sort to created date ascending + unless opts[:sort].present? + opts[:sort]= @klass.default_sort_params + end + + batch_size = opts.delete(:batch_size) || 1000 + + counter = 0 + begin + counter += 1 + response = ActiveFedora::SolrService.instance.conn.paginate counter, batch_size, "select", { :method => :post, :params => opts, :data => data } + docs = response["response"]["docs"] + yield docs + end while docs.has_next? + end + end +end + collection_druids = File.exist?(COLLECTION_DRUIDS_LIST) ? 
File.read(COLLECTION_DRUIDS_LIST).split.map { |bare_druid| "druid:#{bare_druid}" } : [] -list = collection_druids.any? ? - Hydrus::Collection.find(collection_druids).flat_map(&:items) : +list = if collection_druids.any? + where_collection_in_list_query = ActiveFedora::SolrService.construct_query_for_rel( + collection_druids.map { |druid| [:is_member_of_collection, "info:fedora/#{druid}"] }, + ' OR ' + ) + Hydrus::Item.where(where_collection_in_list_query) + else Hydrus::Item.all + end def contributors(item) item.contributors. @@ -63,7 +103,7 @@ end warn "Exporting #{list.count} items" count = 0 -list.find_each do |item| +list.each do |item| count += 1 warn count begin From 6bbba607a6dc85fdfee90651e87c364d6b48be7b Mon Sep 17 00:00:00 2001 From: "Michael J. Giarlo" Date: Tue, 23 Feb 2021 14:07:37 -0800 Subject: [PATCH 19/24] Export events for items and collections --- bin/export-collections | 12 ++++++++++++ bin/export-items | 11 +++++++++++ 2 files changed, 23 insertions(+) diff --git a/bin/export-collections b/bin/export-collections index e7cb7b41a..da4673e5b 100755 --- a/bin/export-collections +++ b/bin/export-collections @@ -18,6 +18,17 @@ list = collection_druids.any? ? 
Hydrus::Collection.find(collection_druids) : Hydrus::Collection.all +def events_for(coll) + coll.get_hydrus_events.map do |event| + { + who: event.who, # is a bare sunetid string, maps to user_id + when: event.when, # is a UTC timestamp string, maps to created_at/updated_at + text: event.text # is a string, maps to event_type ('update_metadata') and description + # mapping also requires an eventable_type ('Collection') and an eventable_id (the collection ID) + } + end +end + def serialize(coll) creator = coll.events.ng_xml.xpath('//event[text()="Collection created"]/@who').to_s return if coll.admin_policy_object_id == GRAVEYARD_APO @@ -48,6 +59,7 @@ def serialize(coll) depositors: apo.persons_with_role('hydrus-collection-item-depositor') + apo.persons_with_role('hydrus-collection-depositor'), reviewers: apo.persons_with_role('hydrus-collection-reviewer'), related_items: related_items, + events: events_for(coll), created_at: coll.create_date, updated_at: coll.modified_date } diff --git a/bin/export-items b/bin/export-items index 1b77ca333..b39fd1925 100755 --- a/bin/export-items +++ b/bin/export-items @@ -72,6 +72,16 @@ def files(item) end end +def events_for(item) + item.get_hydrus_events.map do |event| + { + who: event.who, # is a bare sunetid string, maps to user_id + when: event.when, # is a UTC timestamp string, maps to created_at/updated_at + text: event.text # is a string, maps to event_type ('update_metadata') and description + # mapping also requires an eventable_type ('Work') and an eventable_id (the work ID) + } + end +end def serialize(item) creator = item.roleMetadata.ng_xml.xpath('//role/person/identifier').text @@ -96,6 +106,7 @@ def serialize(item) contributors: contributors(item), keywords: item.keywords, files: files(item), + events: events_for(item), created_at: item.create_date, updated_at: item.modified_date } From a3fbe51b4c4410e33d6a38181a36251bd1dd89aa Mon Sep 17 00:00:00 2001 From: "Michael J. 
Giarlo" Date: Wed, 24 Feb 2021 15:05:04 -0800 Subject: [PATCH 20/24] Filter out graveyard APO collections in item exporter --- bin/export-items | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/bin/export-items b/bin/export-items index b39fd1925..6b14e2256 100755 --- a/bin/export-items +++ b/bin/export-items @@ -5,6 +5,8 @@ # RAILS_ENV=production bin/export-items > items.jsonl COLLECTION_DRUIDS_LIST = 'collection_druids.txt' +GRAVEYARD_APO = 'druid:kg712km1576' +UNIVERSITY_ARCHIVES_APO = 'druid:yf767bj4831' require_relative '../config/environment' @@ -42,9 +44,15 @@ module ActiveFedora end end -collection_druids = File.exist?(COLLECTION_DRUIDS_LIST) ? - File.read(COLLECTION_DRUIDS_LIST).split.map { |bare_druid| "druid:#{bare_druid}" } : +collection_druids = if File.exist?(COLLECTION_DRUIDS_LIST) + File + .read(COLLECTION_DRUIDS_LIST) + .split + .map { |bare_druid| "druid:#{bare_druid}" } + .reject { |druid| [GRAVEYARD_APO, UNIVERSITY_ARCHIVES_APO].include?(Hydrus::Collection.find(druid).admin_policy_object_id) } + else [] + end list = if collection_druids.any? where_collection_in_list_query = ActiveFedora::SolrService.construct_query_for_rel( From 7d66be1b1dc1c5c6e4fe4c613e35c9a1c3f9ccf9 Mon Sep 17 00:00:00 2001 From: "Michael J. 
Giarlo" Date: Thu, 25 Feb 2021 16:20:09 -0800 Subject: [PATCH 21/24] Filter out items belonging to the graveyard APO when exporting Connects to sul-dlss/happy-heron#1199 --- bin/export-items | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/bin/export-items b/bin/export-items index 6b14e2256..cb1500932 100755 --- a/bin/export-items +++ b/bin/export-items @@ -6,7 +6,6 @@ COLLECTION_DRUIDS_LIST = 'collection_druids.txt' GRAVEYARD_APO = 'druid:kg712km1576' -UNIVERSITY_ARCHIVES_APO = 'druid:yf767bj4831' require_relative '../config/environment' @@ -45,11 +44,7 @@ module ActiveFedora end collection_druids = if File.exist?(COLLECTION_DRUIDS_LIST) - File - .read(COLLECTION_DRUIDS_LIST) - .split - .map { |bare_druid| "druid:#{bare_druid}" } - .reject { |druid| [GRAVEYARD_APO, UNIVERSITY_ARCHIVES_APO].include?(Hydrus::Collection.find(druid).admin_policy_object_id) } + File.read(COLLECTION_DRUIDS_LIST).split.map { |bare_druid| "druid:#{bare_druid}" } else [] end @@ -126,6 +121,10 @@ list.each do |item| count += 1 warn count begin + if item.apo_pid == GRAVEYARD_APO + warn "Skipping #{item.pid} because it belongs to the Graveyard APO" + next + end attributes = serialize(item) puts attributes.compact.to_json rescue => e From 19e58fb23388cb033dbaff996e410766214f4b56 Mon Sep 17 00:00:00 2001 From: "Michael J. Giarlo" Date: Thu, 25 Feb 2021 16:30:55 -0800 Subject: [PATCH 22/24] Filter out items and collections from being exported if they have a catkey Connects to sul-dlss/happy-heron#1199 --- bin/export-collections | 9 ++++++++- bin/export-items | 3 +++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/bin/export-collections b/bin/export-collections index da4673e5b..87e7bd83a 100755 --- a/bin/export-collections +++ b/bin/export-collections @@ -67,4 +67,11 @@ end warn "Exporting #{list.count} collections" -puts list.map { |collection| serialize(collection).to_json }.compact +list.map do |collection| + if collection.catkey.present? 
+ warn "Skipping #{collection.pid} because it has a catkey" + next + end + + puts serialize(collection).to_json +end diff --git a/bin/export-items b/bin/export-items index cb1500932..0abd1bd3a 100755 --- a/bin/export-items +++ b/bin/export-items @@ -124,6 +124,9 @@ list.each do |item| if item.apo_pid == GRAVEYARD_APO warn "Skipping #{item.pid} because it belongs to the Graveyard APO" next + elsif item.catkey.present? + warn "Skipping #{item.pid} because it has a catkey" + next end attributes = serialize(item) puts attributes.compact.to_json From c89d5a6c32b416334da7af9f7a888f3bce77b30f Mon Sep 17 00:00:00 2001 From: "Michael J. Giarlo" Date: Thu, 25 Feb 2021 16:35:49 -0800 Subject: [PATCH 23/24] Export files "hide" bit when exporting items Connects to sul-dlss/happy-heron#1199 --- bin/export-items | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/export-items b/bin/export-items index 0abd1bd3a..a6dbaddbc 100755 --- a/bin/export-items +++ b/bin/export-items @@ -70,7 +70,8 @@ def files(item) map do |object_file| { path: File.realdirpath(object_file.current_path), - label: object_file.label + label: object_file.label, + hide: object_file.hide } end end From e1a872beba69ba2fd1bb5b33d73e44d49edea72c Mon Sep 17 00:00:00 2001 From: Justin Coyne Date: Thu, 3 Jun 2021 10:11:46 -0500 Subject: [PATCH 24/24] Add a list of items to exclude --- bin/export-items | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/bin/export-items b/bin/export-items index a6dbaddbc..909247f46 100755 --- a/bin/export-items +++ b/bin/export-items @@ -7,6 +7,17 @@ COLLECTION_DRUIDS_LIST = 'collection_druids.txt' GRAVEYARD_APO = 'druid:kg712km1576' +# This list from: https://docs.google.com/spreadsheets/d/1Gu0TIUpNByNgNtFDn5MJARUvtsgcNXJAHiKf5s_NsUc/edit#gid=0 +# NOTE: we don't need to worry about the "hydrus object with DOI", because these are not returned in the query +# for Hydrus::Items as they have been converted to Dor::Item. 
+ITEMS_TO_EXCLUDE = %w[ + druid:ty334nd6571 + druid:bx749bs2681 + druid:zc000fq4044 + druid:pr213sh5046 + druid:jr671fk0644 +] + require_relative '../config/environment' # Monkey-patch AF to allow using HTTP POST (for querying items by their collection) @@ -128,6 +139,9 @@ list.each do |item| elsif item.catkey.present? warn "Skipping #{item.pid} because it has a catkey" next + elsif ITEMS_TO_EXCLUDE.include? item.pid + warn "Skipping #{item.pid} because it is on the list of items to exclude" + next end attributes = serialize(item) puts attributes.compact.to_json