From 7bab02c2a0a1a3f1eec6be708440a1b3a972e0b2 Mon Sep 17 00:00:00 2001
From: Justin Coyne
Date: Mon, 14 Dec 2020 13:46:40 -0600
Subject: [PATCH 01/15] Add a script to export collections

---
 bin/export-collections | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)
 create mode 100755 bin/export-collections

diff --git a/bin/export-collections b/bin/export-collections
new file mode 100755
index 000000000..fcf6f37a8
--- /dev/null
+++ b/bin/export-collections
@@ -0,0 +1,28 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+require_relative '../config/environment'
+list = Hydrus::Collection.where(pid: 'druid:vq434wh5503') # Hydrus::Collection.all
+
+def serialize(coll)
+  creator = coll.events.ng_xml.xpath('//event[text()="Collection created"]/@who').to_s
+  apo = coll.apo
+  {
+    druid: coll.id,
+    creator: { sunetid: creator },
+    name: coll.title,
+    visibility_option_value: coll.visibility_option_value,
+    embargo_option: coll.embargo_option,
+    embargo_terms: coll.embargo_terms,
+    requires_human_approval: coll.requires_human_approval,
+    license_option: coll.license_option,
+    object_status: coll.object_status,
+    managers: apo.persons_with_role('hydrus-collection-manager'),
+    depositors: apo.persons_with_role('hydrus-collection-item-depositor') + apo.persons_with_role('hydrus-collection-depositor'),
+    reviewers: apo.persons_with_role('hydrus-collection-reviewer')
+  }
+end
+
+warn "Exporting #{list.count} collections"
+
+list.map { |collection| serialize(collection) }

From 4b62d4ebec964eb563e2e650ceded34e1c90b30c Mon Sep 17 00:00:00 2001
From: Justin Coyne
Date: Mon, 14 Dec 2020 13:55:59 -0600
Subject: [PATCH 02/15] Export all the collections

---
 bin/export-collections | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/bin/export-collections b/bin/export-collections
index fcf6f37a8..4b5004673 100755
--- a/bin/export-collections
+++ b/bin/export-collections
@@ -1,8 +1,11 @@
 #!/usr/bin/env ruby
 # frozen_string_literal: true
 
+# Usage:
+#   RAILS_ENV=production bin/export-collections > collections.jsonl
+
 require_relative '../config/environment'
-list = Hydrus::Collection.where(pid: 'druid:vq434wh5503') # Hydrus::Collection.all
+list = Hydrus::Collection.all
 
 def serialize(coll)
   creator = coll.events.ng_xml.xpath('//event[text()="Collection created"]/@who').to_s
@@ -25,4 +28,4 @@ end
 
 warn "Exporting #{list.count} collections"
 
-list.map { |collection| serialize(collection) }
+puts list.map { |collection| serialize(collection).to_json }

From a1aa81901c0da0c8787ef13a4fc2593fa37136ef Mon Sep 17 00:00:00 2001
From: Justin Coyne
Date: Mon, 14 Dec 2020 14:18:48 -0600
Subject: [PATCH 03/15] Add required columns

---
 bin/export-collections | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/bin/export-collections b/bin/export-collections
index 4b5004673..da29c3d8b 100755
--- a/bin/export-collections
+++ b/bin/export-collections
@@ -14,6 +14,8 @@ def serialize(coll)
     druid: coll.id,
     creator: { sunetid: creator },
     name: coll.title,
+    description: coll.abstract,
+    contact_email: coll.contact,
     visibility_option_value: coll.visibility_option_value,
     embargo_option: coll.embargo_option,
     embargo_terms: coll.embargo_terms,

From 5e4a789ce7ddc31862e672605b91637c11b38515 Mon Sep 17 00:00:00 2001
From: Justin Coyne
Date: Mon, 14 Dec 2020 14:25:38 -0600
Subject: [PATCH 04/15] Export the version

---
 bin/export-collections | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bin/export-collections b/bin/export-collections
index da29c3d8b..6cc4a9fef 100755
--- a/bin/export-collections
+++ b/bin/export-collections
@@ -12,6 +12,7 @@ def serialize(coll)
   apo = coll.apo
   {
     druid: coll.id,
+    version: coll.current_version.to_i,
     creator: { sunetid: creator },
     name: coll.title,
     description: coll.abstract,

From 501c69df38ffe490674c8848ad059d9dfe6518f8 Mon Sep 17 00:00:00 2001
From: Justin Coyne
Date: Mon, 14 Dec 2020 15:03:18 -0600
Subject: [PATCH 05/15] Warn on invalid APOs

---
 bin/export-collections | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/bin/export-collections b/bin/export-collections
index 6cc4a9fef..ee7646be4 100755
--- a/bin/export-collections
+++ b/bin/export-collections
@@ -10,6 +10,12 @@ list = Hydrus::Collection.all
 def serialize(coll)
   creator = coll.events.ng_xml.xpath('//event[text()="Collection created"]/@who').to_s
   apo = coll.apo
+
+  unless apo.class == Hydrus::AdminPolicyObject
+    warn "The APO (#{apo.id}) for collection #{coll.id} is not a Hydrus::APO"
+    return {}
+  end
+
   {
     druid: coll.id,
     version: coll.current_version.to_i,

From 76f1f3e7c44dcc4e67f0ee77d51fea69b7bd8f18 Mon Sep 17 00:00:00 2001
From: Justin Coyne
Date: Mon, 14 Dec 2020 16:32:08 -0600
Subject: [PATCH 06/15] Add related items to the export list

---
Review note (ignored by git am): serialize returns nil for skipped
collections and nil.to_json is the string "null", so the final line now
uses `&.to_json` to let `.compact` actually drop skipped collections.
The `collection.id` / "Univrsity" line below is intentionally left as
committed because patch 07 removes that exact line.

 bin/export-collections | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/bin/export-collections b/bin/export-collections
index ee7646be4..7df9686d6 100755
--- a/bin/export-collections
+++ b/bin/export-collections
@@ -4,18 +4,23 @@
 # Usage:
 #   RAILS_ENV=production bin/export-collections > collections.jsonl
 
+GRAVEYARD_APO = 'druid:kg712km1576'
+UNIVERSITY_ARCHIVES_APO = 'druid:yf767bj4831'
+
 require_relative '../config/environment'
 list = Hydrus::Collection.all
 
 def serialize(coll)
   creator = coll.events.ng_xml.xpath('//event[text()="Collection created"]/@who').to_s
-  apo = coll.apo
-
-  unless apo.class == Hydrus::AdminPolicyObject
-    warn "The APO (#{apo.id}) for collection #{coll.id} is not a Hydrus::APO"
-    return {}
+  return if coll.apo_id == GRAVEYARD_APO
+  if coll.apo_id == UNIVERSITY_ARCHIVES_APO
+    warn "Collection #{collection.id} is in the Univrsity Archives APO, which is not a Hydrus::APO"
+    return
   end
 
+  apo = coll.apo
+  related_items = coll.related_items.map { |rel| { link_title: rel.title, url: rel.url } }
+
   {
     druid: coll.id,
     version: coll.current_version.to_i,
@@ -31,10 +36,11 @@ def serialize(coll)
     object_status: coll.object_status,
     managers: apo.persons_with_role('hydrus-collection-manager'),
     depositors: apo.persons_with_role('hydrus-collection-item-depositor') + apo.persons_with_role('hydrus-collection-depositor'),
-    reviewers: apo.persons_with_role('hydrus-collection-reviewer')
+    reviewers: apo.persons_with_role('hydrus-collection-reviewer'),
+    related_items: related_items
   }
 end
 
 warn "Exporting #{list.count} collections"
 
-puts list.map { |collection| serialize(collection).to_json }
+puts list.map { |collection| serialize(collection)&.to_json }.compact

From f95baa7d73dfffd040f5030af98833aee4428acd Mon Sep 17 00:00:00 2001
From: Justin Coyne
Date: Mon, 14 Dec 2020 16:58:39 -0600
Subject: [PATCH 07/15] Fix apo identifier method

---
 bin/export-collections | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/bin/export-collections b/bin/export-collections
index 7df9686d6..20de76ac7 100755
--- a/bin/export-collections
+++ b/bin/export-collections
@@ -12,9 +12,9 @@ list = Hydrus::Collection.all
 
 def serialize(coll)
   creator = coll.events.ng_xml.xpath('//event[text()="Collection created"]/@who').to_s
-  return if coll.apo_id == GRAVEYARD_APO
-  if coll.apo_id == UNIVERSITY_ARCHIVES_APO
-    warn "Collection #{collection.id} is in the Univrsity Archives APO, which is not a Hydrus::APO"
+  return if coll.admin_policy_object_id == GRAVEYARD_APO
+  if coll.admin_policy_object_id == UNIVERSITY_ARCHIVES_APO
+    warn "Collection #{coll.id} is in the University Archives APO, which is not a Hydrus::APO"
     return
   end
 

From 7a3a3588dcbc5bf17c4a32e0a998ddae01780bb2 Mon Sep 17 00:00:00 2001
From: Justin Coyne
Date: Tue, 15 Dec 2020 08:29:37 -0600
Subject: [PATCH 08/15] Don't export empty related items

---
 bin/export-collections | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/export-collections b/bin/export-collections
index 20de76ac7..1f99ed8ff 100755
--- a/bin/export-collections
+++ b/bin/export-collections
@@ -19,7 +19,7 @@ def serialize(coll)
   end
 
   apo = coll.apo
-  related_items = coll.related_items.map { |rel| { link_title: rel.title, url: rel.url } }
+  related_items = coll.related_items.map { |rel| { link_title: rel.title.presence, url: rel.url.presence }.compact }.reject(&:blank?)
 
   {
     druid: coll.id,

From a5c2fb5bfd725184d737a703ebcb0fb9571870ea Mon Sep 17 00:00:00 2001
From: Justin Coyne
Date: Tue, 15 Dec 2020 09:46:55 -0600
Subject: [PATCH 09/15] Export more collection fields

---
 bin/export-collections | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/bin/export-collections b/bin/export-collections
index 1f99ed8ff..e7599a163 100755
--- a/bin/export-collections
+++ b/bin/export-collections
@@ -28,11 +28,13 @@ def serialize(coll)
     name: coll.title,
     description: coll.abstract,
     contact_email: coll.contact,
-    visibility_option_value: coll.visibility_option_value,
+    visibility_option: coll.visibility_option,
+    visibility: coll.visibility,
     embargo_option: coll.embargo_option,
     embargo_terms: coll.embargo_terms,
     requires_human_approval: coll.requires_human_approval,
     license_option: coll.license_option,
+    license: coll.license,
     object_status: coll.object_status,
     managers: apo.persons_with_role('hydrus-collection-manager'),
     depositors: apo.persons_with_role('hydrus-collection-item-depositor') + apo.persons_with_role('hydrus-collection-depositor'),

From 6ae6cef13a3e9c67d1a6978ce9ebdac1a2f1e4da Mon Sep 17 00:00:00 2001
From: Justin Coyne
Date: Tue, 15 Dec 2020 13:19:26 -0600
Subject: [PATCH 10/15] Add a script for exporting items

---
 bin/export-items | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100755 bin/export-items

diff --git a/bin/export-items b/bin/export-items
new file mode 100755
index 000000000..54bf8aa0d
--- /dev/null
+++ b/bin/export-items
@@ -0,0 +1,35 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+# Usage:
+#   RAILS_ENV=production bin/export-items > items.jsonl
+
+require_relative '../config/environment'
+list = Hydrus::Item.all
+
+def serialize(item)
+  creator = item.roleMetadata.ng_xml.xpath('//role/person/identifier').text
+  related_items = item.related_items.map { |rel| { link_title: rel.title.presence, url: rel.url.presence }.compact }.reject(&:blank?)
+
+  {
+    druid: item.id,
+    version: item.current_version.to_i,
+    creator: { sunetid: creator },
+    title: item.title,
+    abstract: item.abstract,
+    contact_email: item.contact,
+    collection: item.collection_id,
+    visibility: item.visibility.first,
+    license: item.license,
+    embargo_release_date: item.rmd_embargo_release_date,
+    date_created: item.date_created,
+    object_status: item.object_status,
+    item_type: item.item_type,
+    citation: item.preferred_citation,
+    related_items: related_items
+  }
+end
+
+warn "Exporting #{list.count} items"
+
+puts list.map { |item| serialize(item).to_json }.compact

From f8f5d66dca5605e16486f5f1690c9657c27bb510 Mon Sep 17 00:00:00 2001
From: Justin Coyne
Date: Tue, 15 Dec 2020 14:56:18 -0600
Subject: [PATCH 11/15] Stream output and handle errors

---
 bin/export-items | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/bin/export-items b/bin/export-items
index 54bf8aa0d..86780804d 100755
--- a/bin/export-items
+++ b/bin/export-items
@@ -31,5 +31,14 @@ def serialize(item)
 end
 
 warn "Exporting #{list.count} items"
-
-puts list.map { |item| serialize(item).to_json }.compact
+count = 0
+list.find_each do |item|
+  count += 1
+  warn count
+  begin
+    attributes = serialize(item)
+    puts attributes.compact.to_json
+  rescue => e
+    warn "Error with #{item.pid}. #{e.message}"
+  end
+end

From 6ed567b9a7000f1f84ddd952fbff72e788be3660 Mon Sep 17 00:00:00 2001
From: Justin Coyne
Date: Tue, 15 Dec 2020 16:45:52 -0600
Subject: [PATCH 12/15] Export contributors

---
 bin/export-items | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/bin/export-items b/bin/export-items
index 86780804d..dfbf72ec0 100755
--- a/bin/export-items
+++ b/bin/export-items
@@ -10,6 +10,9 @@ list = Hydrus::Item.all
 def serialize(item)
   creator = item.roleMetadata.ng_xml.xpath('//role/person/identifier').text
   related_items = item.related_items.map { |rel| { link_title: rel.title.presence, url: rel.url.presence }.compact }.reject(&:blank?)
+  contributors = item.contributors.
+    map { |contrib| { full_name: contrib.name, role: contrib.role, name_type: contrib.name_type }}.
+    reject { |contrib| contrib[:full_name].blank? }
 
   {
     druid: item.id,
@@ -26,7 +29,8 @@ def serialize(item)
     object_status: item.object_status,
     item_type: item.item_type,
     citation: item.preferred_citation,
-    related_items: related_items
+    related_items: related_items,
+    contributors: contributors
   }
 end
 

From e4a69e20325983de6a54cd9857a81099467e1a52 Mon Sep 17 00:00:00 2001
From: Justin Coyne
Date: Wed, 16 Dec 2020 12:57:37 -0600
Subject: [PATCH 13/15] Export timestamps and files

---
Review note (ignored by git am): in bin/export-collections the
timestamps are read from `coll`, the parameter of serialize(coll).
The committed version referenced `item`, which is undefined in that
script and raised NameError for every exported collection.

 bin/export-collections |  4 +++-
 bin/export-items       | 26 +++++++++++++++++++++-----
 2 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/bin/export-collections b/bin/export-collections
index e7599a163..dc8fb64ae 100755
--- a/bin/export-collections
+++ b/bin/export-collections
@@ -39,7 +39,9 @@ def serialize(coll)
     managers: apo.persons_with_role('hydrus-collection-manager'),
     depositors: apo.persons_with_role('hydrus-collection-item-depositor') + apo.persons_with_role('hydrus-collection-depositor'),
     reviewers: apo.persons_with_role('hydrus-collection-reviewer'),
-    related_items: related_items
+    related_items: related_items,
+    created_at: coll.create_date,
+    updated_at: coll.modified_date
   }
 end
 
diff --git a/bin/export-items b/bin/export-items
index dfbf72ec0..47d20a8d3 100755
--- a/bin/export-items
+++ b/bin/export-items
@@ -7,13 +7,26 @@
 require_relative '../config/environment'
 list = Hydrus::Item.all
 
+def contributors(item)
+  item.contributors.
+    map { |contrib| { full_name: contrib.name, role: contrib.role, name_type: contrib.name_type }}.
+    reject { |contrib| contrib[:full_name].blank? }
+end
+
+def files(item)
+  item.files.
+    map do |object_file|
+      {
+        path: File.realdirpath(object_file.current_path),
+        label: object_file.label
+      }
+    end
+end
+
+
 def serialize(item)
   creator = item.roleMetadata.ng_xml.xpath('//role/person/identifier').text
   related_items = item.related_items.map { |rel| { link_title: rel.title.presence, url: rel.url.presence }.compact }.reject(&:blank?)
-  contributors = item.contributors.
-    map { |contrib| { full_name: contrib.name, role: contrib.role, name_type: contrib.name_type }}.
-    reject { |contrib| contrib[:full_name].blank? }
-
   {
     druid: item.id,
     version: item.current_version.to_i,
@@ -30,7 +43,10 @@ def serialize(item)
     item_type: item.item_type,
     citation: item.preferred_citation,
     related_items: related_items,
-    contributors: contributors
+    contributors: contributors(item),
+    files: files(item),
+    created_at: item.create_date,
+    updated_at: item.modified_date
   }
 end
 

From 1abba2a629e2216ec21e60885eec5f2411193f6f Mon Sep 17 00:00:00 2001
From: Justin Coyne
Date: Wed, 16 Dec 2020 15:21:57 -0600
Subject: [PATCH 14/15] Export keywords

---
 bin/export-items | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bin/export-items b/bin/export-items
index 47d20a8d3..b7f4fe6c8 100755
--- a/bin/export-items
+++ b/bin/export-items
@@ -44,6 +44,7 @@ def serialize(item)
     citation: item.preferred_citation,
     related_items: related_items,
     contributors: contributors(item),
+    keywords: item.keywords,
     files: files(item),
     created_at: item.create_date,
     updated_at: item.modified_date

From 5e1f0570d8279254b129eb0d3b454835d7f1e0c9 Mon Sep 17 00:00:00 2001
From: Justin Coyne
Date: Wed, 16 Dec 2020 15:27:48 -0600
Subject: [PATCH 15/15] Export related citations

---
 bin/export-items | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bin/export-items b/bin/export-items
index b7f4fe6c8..32d6b1e4b 100755
--- a/bin/export-items
+++ b/bin/export-items
@@ -43,6 +43,7 @@ def serialize(item)
     item_type: item.item_type,
     citation: item.preferred_citation,
     related_items: related_items,
+    related_citations: item.related_citation,
     contributors: contributors(item),
     keywords: item.keywords,
     files: files(item),