From 63941eea39c2d18cc18fdb0709f4cc733f1fadc6 Mon Sep 17 00:00:00 2001 From: Jessica Dussault Date: Thu, 6 Feb 2020 15:32:48 -0600 Subject: [PATCH 01/15] updates rake gem fairly aggressively --- Gemfile.lock | 4 ++-- datura.gemspec | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index f6371ab75..1dac847f9 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -22,7 +22,7 @@ GEM netrc (0.11.0) nokogiri (1.10.7) mini_portile2 (~> 2.4.0) - rake (10.5.0) + rake (13.0.1) rest-client (2.0.2) http-cookie (>= 1.0.2, < 2.0) mime-types (>= 1.16, < 4.0) @@ -38,7 +38,7 @@ DEPENDENCIES bundler (>= 1.16.0, < 3.0) datura! minitest (~> 5.0) - rake (~> 10.0) + rake (~> 13.0) BUNDLED WITH 2.1.4 diff --git a/datura.gemspec b/datura.gemspec index 1735714ea..ef85aa47d 100644 --- a/datura.gemspec +++ b/datura.gemspec @@ -59,5 +59,5 @@ Gem::Specification.new do |spec| spec.add_runtime_dependency "rest-client", "~> 2.0.2" spec.add_development_dependency "bundler", ">= 1.16.0", "< 3.0" spec.add_development_dependency "minitest", "~> 5.0" - spec.add_development_dependency "rake", "~> 10.0" + spec.add_development_dependency "rake", "~> 13.0" end From c238002d980b330ee9f5f243433241b3470f7ca2 Mon Sep 17 00:00:00 2001 From: Jessica Dussault Date: Thu, 6 Feb 2020 15:39:12 -0600 Subject: [PATCH 02/15] moving methods out of common_xml for greater use originally they were in that class because it was built in imitation of the widely used CDRH-built XSLT collection of methods but it's time for change moved to "helpers" to make them more accessible find replace across repository (not test suite) a little misc cleanup as well, but more to come --- lib/datura/common_xml.rb | 60 ++++++++------------------- lib/datura/helpers.rb | 57 +++++++++++++++++++++++++ lib/datura/to_es/html_to_es/fields.rb | 19 ++++----- lib/datura/to_es/tei_to_es/fields.rb | 33 +++++++-------- lib/datura/to_es/vra_to_es/fields.rb | 33 +++++++-------- lib/datura/to_es/webs_to_es/fields.rb 
| 4 +- lib/datura/to_es/xml_to_es.rb | 2 +- 7 files changed, 116 insertions(+), 92 deletions(-) diff --git a/lib/datura/common_xml.rb b/lib/datura/common_xml.rb index 02a931508..163be72ad 100644 --- a/lib/datura/common_xml.rb +++ b/lib/datura/common_xml.rb @@ -39,59 +39,24 @@ def self.create_xml_object(filepath, remove_ns=true) return file_xml end - # pass in a date and identify whether it should be before or after - # in order to fill in dates (ex: 2014 => 2014-12-31) - + # deprecated method def self.date_display(date, nd_text="N.D.") - date_hyphen = CommonXml.date_standardize(date) - if date_hyphen - y, m, d = date_hyphen.split("-").map { |s| s.to_i } - date_obj = Date.new(y, m, d) - return date_obj.strftime("%B %-d, %Y") - else - return nd_text - end + Datura::Helpers.date_display(date, nd_text) end - # automatically defaults to setting incomplete dates to the earliest - # date (2016-07 becomes 2016-07-01) but pass in "false" in order - # to set it to the latest available date + # deprecated method def self.date_standardize(date, before=true) - return_date = nil - if date - y, m, d = date.split(/-|\//) - if y && y.length == 4 - # use -1 to indicate that this will be the last possible - m_default = before ? "01" : "-1" - d_default = before ? 
"01" : "-1" - m = m_default if !m - d = d_default if !d - # TODO clean this up because man it sucks - if Date.valid_date?(y.to_i, m.to_i, d.to_i) - date = Date.new(y.to_i, m.to_i, d.to_i) - month = date.month.to_s.rjust(2, "0") - day = date.day.to_s.rjust(2, "0") - return_date = "#{date.year}-#{month}-#{day}" - end - end - end - return_date + Datura::Helpers.date_standardize(date, before) end + # deprecated method def self.normalize_name(abnormal) - # put in lower case - # remove starting a, an, or the - down = abnormal.downcase - down.gsub(/^the |^a |^an /, "") + Datura::Helpers.normalize_name(abnormal) end - # imitates xslt fn:normalize-space - # removes leading / trailing whitespace, newlines, repeating whitespace, etc + # deprecated method def self.normalize_space(abnormal) - if abnormal - normal = abnormal.strip.gsub(/\s+/, " ") - end - normal || abnormal + Datura::Helpers.normalize_space(abnormal) end # saxon accepts params in following manner @@ -116,4 +81,13 @@ def self.to_display_text(aXml) CommonXml.sub_corrections(aXml).text end + # TODO remove in 2021 + class << self + extend Gem::Deprecate + deprecate :date_display, :"Datura::Helpers.normalize_space", 2021, 1 + deprecate :date_standardize, :"Datura::Helpers.normalize_space", 2021, 1 + deprecate :normalize_name, :"Datura::Helpers.normalize_space", 2021, 1 + deprecate :normalize_space, :"Datura::Helpers.normalize_space", 2021, 1 + end + end diff --git a/lib/datura/helpers.rb b/lib/datura/helpers.rb index 1b56fb33f..2ed51363a 100644 --- a/lib/datura/helpers.rb +++ b/lib/datura/helpers.rb @@ -5,6 +5,46 @@ module Datura::Helpers + # date_display + # pass in a date and identify whether it should be before or after + # in order to fill in dates (ex: 2014 => 2014-12-31) + def self.date_display(date, nd_text="N.D.") + date_hyphen = self.date_standardize(date) + if date_hyphen + y, m, d = date_hyphen.split("-").map { |s| s.to_i } + date_obj = Date.new(y, m, d) + date_obj.strftime("%B %-d, %Y") + else + nd_text + 
end + end + + # date_standardize + # automatically defaults to setting incomplete dates to the earliest + # date (2016-07 becomes 2016-07-01) but pass in "false" in order + # to set it to the latest available date + def self.date_standardize(date, before=true) + return_date = nil + if date + y, m, d = date.split(/-|\//) + if y && y.length == 4 + # use -1 to indicate that this will be the last possible + m_default = before ? "01" : "-1" + d_default = before ? "01" : "-1" + m = m_default if !m + d = d_default if !d + # TODO clean this up because man it sucks + if Date.valid_date?(y.to_i, m.to_i, d.to_i) + date = Date.new(y.to_i, m.to_i, d.to_i) + month = date.month.to_s.rjust(2, "0") + day = date.day.to_s.rjust(2, "0") + return_date = "#{date.year}-#{month}-#{day}" + end + end + end + return_date + end + # get_directory_files # Note: do not end with / # params: directory (string) @@ -55,6 +95,23 @@ def self.make_dirs(*args) FileUtils.mkdir_p(args) end + # normalize_name + # lowercase and remove articles from front + def self.normalize_name(abnormal) + down = abnormal.downcase + down.gsub(/^the |^a |^an /, "") + end + + # normalize_space + # imitates xslt fn:normalize-space + # removes leading / trailing whitespace, newlines, repeating whitespace, etc + def self.normalize_space(abnormal) + if abnormal + normal = abnormal.strip.gsub(/\s+/, " ") + end + normal || abnormal + end + # regex_files # looks through a directory's files for those matching the regex # params: files (array of file names), regex (regular expression) diff --git a/lib/datura/to_es/html_to_es/fields.rb b/lib/datura/to_es/html_to_es/fields.rb index 9852b5c4f..94c35db68 100644 --- a/lib/datura/to_es/html_to_es/fields.rb +++ b/lib/datura/to_es/html_to_es/fields.rb @@ -137,20 +137,18 @@ def subcategory def text # handling separate fields in array # means no worrying about handling spacing between words - text = [] + text_all = [] body = get_text(@xpaths["text"], false) - text << body - text += 
text_additional - return CommonXml.normalize_space(text.join(" ")) + text_all << body + text_all += text_additional + Datura::Helpers.normalize_space(text_all.join(" ")) end def text_additional # Note: Override this per collection if you need additional # searchable fields or information for collections # just make sure you return an array at the end! - - text = [] - text << title + [ title ] end def title @@ -158,8 +156,7 @@ def title end def title_sort - t = title - CommonXml.normalize_name(t) + Datura::Helpers.normalize_name(title) end def topics @@ -172,9 +169,7 @@ def uri end def uri_data - base = @options["data_base"] - subpath = "data/#{@options["collection"]}/tei" - "#{base}/#{subpath}/#{@id}.xml" + # TODO per repository end def uri_html diff --git a/lib/datura/to_es/tei_to_es/fields.rb b/lib/datura/to_es/tei_to_es/fields.rb index 6c80601d2..ed727bdb5 100644 --- a/lib/datura/to_es/tei_to_es/fields.rb +++ b/lib/datura/to_es/tei_to_es/fields.rb @@ -21,13 +21,13 @@ def annotations_text def category category = get_text(@xpaths["category"]) - return category.length > 0 ? CommonXml.normalize_space(category) : "none" + return category.length > 0 ? 
Datura::Helpers.normalize_space(category) : "none" end # note this does not sort the creators def creator creators = get_list(@xpaths["creators"]) - return creators.map { |creator| { "name" => CommonXml.normalize_space(creator) } } + return creators.map { |creator| { "name" => Datura::Helpers.normalize_space(creator) } } end # returns ; delineated string of alphabetized creators @@ -50,8 +50,8 @@ def contributor eles.each do |ele| contribs << { "id" => ele["id"], - "name" => CommonXml.normalize_space(ele.text), - "role" => CommonXml.normalize_space(ele["role"]) + "name" => Datura::Helpers.normalize_space(ele.text), + "role" => Datura::Helpers.normalize_space(ele["role"]) } end end @@ -64,7 +64,7 @@ def data_type def date(before=true) datestr = get_text(@xpaths["date"]) - return CommonXml.date_standardize(datestr, before) + return Datura::Helpers.date_standardize(datestr, before) end def date_display @@ -124,15 +124,15 @@ def person people = eles.map do |p| { "id" => "", - "name" => CommonXml.normalize_space(p.text), - "role" => CommonXml.normalize_space(p["role"]) + "name" => Datura::Helpers.normalize_space(p.text), + "role" => Datura::Helpers.normalize_space(p["role"]) } end return people end def people - @json["person"].map { |p| CommonXml.normalize_space(p["name"]) } + @json["person"].map { |p| Datura::Helpers.normalize_space(p["name"]) } end def places @@ -148,7 +148,7 @@ def recipient people = eles.map do |p| { "id" => "", - "name" => CommonXml.normalize_space(p.text), + "name" => Datura::Helpers.normalize_space(p.text), "role" => "recipient" } end @@ -186,13 +186,13 @@ def subcategory def text # handling separate fields in array # means no worrying about handling spacing between words - text = [] + text_all = [] body = get_text(@xpaths["text"], false) - text << body + text_all << body # TODO: do we need to preserve tags like in text? 
if so, turn get_text to true - # text << CommonXml.convert_tags_in_string(body) - text += text_additional - return CommonXml.normalize_space(text.join(" ")) + # text_all << CommonXml.convert_tags_in_string(body) + text_all += text_additional + Datura::Helpers.normalize_space(text_all.join(" ")) end def text_additional @@ -200,8 +200,7 @@ def text_additional # searchable fields or information for collections # just make sure you return an array at the end! - text = [] - text << title + [ title ] end def title @@ -214,7 +213,7 @@ def title def title_sort t = title - CommonXml.normalize_name(t) + Datura::Helpers.normalize_name(t) end def topics diff --git a/lib/datura/to_es/vra_to_es/fields.rb b/lib/datura/to_es/vra_to_es/fields.rb index 2ca37a155..11a45ba61 100644 --- a/lib/datura/to_es/vra_to_es/fields.rb +++ b/lib/datura/to_es/vra_to_es/fields.rb @@ -26,7 +26,7 @@ def category # note this does not sort the creators def creator creators = get_list(@xpaths["creators"]) - return creators.map { |creator| { "name" => CommonXml.normalize_space(creator) } } + return creators.map { |creator| { "name" => Datura::Helpers.normalize_space(creator) } } end # returns ; delineated string of alphabetized creators @@ -48,8 +48,8 @@ def contributor contributors.each do |ele| contrib_list << { "id" => "", - "name" => CommonXml.normalize_space(ele.xpath("name").text), - "role" => CommonXml.normalize_space(ele.xpath("role").text) + "name" => Datura::Helpers.normalize_space(ele.xpath("name").text), + "role" => Datura::Helpers.normalize_space(ele.xpath("role").text) } end return contrib_list @@ -61,7 +61,7 @@ def data_type def date(before=true) datestr = get_text(@xpaths["dates"]["earliest"]) - CommonXml.date_standardize(datestr, before) + Datura::Helpers.date_standardize(datestr, before) end def date_display @@ -115,14 +115,14 @@ def person return eles.map do |p| { "id" => "", - "name" => CommonXml.normalize_space(p.text), - "role" => CommonXml.normalize_space(p["role"]) + "name" => 
Datura::Helpers.normalize_space(p.text), + "role" => Datura::Helpers.normalize_space(p["role"]) } end end def people - @json["person"].map { |p| CommonXml.normalize_space(p["name"]) } + @json["person"].map { |p| Datura::Helpers.normalize_space(p["name"]) } end def places @@ -138,8 +138,8 @@ def recipient people = eles.map do |p| { "id" => "", - "name" => CommonXml.normalize_space(p.text), - "role" => CommonXml.normalize_space(p["role"]), + "name" => Datura::Helpers.normalize_space(p.text), + "role" => Datura::Helpers.normalize_space(p["role"]), } end return people @@ -175,12 +175,12 @@ def subjects def text # handling separate fields in array # means no worrying about handling spacing between words - text = [] - text << get_text(@xpaths["text"], false) + text_all = [] + text_all << get_text(@xpaths["text"], false) # TODO: do we need to preserve tags like in text? if so, turn get_text to true - # text << CommonXml.convert_tags_in_string(body) - text += text_additional - return CommonXml.normalize_space(text.join(" ")) + # text_all << CommonXml.convert_tags_in_string(body) + text_all += text_additional + Datura::Helpers.normalize_space(text_all.join(" ")) end def text_additional @@ -188,8 +188,7 @@ def text_additional # searchable fields or information for collections # just make sure you return an array at the end! 
- text = [] - text << title + [ title ] end def title @@ -198,7 +197,7 @@ def title def title_sort t = title - CommonXml.normalize_name(t) + Datura::Helpers.normalize_name(t) end def topics diff --git a/lib/datura/to_es/webs_to_es/fields.rb b/lib/datura/to_es/webs_to_es/fields.rb index 88a0cf760..aff7d27eb 100644 --- a/lib/datura/to_es/webs_to_es/fields.rb +++ b/lib/datura/to_es/webs_to_es/fields.rb @@ -149,7 +149,7 @@ def text body = get_text(@xpaths["text"], false) text << body text += text_additional - return CommonXml.normalize_space(text.join(" ")) + return Datura::Helpers.normalize_space(text.join(" ")) end def text_additional @@ -167,7 +167,7 @@ def title def title_sort t = title - CommonXml.normalize_name(t) + Datura::Helpers.normalize_name(t) end def topics diff --git a/lib/datura/to_es/xml_to_es.rb b/lib/datura/to_es/xml_to_es.rb index 38aec2bc9..4a634dd61 100644 --- a/lib/datura/to_es/xml_to_es.rb +++ b/lib/datura/to_es/xml_to_es.rb @@ -111,7 +111,7 @@ def get_xpaths(xpaths, keep_tags=false, xml=nil) text = CommonXml.to_display_text(content) end # remove whitespace of all kinds from the text - text = CommonXml.normalize_space(text) + text = Datura::Helpers.normalize_space(text) if text.length > 0 list << text end From d336b2a71c05ce8d34041a0b2ad108e5b09bdedd Mon Sep 17 00:00:00 2001 From: Jessica Dussault Date: Thu, 6 Feb 2020 15:39:37 -0600 Subject: [PATCH 03/15] first crack at CsvToEs class and hookup --- lib/datura/file_types/file_csv.rb | 18 +-- lib/datura/requirer.rb | 2 + lib/datura/to_es/csv_to_es.rb | 54 ++++++++ lib/datura/to_es/csv_to_es/fields.rb | 187 ++++++++++++++++++++++++++ lib/datura/to_es/csv_to_es/request.rb | 122 +++++++++++++++++ 5 files changed, 372 insertions(+), 11 deletions(-) create mode 100644 lib/datura/to_es/csv_to_es.rb create mode 100644 lib/datura/to_es/csv_to_es/fields.rb create mode 100644 lib/datura/to_es/csv_to_es/request.rb diff --git a/lib/datura/file_types/file_csv.rb b/lib/datura/file_types/file_csv.rb index 
e7c01f2a8..57f8efd71 100644 --- a/lib/datura/file_types/file_csv.rb +++ b/lib/datura/file_types/file_csv.rb @@ -41,18 +41,14 @@ def read_csv(file_location, encoding="utf-8") }) end - # most basic implementation assumes column header is the es field name - # operates with no logic on the fields - # YOU MUST OVERRIDE FOR CSVS WHICH DO NOT HAVE BESPOKE HEADINGS FOR API + # NOTE previously this blindly took column headings and tried + # to send them to Elasticsearch, but this will make a mess of + # our index mapping, so instead prefer to only push specific fields + # leaving "headers" in method arguments for backwards compatibility + # + # override as necessary per project def row_to_es(headers, row) - doc = {} - headers.each do |column| - doc[column] = row[column] if row[column] - end - if doc.key?("text") && doc.key?("title") - doc["text"] << " #{doc["title"]}" - end - doc + CsvToEs.new(row, options, @csv, self.filename(false)).json end # most basic implementation assumes column header is the solr field name diff --git a/lib/datura/requirer.rb b/lib/datura/requirer.rb index b50190822..48eed7149 100644 --- a/lib/datura/requirer.rb +++ b/lib/datura/requirer.rb @@ -5,6 +5,8 @@ current_dir = File.expand_path(File.dirname(__FILE__)) +require_relative "to_es/csv_to_es.rb" + require_relative "to_es/html_to_es.rb" require_relative "to_es/tei_to_es.rb" diff --git a/lib/datura/to_es/csv_to_es.rb b/lib/datura/to_es/csv_to_es.rb new file mode 100644 index 000000000..cf92fec0f --- /dev/null +++ b/lib/datura/to_es/csv_to_es.rb @@ -0,0 +1,54 @@ +require_relative "../helpers.rb" +require_relative "csv_to_es/fields.rb" +require_relative "csv_to_es/request.rb" + +######################################### +# NOTE: DO NOT EDIT THIS FILE!!!!!!!!! 
# +######################################### +# (unless you are a CDRH dev and then you may do so very cautiously) +# this file provides defaults for ALL of the collections included +# in the API and changing it could alter dozens of sites unexpectedly! +# PLEASE RUN LOADS OF TESTS AFTER A CHANGE BEFORE PUSHING TO PRODUCTION + +# WHAT IS THIS FILE? +# This file sets up default behavior for transforming CSV +# documents to Elasticsearch JSON documents + +class CsvToEs + + attr_reader :json, :row, :csv + # variables + # id, row, csv, options + + def initialize(row, options={}, csv=nil, filename=nil) + @row = row + @options = options + @csv = csv + @filename = filename + @id = get_id + + create_json + end + + # getter for @json response object + def create_json + @json = {} + # if anything needs to be done before processing + # do it here (ex: reading in annotations into memory) + preprocessing + assemble_json + postprocessing + end + + def get_id + @row["id"] || @row["identifier"] || nil + end + + def preprocessing + # copy this in your csv_to_es collection file to customize + end + + def postprocessing + # copy this in your csv_to_es collection file to customize + end +end diff --git a/lib/datura/to_es/csv_to_es/fields.rb b/lib/datura/to_es/csv_to_es/fields.rb new file mode 100644 index 000000000..96e26db2e --- /dev/null +++ b/lib/datura/to_es/csv_to_es/fields.rb @@ -0,0 +1,187 @@ +class CsvToEs + # Note to add custom fields, use "assemble_collection_specific" from request.rb + # and be sure to either use the _d, _i, _k, or _t to use the correct field type + + ########## + # FIELDS # + ########## + def id + @id + end + + def id_dc + "https://cdrhapi.unl.edu/doc/#{@id}" + end + + def annotations_text + # TODO what should default behavior be? 
+ end + + def category + @row["category"] + end + + # nested field + def creator + # TODO + end + + # returns ; delineated string of alphabetized creators + def creator_sort + # TODO + end + + def collection + @options["collection"] + end + + def collection_desc + @options["collection_desc"] || @options["collection"] + end + + def contributor + # TODO + end + + def data_type + "csv" + end + + def date(before=true) + Datura::Helpers.date_standardize(@row["date"], before) + end + + def date_display + Datura::Helpers.date_display(date) + end + + def date_not_after + date(false) + end + + def date_not_before + date(true) + end + + def description + # Note: override per collection as needed + end + + def format + @row["format"] + end + + def image_id + # TODO + end + + def keywords + # TODO + end + + def language + # TODO + end + + def languages + # TODO + end + + def medium + # Default behavior is the same as "format" method + format + end + + def person + # TODO + end + + def people + # TODO + end + + def places + # TODO + end + + def publisher + # TODO + end + + def recipient + # TODO + end + + def rights + # Note: override by collection as needed + "All Rights Reserved" + end + + def rights_holder + # TODO + end + + def rights_uri + # TODO + end + + def source + @row["source"] + end + + def subjects + # TODO + end + + def subcategory + @row["subcategory"] + end + + # text is generally going to be pulled from + def text + text_all = [ @row["text"] ] + + text_all += text_additional + text_all = text_all.compact + Datura::Helpers.normalize_space(text_all.join(" ")) + end + + # override and add by collection as needed + def text_additional + [ title ] + end + + def title + @row["title"] + end + + def title_sort + Datura::Helpers.normalize_name(title) if title + end + + def topics + @row["topics"] + end + + def uri + # override per collection + # should point at the live website view of resource + end + + def uri_data + base = @options["data_base"] + subpath = 
"data/#{@options["collection"]}/source/csv" + "#{base}/#{subpath}/#{@filename}.csv" + end + + def uri_html + base = @options["data_base"] + subpath = "data/#{@options["collection"]}/output/#{@options["environment"]}/html" + "#{base}/#{subpath}/#{@id}.html" + end + + def works + @row["works"] + end + +end diff --git a/lib/datura/to_es/csv_to_es/request.rb b/lib/datura/to_es/csv_to_es/request.rb new file mode 100644 index 000000000..c047c879e --- /dev/null +++ b/lib/datura/to_es/csv_to_es/request.rb @@ -0,0 +1,122 @@ +# request creation portion of CSV to ES transformation +class CsvToEs + + def assemble_json + # Note: if your collection does not require a specific field + # it may be better to override the field's behavior to return + # nil than to alter the below field list methods + # values being sent, because otherwise they could just override + # the field behavior to be blank + + # below not alphabetical to reflect their position + # in the cdrh api schema + assemble_identifiers + assemble_categories + assemble_locations + assemble_descriptions + assemble_other_metadata + assemble_dates + assemble_publishing + assemble_people + assemble_spatial + assemble_references + assemble_text + assemble_collection_specific + + @json + end + + ############## + # components # + ############## + + def assemble_categories + @json["category"] = category + @json["subcategory"] = subcategory + @json["data_type"] = data_type + @json["collection"] = collection + @json["collection_desc"] = collection_desc + @json["subjects"] = subjects + end + + def assemble_collection_specific + # add your own per collection + # with format + # @json["fieldname"] = field_contents + end + + def assemble_dates + @json["date_display"] = date_display + @json["date"] = date + @json["date_not_before"] = date_not_before + @json["date_not_after"] = date_not_after + end + + def assemble_descriptions + @json["title_sort"] = title_sort + @json["title"] = title + @json["description"] = description + 
@json["topics"] = topics + # @json["alternative"] + end + + def assemble_identifiers + @json["identifier"] = @id + end + + def assemble_locations + @json["uri"] = uri + @json["uri_data"] = uri_data + @json["uri_html"] = uri_html + @json["image_id"] = image_id + end + + def assemble_other_metadata + @json["format"] = format + @json["language"] = language + @json["languages"] = languages + # @json["relation"] + # @json["type"] + # @json["extent"] + @json["medium"] = medium + end + + def assemble_people + # container fields + @json["person"] = person + @json["contributor"] = contributor + @json["creator"] = creator + @json["recipient"] = recipient + # can draw off of container fields + @json["creator_sort"] = creator_sort + @json["people"] = people + end + + def assemble_publishing + @json["publisher"] = publisher + @json["rights"] = rights + @json["rights_uri"] = rights_uri + @json["rights_holder"] = rights_holder + @json["source"] = source + end + + def assemble_references + @json["keywords"] = keywords + @json["places"] = places + @json["works"] = works + end + + def assemble_spatial + # TODO not sure about the naming convention here? 
+ # TODO has place_name, coordinates, id, city, county, country, + # region, state, street, postal_code + # @json["coverage.spatial"] + end + + def assemble_text + @json["annotations_text"] = annotations_text + @json["text"] = text + # @json["abstract"] + end + +end From 310ce401b7e1da503e1e8d7b5c12b211316b79ed Mon Sep 17 00:00:00 2001 From: Jessica Dussault Date: Fri, 7 Feb 2020 09:40:17 -0600 Subject: [PATCH 04/15] moves tests for the methods I changed up --- test/common_xml_test.rb | 44 ----------------------------------------- test/helpers_test.rb | 44 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/test/common_xml_test.rb b/test/common_xml_test.rb index 05c4879a1..765f85c97 100644 --- a/test/common_xml_test.rb +++ b/test/common_xml_test.rb @@ -44,50 +44,6 @@ def test_create_xml_object # TODO end - def test_date_display - # normal dates - assert_equal "December 2, 2016", CommonXml.date_display("2016-12-02") - assert_equal "January 31, 2014", CommonXml.date_display("2014-01-31", "no date") - # no date - assert_equal "N.D.", CommonXml.date_display(nil) - assert_equal "no date", CommonXml.date_display("20143183", "no date") - assert_equal "", CommonXml.date_display(nil, "") - end - - def test_date_standardize - # missing month and day - assert_equal "2016-01-01", CommonXml.date_standardize("2016") - assert_equal "2016-12-31", CommonXml.date_standardize("2016", false) - # missing day - assert_nil CommonXml.date_standardize("01-12") - assert_equal "2014-01-01", CommonXml.date_standardize("2014-01") - assert_equal "2014-01-31", CommonXml.date_standardize("2014-01", false) - # complete date - assert_equal "2014-01-12", CommonXml.date_standardize("2014-01-12") - # invalid date - assert_nil CommonXml.date_standardize("2014-30-31") - # February final day - assert_equal "2015-02-28", CommonXml.date_standardize("2015-2", false) - assert_equal "2016-02-29", CommonXml.date_standardize("2016-02", false) - end - - def 
test_normalize_name - assert_equal "title", CommonXml.normalize_name("The Title") - assert_equal "anne of green gables", CommonXml.normalize_name("Anne of Green Gables") - assert_equal "fancy party", CommonXml.normalize_name("A Fancy Party") - assert_equal "hour", CommonXml.normalize_name("An Hour") - end - - def test_normalize_space - # ensure that return characters are replaced by spaces, and multispaces squashed - test1 = " \rExample \n \n " - assert_equal " Example ", CommonXml.normalize_space(test1) - - # check that newlines are dead regardless - test2 = "\rExa\rmple\n" - assert_equal " Exa mple ", CommonXml.normalize_space(test2) - end - def test_sub_corrections xml_string = "Somethng Something" xml = Nokogiri::XML xml_string diff --git a/test/helpers_test.rb b/test/helpers_test.rb index 7c3c72440..357c64e01 100644 --- a/test/helpers_test.rb +++ b/test/helpers_test.rb @@ -3,6 +3,33 @@ class Datura::HelpersTest < Minitest::Test + def test_date_display + # normal dates + assert_equal "December 2, 2016", Datura::Helpers.date_display("2016-12-02") + assert_equal "January 31, 2014", Datura::Helpers.date_display("2014-01-31", "no date") + # no date + assert_equal "N.D.", Datura::Helpers.date_display(nil) + assert_equal "no date", Datura::Helpers.date_display("20143183", "no date") + assert_equal "", Datura::Helpers.date_display(nil, "") + end + + def test_date_standardize + # missing month and day + assert_equal "2016-01-01", Datura::Helpers.date_standardize("2016") + assert_equal "2016-12-31", Datura::Helpers.date_standardize("2016", false) + # missing day + assert_nil Datura::Helpers.date_standardize("01-12") + assert_equal "2014-01-01", Datura::Helpers.date_standardize("2014-01") + assert_equal "2014-01-31", Datura::Helpers.date_standardize("2014-01", false) + # complete date + assert_equal "2014-01-12", Datura::Helpers.date_standardize("2014-01-12") + # invalid date + assert_nil Datura::Helpers.date_standardize("2014-30-31") + # February final day + 
assert_equal "2015-02-28", Datura::Helpers.date_standardize("2015-2", false) + assert_equal "2016-02-29", Datura::Helpers.date_standardize("2016-02", false) + end + def test_get_directory_files # real directory files = Datura::Helpers.get_directory_files("#{File.dirname(__FILE__)}/fixtures") @@ -25,6 +52,23 @@ def test_make_dirs # TODO end + def test_normalize_name + assert_equal "title", Datura::Helpers.normalize_name("The Title") + assert_equal "anne of green gables", Datura::Helpers.normalize_name("Anne of Green Gables") + assert_equal "fancy party", Datura::Helpers.normalize_name("A Fancy Party") + assert_equal "hour", Datura::Helpers.normalize_name("An Hour") + end + + def test_normalize_space + # ensure that return characters are replaced by spaces, and multispaces squashed + test1 = " \rExample \n \n " + assert_equal " Example ", Datura::Helpers.normalize_space(test1) + + # check that newlines are dead regardless + test2 = "\rExa\rmple\n" + assert_equal " Exa mple ", Datura::Helpers.normalize_space(test2) + end + def test_regex_files test_files = %w[ /path/to/cody.book.001.xml From 8fa4b860260c2f9e8fe3851ceb43263b9ea61410 Mon Sep 17 00:00:00 2001 From: Jessica Dussault Date: Fri, 7 Feb 2020 10:20:11 -0600 Subject: [PATCH 05/15] removes unnecessary returns, some variable renames in a lot of cases, there were self-referential things like def category category = xpath_result end these have been renamed where discovered --- lib/datura/common_xml.rb | 8 ++-- lib/datura/data_manager.rb | 10 ++--- lib/datura/file_type.rb | 6 +-- lib/datura/file_types/file_csv.rb | 6 +-- lib/datura/file_types/file_tei.rb | 2 +- lib/datura/file_types/file_vra.rb | 2 +- lib/datura/helpers.rb | 12 +++--- lib/datura/options.rb | 4 +- lib/datura/parser.rb | 2 +- lib/datura/parser_options/post.rb | 2 +- .../parser_options/solr_create_api_ore.rb | 2 +- .../parser_options/solr_manage_schema.rb | 2 +- lib/datura/solr_poster.rb | 12 +++--- lib/datura/to_es/tei_to_es/fields.rb | 39 
+++++++++---------- .../tei_to_es/tei_to_es_personography.rb | 8 ++-- lib/datura/to_es/vra_to_es/fields.rb | 18 ++++----- .../vra_to_es/vra_to_es_personography.rb | 8 ++-- lib/datura/to_es/xml_to_es.rb | 8 ++-- lib/datura/to_es/xml_to_es_request.rb | 2 +- 19 files changed, 74 insertions(+), 79 deletions(-) diff --git a/lib/datura/common_xml.rb b/lib/datura/common_xml.rb index 163be72ad..7227b2c7b 100644 --- a/lib/datura/common_xml.rb +++ b/lib/datura/common_xml.rb @@ -20,7 +20,7 @@ def self.convert_tags(xml) ele.delete("rend") end xml = CommonXml.sub_corrections(xml) - return xml + xml end # wrap in order to make valid xml @@ -29,14 +29,14 @@ def self.convert_tags(xml) def self.convert_tags_in_string(text) xml = Nokogiri::XML("#{text}") converted = convert_tags(xml) - return converted.xpath("//xml").inner_html + converted.xpath("//xml").inner_html end def self.create_xml_object(filepath, remove_ns=true) file_xml = File.open(filepath) { |f| Nokogiri::XML(f, &:noblanks) } # TODO is this a good idea? file_xml.remove_namespaces! 
if remove_ns - return file_xml + file_xml end # deprecated method @@ -66,7 +66,7 @@ def self.stringify_params(param_hash) if param_hash params = param_hash.map{ |k, v| "#{k}=#{v}" }.join(" ") end - return params + params end def self.sub_corrections(aXml) diff --git a/lib/datura/data_manager.rb b/lib/datura/data_manager.rb index 861d71da6..fa46e7327 100644 --- a/lib/datura/data_manager.rb +++ b/lib/datura/data_manager.rb @@ -63,7 +63,7 @@ def load_collection_classes def print_options pretty = JSON.pretty_generate(@options) puts "Options: #{pretty}" - return pretty + pretty end def run @@ -179,7 +179,7 @@ def get_files found = Datura::Helpers.get_directory_files(File.join(@options["collection_dir"], "source", format)) files += found if found end - return files + files end def options_msg @@ -196,7 +196,7 @@ def options_msg if @options["verbose"] print_options end - return msg + msg end # override this step in project specific files @@ -241,7 +241,7 @@ def prepare_files @log.error(msg) end end - return file_classes + file_classes end def prepare_xslt @@ -293,7 +293,7 @@ def set_up_logger def should_transform?(type) # adjust default transformation type in params parser - return @options["transform_types"].include?(type) + @options["transform_types"].include?(type) end def transform_and_post(file) diff --git a/lib/datura/file_type.rb b/lib/datura/file_type.rb index 38ca0806e..2dbdd9edb 100644 --- a/lib/datura/file_type.rb +++ b/lib/datura/file_type.rb @@ -95,11 +95,11 @@ def post_solr(url=nil) def print_es json = transform_es - return pretty_json(json) + pretty_json(json) end def print_solr - return transform_solr + transform_solr end # these rules apply to all XML files (HTML / TEI / VRA) @@ -149,7 +149,7 @@ def transform_solr else req = exec_xsl(@file_location, @script_solr, "xml", nil, @options["variables_solr"]) end - return req + req end private diff --git a/lib/datura/file_types/file_csv.rb b/lib/datura/file_types/file_csv.rb index 57f8efd71..65655a940 100644 --- 
a/lib/datura/file_types/file_csv.rb +++ b/lib/datura/file_types/file_csv.rb @@ -34,7 +34,7 @@ def present?(item) # override to change encoding def read_csv(file_location, encoding="utf-8") - return CSV.read(file_location, { + CSV.read(file_location, { encoding: encoding, headers: true, return_headers: true @@ -57,7 +57,7 @@ def row_to_solr(doc, headers, row) headers.each do |column| doc.add_child("#{row[column]}") if row[column] end - return doc + doc end def transform_es @@ -107,7 +107,7 @@ def transform_solr filepath = "#{@out_solr}/#{self.filename(false)}.xml" File.open(filepath, "w") { |f| f.write(solr_doc.root.to_xml) } end - return { "doc" => solr_doc.root.to_xml } + { "doc" => solr_doc.root.to_xml } end def write_html_to_file(builder, index) diff --git a/lib/datura/file_types/file_tei.rb b/lib/datura/file_types/file_tei.rb index 66fd4a970..d756450f2 100644 --- a/lib/datura/file_types/file_tei.rb +++ b/lib/datura/file_types/file_tei.rb @@ -17,7 +17,7 @@ def initialize(file_location, options) def subdoc_xpaths # match subdocs against classes - return { + { "/TEI" => TeiToEs, # "//listPerson/person" => TeiToEsPersonography, } diff --git a/lib/datura/file_types/file_vra.rb b/lib/datura/file_types/file_vra.rb index cf8b9bd31..e48e4587b 100644 --- a/lib/datura/file_types/file_vra.rb +++ b/lib/datura/file_types/file_vra.rb @@ -11,7 +11,7 @@ def initialize(file_location, options) def subdoc_xpaths # planning ahead on this one, but not necessary at the moment - return { + { "/vra" => VraToEs, "//listPerson/person" => VraToEsPersonography } diff --git a/lib/datura/helpers.rb b/lib/datura/helpers.rb index 2ed51363a..2e841d267 100644 --- a/lib/datura/helpers.rb +++ b/lib/datura/helpers.rb @@ -54,10 +54,10 @@ def self.get_directory_files(directory, verbose_flag=false) exists = File.directory?(directory) if exists files = Dir["#{directory}/*"] # grab all the files inside that directory - return files + files else puts "Unable to find a directory at #{directory}" if 
verbose_flag - return nil + nil end end # end get_directory_files @@ -70,14 +70,14 @@ def self.get_input(original_input, msg) puts "#{msg}: \n" new_input = STDIN.gets.chomp if !new_input.nil? && new_input.length > 0 - return new_input + new_input else # keep bugging the user until they answer or despair puts "Please enter a valid response" get_input(nil, msg) end else - return original_input + original_input end end @@ -136,11 +136,11 @@ def self.regex_files(files, regex=nil) def self.should_update?(file, since_date=nil) if since_date.nil? # if there is no specified date, then update everything - return true + true else # if a file has been updated since a time specified by user file_date = File.mtime(file) - return file_date > since_date + file_date > since_date end end diff --git a/lib/datura/options.rb b/lib/datura/options.rb index 25ce6b352..36d4e47e2 100644 --- a/lib/datura/options.rb +++ b/lib/datura/options.rb @@ -70,7 +70,7 @@ def remove_environments(config) end end end - return new_config + new_config end # remove the unneeded environment and put everything at the first level @@ -85,7 +85,7 @@ def smash_configs collection = c.merge(d) # collection overrides general config - return general.merge(collection) + general.merge(collection) end end diff --git a/lib/datura/parser.rb b/lib/datura/parser.rb index 8b5655c52..b66fcc4d9 100644 --- a/lib/datura/parser.rb +++ b/lib/datura/parser.rb @@ -25,7 +25,7 @@ def self.argv_collection_dir(argv) puts @usage exit end - return collection_dir + collection_dir end # take a string in utc and create a time object with it diff --git a/lib/datura/parser_options/post.rb b/lib/datura/parser_options/post.rb index 6f52cf2ad..f52c154bc 100644 --- a/lib/datura/parser_options/post.rb +++ b/lib/datura/parser_options/post.rb @@ -86,6 +86,6 @@ def self.post_params # magic optparse.parse! 
- return options + options end end diff --git a/lib/datura/parser_options/solr_create_api_ore.rb b/lib/datura/parser_options/solr_create_api_ore.rb index 41bacb101..134e45707 100644 --- a/lib/datura/parser_options/solr_create_api_ore.rb +++ b/lib/datura/parser_options/solr_create_api_ore.rb @@ -28,6 +28,6 @@ def self.solr_create_api_core_params exit end - return options + options end end diff --git a/lib/datura/parser_options/solr_manage_schema.rb b/lib/datura/parser_options/solr_manage_schema.rb index 605921082..0721b693b 100644 --- a/lib/datura/parser_options/solr_manage_schema.rb +++ b/lib/datura/parser_options/solr_manage_schema.rb @@ -32,6 +32,6 @@ def self.solr_manage_schema_params optparse.parse! - return options + options end end diff --git a/lib/datura/solr_poster.rb b/lib/datura/solr_poster.rb index 71066d8b4..eb4434a88 100644 --- a/lib/datura/solr_poster.rb +++ b/lib/datura/solr_poster.rb @@ -23,7 +23,7 @@ def clear_index else puts "Unable to clear index!" end - return res + res end def clear_index_by_regex(field, regex) @@ -37,7 +37,7 @@ def clear_index_by_regex(field, regex) else puts "Unable to clear files from index!" end - return res + res end # returns an error or nil @@ -49,7 +49,7 @@ def commit_solr puts "UNABLE TO COMMIT YOUR CHANGES TO SOLR. Please commit manually" end end - return commit_res + commit_res end def post(content, type) @@ -60,7 +60,7 @@ def post(content, type) request = Net::HTTP::Post.new(url.request_uri) request.body = content request["Content-Type"] = type - return http.request(request) + http.request(request) end # post_file @@ -68,7 +68,7 @@ def post(content, type) # TODO refactor? def post_file(file_location) file = IO.read(file_location) - return post_xml(file) + post_xml(file) end # post_json @@ -91,7 +91,7 @@ def post_xml(content) if content.nil? || content.empty? puts "Missing content to index to Solr. Please check that files are" puts "available to be converted to Solr format and that they were transformed." 
- return nil + nil else post(content, "application/xml") end diff --git a/lib/datura/to_es/tei_to_es/fields.rb b/lib/datura/to_es/tei_to_es/fields.rb index ed727bdb5..b33596771 100644 --- a/lib/datura/to_es/tei_to_es/fields.rb +++ b/lib/datura/to_es/tei_to_es/fields.rb @@ -20,19 +20,19 @@ def annotations_text end def category - category = get_text(@xpaths["category"]) - return category.length > 0 ? Datura::Helpers.normalize_space(category) : "none" + cat = get_text(@xpaths["category"]) + cat.length > 0 ? Datura::Helpers.normalize_space(cat) : "none" end # note this does not sort the creators def creator creators = get_list(@xpaths["creators"]) - return creators.map { |creator| { "name" => Datura::Helpers.normalize_space(creator) } } + creators.map { |c| { "name" => Datura::Helpers.normalize_space(c) } } end # returns ; delineated string of alphabetized creators def creator_sort - return get_text(@xpaths["creators"]) + get_text(@xpaths["creators"]) end def collection @@ -64,11 +64,11 @@ def data_type def date(before=true) datestr = get_text(@xpaths["date"]) - return Datura::Helpers.date_standardize(datestr, before) + Datura::Helpers.date_standardize(datestr, before) end def date_display - date = get_text(@xpaths["date_display"]) + get_text(@xpaths["date_display"]) end def date_not_after @@ -121,14 +121,13 @@ def person # and put in the xpaths above, also for attributes, etc # should contain name, id, and role eles = @xml.xpath(@xpaths["person"]) - people = eles.map do |p| + eles.map do |p| { "id" => "", "name" => Datura::Helpers.normalize_space(p.text), "role" => Datura::Helpers.normalize_space(p["role"]) } end - return people end def people @@ -136,7 +135,7 @@ def people end def places - return get_list(@xpaths["places"]) + get_list(@xpaths["places"]) end def publisher @@ -145,14 +144,13 @@ def publisher def recipient eles = @xml.xpath(@xpaths["recipient"]) - people = eles.map do |p| + eles.map do |p| { "id" => "", "name" => Datura::Helpers.normalize_space(p.text), 
"role" => "recipient" } end - return people end def rights @@ -179,8 +177,8 @@ def subjects end def subcategory - subcategory = get_text(@xpaths["subcategory"]) - subcategory.length > 0 ? subcategory : "none" + subcat = get_text(@xpaths["subcategory"]) + subcat.length > 0 ? subcat : "none" end def text @@ -204,16 +202,15 @@ def text_additional end def title - title = get_text(@xpaths["titles"]["main"]) - if title.empty? - title = get_text(@xpaths["titles"]["alt"]) + title_disp = get_text(@xpaths["titles"]["main"]) + if title_disp.empty? + title_disp = get_text(@xpaths["titles"]["alt"]) end - return title + title_disp end def title_sort - t = title - Datura::Helpers.normalize_name(t) + Datura::Helpers.normalize_name(title) end def topics @@ -228,13 +225,13 @@ def uri def uri_data base = @options["data_base"] subpath = "data/#{@options["collection"]}/source/tei" - return "#{base}/#{subpath}/#{@id}.xml" + "#{base}/#{subpath}/#{@id}.xml" end def uri_html base = @options["data_base"] subpath = "data/#{@options["collection"]}/output/#{@options["environment"]}/html" - return "#{base}/#{subpath}/#{@id}.html" + "#{base}/#{subpath}/#{@id}.html" end def works diff --git a/lib/datura/to_es/tei_to_es/tei_to_es_personography.rb b/lib/datura/to_es/tei_to_es/tei_to_es_personography.rb index 81105ec64..7e4ff79be 100644 --- a/lib/datura/to_es/tei_to_es/tei_to_es_personography.rb +++ b/lib/datura/to_es/tei_to_es/tei_to_es_personography.rb @@ -1,7 +1,7 @@ class TeiToEsPersonography < TeiToEs def override_xpaths - return { + { "titles" => { "main" => "persName[@type='display']", "alt" => "persName" @@ -16,16 +16,16 @@ def category def creator creators = get_list(@xpaths["creators"], false, @parent_xml) - return creators.map { |creator| { "name" => creator } } + creators.map { |c| { "name" => c } } end def creators - return get_text(@xpaths["creators"], false, @parent_xml) + get_text(@xpaths["creators"], false, @parent_xml) end def get_id person = @xml["id"] - return 
"#{@filename}_#{person}" + "#{@filename}_#{person}" end def person diff --git a/lib/datura/to_es/vra_to_es/fields.rb b/lib/datura/to_es/vra_to_es/fields.rb index 11a45ba61..ed3d2e45e 100644 --- a/lib/datura/to_es/vra_to_es/fields.rb +++ b/lib/datura/to_es/vra_to_es/fields.rb @@ -26,12 +26,12 @@ def category # note this does not sort the creators def creator creators = get_list(@xpaths["creators"]) - return creators.map { |creator| { "name" => Datura::Helpers.normalize_space(creator) } } + creators.map { |c| { "name" => Datura::Helpers.normalize_space(c) } } end # returns ; delineated string of alphabetized creators def creator_sort - return get_text(@xpaths["creators"]) + get_text(@xpaths["creators"]) end def collection @@ -52,7 +52,7 @@ def contributor "role" => Datura::Helpers.normalize_space(ele.xpath("role").text) } end - return contrib_list + contrib_list end def data_type @@ -112,7 +112,7 @@ def person # and put in the xpaths above, also for attributes, etc # should contain name, id, and role eles = @xml.xpath(@xpaths["person"]) - return eles.map do |p| + eles.map do |p| { "id" => "", "name" => Datura::Helpers.normalize_space(p.text), @@ -135,14 +135,13 @@ def publisher def recipient eles = @xml.xpath(@xpaths["recipient"]) - people = eles.map do |p| + eles.map do |p| { "id" => "", "name" => Datura::Helpers.normalize_space(p.text), "role" => Datura::Helpers.normalize_space(p["role"]), } end - return people end def rights @@ -196,8 +195,7 @@ def title end def title_sort - t = title - Datura::Helpers.normalize_name(t) + Datura::Helpers.normalize_name(title) end def topics @@ -212,13 +210,13 @@ def uri def uri_data base = @options["data_base"] subpath = "data/#{@options["collection"]}/source/vra" - return "#{base}/#{subpath}/#{@id}.xml" + "#{base}/#{subpath}/#{@id}.xml" end def uri_html base = @options["data_base"] subpath = "data/#{@options["collection"]}/output/#{@options["environment"]}/html" - return "#{base}/#{subpath}/#{@id}.html" + 
"#{base}/#{subpath}/#{@id}.html" end def works diff --git a/lib/datura/to_es/vra_to_es/vra_to_es_personography.rb b/lib/datura/to_es/vra_to_es/vra_to_es_personography.rb index 9ae5d718c..e4fd6f442 100644 --- a/lib/datura/to_es/vra_to_es/vra_to_es_personography.rb +++ b/lib/datura/to_es/vra_to_es/vra_to_es_personography.rb @@ -1,7 +1,7 @@ class VraToEsPersonography < TeiToEs def override_xpaths - return { + { "titles" => { "main" => "persName[@type='display']", "alt" => "persName" @@ -16,16 +16,16 @@ def category def creator creators = get_list(@xpaths["creators"], false, @parent_xml) - return creators.map { |creator| { "name" => creator } } + creators.map { |c| { "name" => c } } end def creator_sort - return get_text(@xpaths["creators"], false, @parent_xml) + get_text(@xpaths["creators"], false, @parent_xml) end def get_id person = @xml["id"] - return "#{@filename}_#{person}" + "#{@filename}_#{person}" end def person diff --git a/lib/datura/to_es/xml_to_es.rb b/lib/datura/to_es/xml_to_es.rb index 4a634dd61..e736fb966 100644 --- a/lib/datura/to_es/xml_to_es.rb +++ b/lib/datura/to_es/xml_to_es.rb @@ -51,7 +51,7 @@ def create_json end def get_id - return @filename + @filename end def override_xpaths @@ -74,7 +74,7 @@ def override_xpaths # returns an array with the html value in xpath def get_list(xpaths, keep_tags=false, xml=nil) xpath_array = xpaths.class == Array ? xpaths : [xpaths] - return get_xpaths(xpath_array, keep_tags, xml) + get_xpaths(xpath_array, keep_tags, xml) end # get_text @@ -87,7 +87,7 @@ def get_text(xpaths, keep_tags=false, xml=nil, delimiter=";") xpath_array = xpaths.class == Array ? 
xpaths : [xpaths] list = get_xpaths(xpath_array, keep_tags, xml) sorted = list.sort - return sorted.join("#{delimiter} ") + sorted.join("#{delimiter} ") end # Note: Recommend that collection team do NOT use this method directly @@ -117,7 +117,7 @@ def get_xpaths(xpaths, keep_tags=false, xml=nil) end end end - return list.uniq + list.uniq end def preprocessing diff --git a/lib/datura/to_es/xml_to_es_request.rb b/lib/datura/to_es/xml_to_es_request.rb index 051ae5b3f..eeef7bba7 100644 --- a/lib/datura/to_es/xml_to_es_request.rb +++ b/lib/datura/to_es/xml_to_es_request.rb @@ -27,7 +27,7 @@ def assemble_json assemble_text assemble_collection_specific - return @json + @json end ############## From 4d2c92e568284f296a7ac24678f9e0ba2fbc00d8 Mon Sep 17 00:00:00 2001 From: Jessica Dussault Date: Fri, 7 Feb 2020 10:36:14 -0600 Subject: [PATCH 06/15] fixes typo in filename --- .../{solr_create_api_ore.rb => solr_create_api_core.rb} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename lib/datura/parser_options/{solr_create_api_ore.rb => solr_create_api_core.rb} (100%) diff --git a/lib/datura/parser_options/solr_create_api_ore.rb b/lib/datura/parser_options/solr_create_api_core.rb similarity index 100% rename from lib/datura/parser_options/solr_create_api_ore.rb rename to lib/datura/parser_options/solr_create_api_core.rb From 9909ab89f71e990223890462ee745dd4c0365213 Mon Sep 17 00:00:00 2001 From: Jessica Dussault Date: Fri, 7 Feb 2020 12:22:35 -0600 Subject: [PATCH 07/15] building basic hookup for custom filetype ideally this will also have a CustomToEs class in the future, but at least right now this could be usable for folks! 
--- lib/datura/data_manager.rb | 4 +- lib/datura/file_types/file_custom.rb | 56 ++++++++++++++++++++++++++++ lib/datura/parser_options/post.rb | 12 +++--- 3 files changed, 64 insertions(+), 8 deletions(-) create mode 100644 lib/datura/file_types/file_custom.rb diff --git a/lib/datura/data_manager.rb b/lib/datura/data_manager.rb index fa46e7327..9ae304a43 100644 --- a/lib/datura/data_manager.rb +++ b/lib/datura/data_manager.rb @@ -18,13 +18,15 @@ class Datura::DataManager attr_accessor :collection def self.format_to_class - { + classes = { "csv" => FileCsv, "html" => FileHtml, "tei" => FileTei, "vra" => FileVra, "webs" => FileWebs } + classes.default = FileCustom + classes end def initialize diff --git a/lib/datura/file_types/file_custom.rb b/lib/datura/file_types/file_custom.rb new file mode 100644 index 000000000..8408dd879 --- /dev/null +++ b/lib/datura/file_types/file_custom.rb @@ -0,0 +1,56 @@ +require_relative "../helpers.rb" +require_relative "../file_type.rb" + +require "rest-client" + +class FileCustom < FileType + attr_reader :es_req, :format + + def initialize(file_location, options) + super(file_location, options) + @format = get_format(file_location) + @file = read_file(file_location) + end + + def build_es_documents + # currently assuming that the file has one document to post + # but since some may include more (personographies, spreadsheets, etc) + # this should return an array of documents + [] + end + + def get_format(file_location) + # assumes that the format is in the directory structure + File.dirname(file_location).split("/").last + end + + # NOTE: you will likely need to override this method + # depending on the format in question + def read_file(file_location) + File.read(file_location) + end + + def transform_es + puts "transforming #{self.filename}" + # expecting an array + es_doc = build_es_documents + + if @options["output"] + filepath = "#{@out_es}/#{self.filename(false)}.json" + File.open(filepath, "w") { |f| 
f.write(pretty_json(es_doc)) } + end + es_doc + end + + def transform_html + raise "Custom format to HTML transformation must be implemented in collection" + end + + def transform_iiif + raise "Custom format to IIIF transformation must be implemented in collection" + end + + def transform_solr + raise "Custom format to Solr transformation must be implemented in collection" + end +end diff --git a/lib/datura/parser_options/post.rb b/lib/datura/parser_options/post.rb index f52c154bc..44783c1c8 100644 --- a/lib/datura/parser_options/post.rb +++ b/lib/datura/parser_options/post.rb @@ -22,14 +22,12 @@ def self.post_params # default to no restricted format options["format"] = nil - opts.on( '-f', '--format [input]', 'Restrict to one format (csv, html, tei, vra, webs)') do |input| - if %w[csv html tei vra webs].include?(input) - options["format"] = input - else - puts "Format #{input} is not recognized.".red - puts "Allowed formats are csv, html, tei, vra, and webs (web-scraped html)" - exit + opts.on( '-f', '--format [input]', 'Supported formats (csv, html, tei, vra, webs)') do |input| + if !%w[csv html tei vra webs].include?(input) + puts "Caution: Requested custom format #{input}.".red + puts "See FileCustom class for implementation instructions" end + options["format"] = input end options["commit"] = true From 2f985d2c4e627467aac2da6f10bf27488d8cfcda Mon Sep 17 00:00:00 2001 From: Jessica Dussault Date: Mon, 10 Feb 2020 13:59:17 -0600 Subject: [PATCH 08/15] JSON structure mixin, sets up CustomToEs basics it's very difficult to make any fields for an unknown format, so I just roughed in what I could also went through and moved all the JSON request building stuff out into a module so that we don't have to redefine it over and over! 
This will make it easier to add and update ES fields if our API changes, so that's nice --- lib/datura/file_types/file_custom.rb | 24 ++- lib/datura/requirer.rb | 18 +- lib/datura/to_es/csv_to_es/request.rb | 122 +----------- lib/datura/to_es/custom_to_es.rb | 57 ++++++ lib/datura/to_es/custom_to_es/fields.rb | 186 ++++++++++++++++++ lib/datura/to_es/custom_to_es/request.rb | 7 + .../{xml_to_es_request.rb => es_request.rb} | 16 +- lib/datura/to_es/html_to_es/request.rb | 4 +- lib/datura/to_es/tei_to_es/request.rb | 2 +- lib/datura/to_es/vra_to_es/request.rb | 4 +- lib/datura/to_es/webs_to_es/request.rb | 4 +- lib/datura/to_es/xml_to_es.rb | 2 +- 12 files changed, 301 insertions(+), 145 deletions(-) create mode 100644 lib/datura/to_es/custom_to_es.rb create mode 100644 lib/datura/to_es/custom_to_es/fields.rb create mode 100644 lib/datura/to_es/custom_to_es/request.rb rename lib/datura/to_es/{xml_to_es_request.rb => es_request.rb} (87%) diff --git a/lib/datura/file_types/file_custom.rb b/lib/datura/file_types/file_custom.rb index 8408dd879..63359ec41 100644 --- a/lib/datura/file_types/file_custom.rb +++ b/lib/datura/file_types/file_custom.rb @@ -16,7 +16,21 @@ def build_es_documents # currently assuming that the file has one document to post # but since some may include more (personographies, spreadsheets, etc) # this should return an array of documents - [] + # NOTE this would also be a pretty reasonable method to override + # if you need to split your documents into classes of your own creation + # like "YamlToEs" or "XlsToEs", etc + docs = [] + subdocs.each do |subdoc| + puts "just checking that there's a subdoc here!" 
+ docs << CustomToEs.new( + subdoc, + options: @options, + file: @file, + filename: self.filename, + file_type: @format) + .json + end + docs.compact end def get_format(file_location) @@ -30,6 +44,13 @@ def read_file(file_location) File.read(file_location) end + def subdocs + # if the file should be split into components (such as a CSV row + # or personography person entry), override this method to return + # an array of items + Array(@file) + end + def transform_es puts "transforming #{self.filename}" # expecting an array @@ -42,6 +63,7 @@ def transform_es es_doc end + # CURRENTLY NO SUPPORT FOR FOLLOWING TRANSFORMATIONS def transform_html raise "Custom format to HTML transformation must be implemented in collection" end diff --git a/lib/datura/requirer.rb b/lib/datura/requirer.rb index 48eed7149..75c7bb247 100644 --- a/lib/datura/requirer.rb +++ b/lib/datura/requirer.rb @@ -5,19 +5,11 @@ current_dir = File.expand_path(File.dirname(__FILE__)) -require_relative "to_es/csv_to_es.rb" +require_relative "to_es/es_request.rb" -require_relative "to_es/html_to_es.rb" - -require_relative "to_es/tei_to_es.rb" -require_relative "to_es/tei_to_es/tei_to_es_personography.rb" - -require_relative "to_es/webs_to_es.rb" - -require_relative "to_es/vra_to_es.rb" -require_relative "to_es/vra_to_es/vra_to_es_personography.rb" - -# Dir["#{current_dir}/tei_to_es/*.rb"].each {|f| require f } +# x_to_es classes +Dir["#{current_dir}/to_es/*.rb"].each { |f| require f } +Dir["#{current_dir}/to_es/**/*.rb"].each { |f| require f } # file types -Dir["#{current_dir}/file_types/*.rb"].each {|f| require f } +Dir["#{current_dir}/file_types/*.rb"].each { |f| require f } diff --git a/lib/datura/to_es/csv_to_es/request.rb b/lib/datura/to_es/csv_to_es/request.rb index c047c879e..b361f3f04 100644 --- a/lib/datura/to_es/csv_to_es/request.rb +++ b/lib/datura/to_es/csv_to_es/request.rb @@ -1,122 +1,8 @@ -# request creation portion of CSV to ES transformation class CsvToEs + include EsRequest - def 
assemble_json - # Note: if your collection does not require a specific field - # it may be better to override the field's behavior to return - # nil than to alter the below field list methods - # values being sent, because otherwise they could just override - # the field behavior to be blank - - # below not alphabetical to reflect their position - # in the cdrh api schema - assemble_identifiers - assemble_categories - assemble_locations - assemble_descriptions - assemble_other_metadata - assemble_dates - assemble_publishing - assemble_people - assemble_spatial - assemble_references - assemble_text - assemble_collection_specific - - @json - end - - ############## - # components # - ############## - - def assemble_categories - @json["category"] = category - @json["subcategory"] = subcategory - @json["data_type"] = data_type - @json["collection"] = collection - @json["collection_desc"] = collection_desc - @json["subjects"] = subjects - end - - def assemble_collection_specific - # add your own per collection - # with format - # @json["fieldname"] = field_contents - end - - def assemble_dates - @json["date_display"] = date_display - @json["date"] = date - @json["date_not_before"] = date_not_before - @json["date_not_after"] = date_not_after - end - - def assemble_descriptions - @json["title_sort"] = title_sort - @json["title"] = title - @json["description"] = description - @json["topics"] = topics - # @json["alternative"] - end - - def assemble_identifiers - @json["identifier"] = @id - end - - def assemble_locations - @json["uri"] = uri - @json["uri_data"] = uri_data - @json["uri_html"] = uri_html - @json["image_id"] = image_id - end - - def assemble_other_metadata - @json["format"] = format - @json["language"] = language - @json["languages"] = languages - # @json["relation"] - # @json["type"] - # @json["extent"] - @json["medium"] = medium - end - - def assemble_people - # container fields - @json["person"] = person - @json["contributor"] = contributor - @json["creator"] 
= creator - @json["recipient"] = recipient - # can draw off of container fields - @json["creator_sort"] = creator_sort - @json["people"] = people - end - - def assemble_publishing - @json["publisher"] = publisher - @json["rights"] = rights - @json["rights_uri"] = rights_uri - @json["rights_holder"] = rights_holder - @json["source"] = source - end - - def assemble_references - @json["keywords"] = keywords - @json["places"] = places - @json["works"] = works - end - - def assemble_spatial - # TODO not sure about the naming convention here? - # TODO has place_name, coordinates, id, city, county, country, - # region, state, street, postal_code - # @json["coverage.spatial"] - end - - def assemble_text - @json["annotations_text"] = annotations_text - @json["text"] = text - # @json["abstract"] - end + # please refer to generic es_request.rb file + # and override the JSON being sent to elasticsearch here, if needed + # project specific overrides should go in the COLLECTION's overrides! end diff --git a/lib/datura/to_es/custom_to_es.rb b/lib/datura/to_es/custom_to_es.rb new file mode 100644 index 000000000..fe39b5690 --- /dev/null +++ b/lib/datura/to_es/custom_to_es.rb @@ -0,0 +1,57 @@ +require_relative "../helpers.rb" +require_relative "custom_to_es/fields.rb" +require_relative "custom_to_es/request.rb" + +######################################### +# NOTE: DO NOT EDIT THIS FILE!!!!!!!!! # +######################################### +# (unless you are a CDRH dev and then you may do so very cautiously) +# this file provides defaults for ALL of the collections included +# in the API and changing it could alter dozens of sites unexpectedly! +# PLEASE RUN LOADS OF TESTS AFTER A CHANGE BEFORE PUSHING TO PRODUCTION + +# WHAT IS THIS FILE? 
+# This file sets up default behavior for transforming custom +# documents to Elasticsearch JSON documents + +class CustomToEs + + attr_reader :json, :item, :file_type + + def initialize(item, options: {}, file: nil, filename: nil, file_type: nil) + @item = item + @options = options + # behaves similarly to parent_xml in that it represents + # the entire file, whereas item MAY represent a portion + # of a file (as is the case with a csv row, personography + # //person path, etc) + @file = file + @filename = filename + @file_type = file_type + @id = get_id + + create_json + end + + # getter for @json response object + def create_json + @json = {} + # if anything needs to be done before processing + # do it here (ex: reading in annotations into memory) + preprocessing + assemble_json + postprocessing + end + + def get_id + nil + end + + def preprocessing + # copy this in your custom_to_es collection file to customize + end + + def postprocessing + # copy this in your custom_to_es collection file to customize + end +end diff --git a/lib/datura/to_es/custom_to_es/fields.rb b/lib/datura/to_es/custom_to_es/fields.rb new file mode 100644 index 000000000..a0d068308 --- /dev/null +++ b/lib/datura/to_es/custom_to_es/fields.rb @@ -0,0 +1,186 @@ +class CustomToEs + # Note to add custom fields, use "assemble_collection_specific" from request.rb + # and be sure to either use the _d, _i, _k, or _t to use the correct field type + + ########## + # FIELDS # + ########## + def id + @id + end + + def id_dc + "https://cdrhapi.unl.edu/doc/#{@id}" + end + + def annotations_text + # TODO what should default behavior be? 
+ end + + def category + # TODO + end + + # nested field + def creator + # TODO + end + + # returns ; delineated string of alphabetized creators + def creator_sort + # TODO + end + + def collection + @options["collection"] + end + + def collection_desc + @options["collection_desc"] || @options["collection"] + end + + def contributor + # TODO + end + + def data_type + @file_type + end + + def date(before=true) + # TODO + # Datura::Helpers.date_standardize(??, before) + end + + def date_display + Datura::Helpers.date_display(date) if date + end + + def date_not_after + date(false) + end + + def date_not_before + date(true) + end + + def description + # Note: override per collection as needed + end + + def format + # TODO + end + + def image_id + # TODO + end + + def keywords + # TODO + end + + def language + # TODO + end + + def languages + # TODO + end + + def medium + # Default behavior is the same as "format" method + format + end + + def person + # TODO + end + + def people + # TODO + end + + def places + # TODO + end + + def publisher + # TODO + end + + def recipient + # TODO + end + + def rights + # Note: override by collection as needed + "All Rights Reserved" + end + + def rights_holder + # TODO + end + + def rights_uri + # TODO + end + + def source + # TODO + end + + def subjects + # TODO + end + + def subcategory + # TODO + end + + # text is generally going to be pulled from + def text + # TODO + # get text, add text_additional + # Datura::Helpers.normalize_space(your_text.join(" "))) + end + + # override and add by collection as needed + def text_additional + [ title ] + end + + def title + # TODO + end + + def title_sort + Datura::Helpers.normalize_name(title) if title + end + + def topics + # TODO + end + + def uri + # override per collection + # should point at the live website view of resource + end + + def uri_data + base = @options["data_base"] + subpath = "data/#{@options["collection"]}/source/#{@file_type}" + "#{base}/#{subpath}/#{@filename}" + end 
+ + def uri_html + base = @options["data_base"] + subpath = "data/#{@options["collection"]}/output/#{@options["environment"]}/html" + "#{base}/#{subpath}/#{@id}.html" + end + + def works + # TODO + end + +end diff --git a/lib/datura/to_es/custom_to_es/request.rb b/lib/datura/to_es/custom_to_es/request.rb new file mode 100644 index 000000000..804ddcfd3 --- /dev/null +++ b/lib/datura/to_es/custom_to_es/request.rb @@ -0,0 +1,7 @@ +class CustomToEs + include EsRequest + # please refer to generic es_request.rb file + # and override the JSON being sent to elasticsearch here, if needed + # project specific overrides should go in the COLLECTION's overrides! + +end diff --git a/lib/datura/to_es/xml_to_es_request.rb b/lib/datura/to_es/es_request.rb similarity index 87% rename from lib/datura/to_es/xml_to_es_request.rb rename to lib/datura/to_es/es_request.rb index eeef7bba7..89dc35625 100644 --- a/lib/datura/to_es/xml_to_es_request.rb +++ b/lib/datura/to_es/es_request.rb @@ -1,8 +1,14 @@ -# request creation portion of Xml to ES transformation -# override for VRA / TEI concerns in [type]_to_es.rb -# files or in collection specific overrides - -class XmlToEs +# assemble_json sets up the JSON structure that will be +# used to create elasticsearch documents. However, the JSON +# structure depends on subclasses to define methods like +# "category" and "subcategory" to populate the JSON. 
+# +# This module itself is not standalone, but by putting +# the JSON structure in a common place, those classes +# including it do not each need to redefine the JSON +# request structure + +module EsRequest def assemble_json # Note: if your collection does not require a specific field diff --git a/lib/datura/to_es/html_to_es/request.rb b/lib/datura/to_es/html_to_es/request.rb index e33f905b6..d8929f0d8 100644 --- a/lib/datura/to_es/html_to_es/request.rb +++ b/lib/datura/to_es/html_to_es/request.rb @@ -1,7 +1,7 @@ class HtmlToEs < XmlToEs - # please refer to generic xml to es request file, request.rb - # and override methods specific to HTML transformation here + # please refer to generic es_request.rb file + # and override the JSON being sent to elasticsearch here, if needed # project specific overrides should go in the COLLECTION's overrides! end diff --git a/lib/datura/to_es/tei_to_es/request.rb b/lib/datura/to_es/tei_to_es/request.rb index 14f3b7438..c416d8bba 100644 --- a/lib/datura/to_es/tei_to_es/request.rb +++ b/lib/datura/to_es/tei_to_es/request.rb @@ -1,6 +1,6 @@ class TeiToEs < XmlToEs - # please refer to generic xml to es request file, request.rb + # please refer to generic es_request.rb file # and override methods specific to TEI transformation here # project specific overrides should go in the COLLECTION's overrides! diff --git a/lib/datura/to_es/vra_to_es/request.rb b/lib/datura/to_es/vra_to_es/request.rb index 974ac6be7..e8d0c1b69 100644 --- a/lib/datura/to_es/vra_to_es/request.rb +++ b/lib/datura/to_es/vra_to_es/request.rb @@ -1,7 +1,7 @@ class VraToEs < XmlToEs - # please refer to generic xml to es request file, request.rb - # and override methods specific to VRA transformation here + # please refer to generic es_request.rb file + # and override the JSON being sent to elasticsearch here, if needed # project specific overrides should go in the COLLECTION's overrides! 
end diff --git a/lib/datura/to_es/webs_to_es/request.rb b/lib/datura/to_es/webs_to_es/request.rb index af67228dc..95e358de6 100644 --- a/lib/datura/to_es/webs_to_es/request.rb +++ b/lib/datura/to_es/webs_to_es/request.rb @@ -1,7 +1,7 @@ class WebsToEs < XmlToEs - # please refer to generic xml to es request file, request.rb - # and override methods specific to Web Scraped HTML transformation here + # please refer to generic es_request.rb file + # and override methods specific to TEI transformation here # project specific overrides should go in the COLLECTION's overrides! end diff --git a/lib/datura/to_es/xml_to_es.rb b/lib/datura/to_es/xml_to_es.rb index e736fb966..19324853c 100644 --- a/lib/datura/to_es/xml_to_es.rb +++ b/lib/datura/to_es/xml_to_es.rb @@ -1,7 +1,6 @@ require "nokogiri" require_relative "../helpers.rb" require_relative "../common_xml.rb" -require_relative "xml_to_es_request.rb" ######################################### # NOTE: DO NOT EDIT THIS FILE!!!!!!!!! # @@ -20,6 +19,7 @@ # about altering their behavior, customizing xpaths, etc class XmlToEs + include EsRequest attr_reader :json, :xml # variables From fa1e6033479d2c4b8f5d747e4dc837982dac5b3f Mon Sep 17 00:00:00 2001 From: Jessica Dussault Date: Mon, 10 Feb 2020 14:34:42 -0600 Subject: [PATCH 09/15] updates nokogiri --- Gemfile.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Gemfile.lock b/Gemfile.lock index 1dac847f9..34cdd7bfd 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -20,7 +20,7 @@ GEM mini_portile2 (2.4.0) minitest (5.14.0) netrc (0.11.0) - nokogiri (1.10.7) + nokogiri (1.10.8) mini_portile2 (~> 2.4.0) rake (13.0.1) rest-client (2.0.2) From b9782ddabfbc266e600036c503c3a7ec477b0e4a Mon Sep 17 00:00:00 2001 From: Jessica Dussault Date: Tue, 11 Feb 2020 11:04:24 -0600 Subject: [PATCH 10/15] prevents authority / annotations from being sent as formats removes unnecessary arguments --- lib/datura/file_types/file_custom.rb | 12 ++++++------ 
lib/datura/parser_options/post.rb | 6 +++++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/lib/datura/file_types/file_custom.rb b/lib/datura/file_types/file_custom.rb index 63359ec41..28725d02b 100644 --- a/lib/datura/file_types/file_custom.rb +++ b/lib/datura/file_types/file_custom.rb @@ -8,8 +8,8 @@ class FileCustom < FileType def initialize(file_location, options) super(file_location, options) - @format = get_format(file_location) - @file = read_file(file_location) + @format = get_format + @file = read_file end def build_es_documents @@ -33,15 +33,15 @@ def build_es_documents docs.compact end - def get_format(file_location) + def get_format # assumes that the format is in the directory structure - File.dirname(file_location).split("/").last + File.dirname(@file_location).split("/").last end # NOTE: you will likely need to override this method # depending on the format in question - def read_file(file_location) - File.read(file_location) + def read_file + File.read(@file_location) end def subdocs diff --git a/lib/datura/parser_options/post.rb b/lib/datura/parser_options/post.rb index 44783c1c8..daa9b7408 100644 --- a/lib/datura/parser_options/post.rb +++ b/lib/datura/parser_options/post.rb @@ -23,7 +23,11 @@ def self.post_params # default to no restricted format options["format"] = nil opts.on( '-f', '--format [input]', 'Supported formats (csv, html, tei, vra, webs)') do |input| - if !%w[csv html tei vra webs].include?(input) + if %w[authority annotations].include?(input) + puts "'authority' and 'annotations' are invalid formats".red + puts "Please select a supported format or rename your custom format" + exit + elsif !%w[csv html tei vra webs].include?(input) puts "Caution: Requested custom format #{input}.".red puts "See FileCustom class for implementation instructions" end From e9a7b962cce7723734fe57ba5c50c767cc9d16a7 Mon Sep 17 00:00:00 2001 From: Jessica Dussault Date: Tue, 11 Feb 2020 11:05:12 -0600 Subject: [PATCH 11/15] lotsa docs on 
custom stuff turns out that there is no csv_to_es documentation but there WAS csv_to_solr documentation! changed link --- README.md | 3 + docs/2_customization/all_types.md | 6 +- docs/2_customization/custom_to_es.md | 159 +++++++++++++++++++++++++++ 3 files changed, 166 insertions(+), 2 deletions(-) create mode 100644 docs/2_customization/custom_to_es.md diff --git a/README.md b/README.md index 0b603ffd3..1a7e39192 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,9 @@ Welcome to this temporary documentation for Datura, a gem dedicated to transforming and posting data sources from CDRH projects. This gem is intended to be used with a collection containing TEI, VRA, CSVs, and more. +Looking for information about how to post documents? Check out the +[documentation for posting](/docs/3_manage/post.md). + ## Install diff --git a/docs/2_customization/all_types.md b/docs/2_customization/all_types.md index 604b588a4..deee371e9 100644 --- a/docs/2_customization/all_types.md +++ b/docs/2_customization/all_types.md @@ -5,11 +5,13 @@ There are a number of ways you can customize the transformations. Please refer ### To Elasticsearch - [XML based (HTML / TEI / VRA / webs (Web Scraped HTML))](xml_to_es.md) -- [CSV](csv_to_es.md) +- CSV (Pending) +- [Custom Formats](custom_to_es.md) (those which Datura does not support but which a collection may need) ### To Solr / HTML -- Pending docs TODO +- Pending docs for most formats TODO +- [CSV](csv_to_solr.md) ### To IIIF diff --git a/docs/2_customization/custom_to_es.md b/docs/2_customization/custom_to_es.md new file mode 100644 index 000000000..351ea2e43 --- /dev/null +++ b/docs/2_customization/custom_to_es.md @@ -0,0 +1,159 @@ +# Custom Formats to Elasticsearch + +Datura provides minimal support for formats other than TEI, VRA, +HTML, and CSV through basic infrastructure to support overrides. 
+ +## The Basics + +If you want to add a custom format such as YAML, XLS spreadsheets, or if you +want to add some highly customized version of HTML or CSV in addition to an +existing batch of CSVs, you need to create a directory in source with a unique name. + +*The name you select should not be `authority` or `annotations`*. Those names +are reserved for projects which require authority files such as gazetteers and +scholarly notes about items. + +Let's say you need to index `.txt` files. Once you have created the directory +`source/txt` and populated it with a few files, you can run the Datura scripts +with: + +``` +post -f txt +``` + +That will start off the process of grabbing the files and reading them. +Unfortunately, Datura has no idea what sort of format to prepare for, nor +how many items you might need per format (for example, a PDF might be one item +per file while a tab-separated doc could be dozens or hundreds per file). + +Additionally, once Datura reads in a file, it doesn't know how or what +information to extract, so it looks like it's time to start writing your own +code! + +## Reading Your Format and Prepping for Launch + +### read_file + +In [file_custom.rb](/lib/datura/file_types/file_custom.rb), Datura reads in a +file as text and makes a new CustomToEs object from it. You may wish to +override the following to accommodate your format: + +``` +class FileCustom < FileType + def read_file + File.read(@file_location) + end +end +``` + +Currently, this is just straight up attempting to read a file's text. However, +if you are working with XML / HTML, JSON, CSV, YAML, etc, there is likely a +better, format-specific parser that will give you more control.
For example, +you might change `read_file` to: + +``` +# note: may need to require libraries / modules +require "yaml" + +class FileCustom < FileType + def read_file + YAML.load_file(@file_location) + end +end +``` + +### subdocs + +The next thing you will need to address if your format needs to be split into +multiple documents (such as personography files, spreadsheets, database dumps, +etc), is how to split up a file. By default, Datura assumes your file is one +item. If that is not the case, override `subdocs`: + +``` +def subdocs + Array(@file) +end +``` + +Change that to something which will return an array of items. For example, from +our YAML example, you might have: + +``` +def subdocs + @file["texts"] +end +``` +Or for an XML file: +``` +def subdocs + @file.xpath("//grouping") +end +``` + +### build_es_documents + +You're almost done with `file_custom.rb`. You just need to kick off a class +that will handle the transformation per sub-document. For simplicity's sake, if +this is a totally new format that Elasticsearch hasn't seen before, I recommend +leaving this method alone. You can move onto the next step, +[CustomToEs](#customtoes). + +If you want to try to piggyback off of an existing Datura class, then you may +need to override this method. Instead of calling `CustomToEs.new()` in it, you +would instead need to add a `require_relative` path at the top of the file to +your new class, and then call `YournewclassToEs.new()` from `build_es_documents`. + +In your new class, you could presumably do something like + +``` +class YournewclassToEs < XmlToEs + # now you have access to XmlToEs helpers for xpaths, etc +end +``` + +## CustomToEs + +The files in the [custom_to_es](/lib/datura/to_es/custom_to_es) directory and +[custom_to_es.rb](/lib/datura/to_es/custom_to_es.rb) give you the basic +structure you need to create your own version of these files. 
Since +Datura has no way of knowing what format might come its way, the majority of the +methods in `custom_to_es/fields.rb` are empty. + +The only thing you **MUST** override is `get_id`. + +Create a file in your overrides directory called `custom_to_es.rb` and add the +following: + +``` +class CustomToEs + + def get_id + # include code here that returns an id + # it could be the @filename(false) to get a filename without extension + # or it could be `@item["identifier"]` to get the value of a column, etc + + # you may want to prepend a collection abbreviation to your id, like + # "nei.#{some_value}" + end + +end +``` + +You can also add preprocessing or postprocessing here by overriding `create_json`. + +It is expected that you will override most of the methods in `fields.rb`. For +example, you might set a category like: + +``` +def category + # your code here, referencing @item if necessary +end +``` + +One more note: due to how `CustomToEs` is created, it is expecting a subdoc +and the original file. This is because it accommodates something like a +personography file, where you may want to deal with an individual person as +`@item` but need to reference `@file` to get information about the repository +or rightsholder, etc. If your format does not use sub-documents, then you +may simply refer to `@item` throughout and ignore `@file`, which should be +identical.
From b91eb75f8cb59ab3e7ef0e72efb28fd68d22e9fb Mon Sep 17 00:00:00 2001 From: Jessica Dussault Date: Thu, 23 Apr 2020 11:19:44 -0500 Subject: [PATCH 12/15] updates ruby and gems --- .ruby-version | 2 +- Gemfile.lock | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.ruby-version b/.ruby-version index 24ba9a38d..860487ca1 100644 --- a/.ruby-version +++ b/.ruby-version @@ -1 +1 @@ -2.7.0 +2.7.1 diff --git a/Gemfile.lock b/Gemfile.lock index 34cdd7bfd..e79532f80 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -20,7 +20,7 @@ GEM mini_portile2 (2.4.0) minitest (5.14.0) netrc (0.11.0) - nokogiri (1.10.8) + nokogiri (1.10.9) mini_portile2 (~> 2.4.0) rake (13.0.1) rest-client (2.0.2) @@ -29,7 +29,7 @@ GEM netrc (~> 0.8) unf (0.1.4) unf_ext - unf_ext (0.0.7.6) + unf_ext (0.0.7.7) PLATFORMS ruby From 08c00f5584a1fbd3a2a5d3b3213d2bd60febfffa Mon Sep 17 00:00:00 2001 From: Jessica Dussault Date: Thu, 23 Apr 2020 11:20:11 -0500 Subject: [PATCH 13/15] adds documentation to custom_to_es and fixes find / replace error --- docs/2_customization/custom_to_es.md | 13 ++++++++++++- lib/datura/to_es/webs_to_es/request.rb | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/docs/2_customization/custom_to_es.md b/docs/2_customization/custom_to_es.md index 351ea2e43..b9ef0f232 100644 --- a/docs/2_customization/custom_to_es.md +++ b/docs/2_customization/custom_to_es.md @@ -32,10 +32,21 @@ code! 
## Reading Your Format and Prepping for Launch +Just a note before we begin to clarify some of the variables that you may come +across while you're setting up your custom format: + +- `@file_location` -- the full path to the specific file being processed + - `/var/local/www/data/collections/source/[custom_format]/test.json` +- `@filename` -- the specific file without a path + - `test.json` +- `self.filename()` -- method specific to FileType and subclasses to get the filename +- `@file` -- very generically named, `@file` is the version of your file that has been read in by Ruby + - override the `read_file` method to make `@file` into an XML / JSON / YAML / etc object as needed by your custom class (see below) + ### read_file In [file_custom.rb](/lib/datura/file_types/file_custom.rb), Datura reads in a -file as text and makes a new CustomToEs object from it. You may wish to +file as text, which is stored as `@file`, and makes a new CustomToEs object from it. You may wish to override the following to accommodate your format: ``` diff --git a/lib/datura/to_es/webs_to_es/request.rb b/lib/datura/to_es/webs_to_es/request.rb index 95e358de6..330795d28 100644 --- a/lib/datura/to_es/webs_to_es/request.rb +++ b/lib/datura/to_es/webs_to_es/request.rb @@ -1,7 +1,7 @@ class WebsToEs < XmlToEs # please refer to generic es_request.rb file - # and override methods specific to TEI transformation here + # and override methods specific to Web Scraped HTML transformation here # project specific overrides should go in the COLLECTION's overrides!
end From 6c8a72031b65b42c2512a48604753e418a07a6d4 Mon Sep 17 00:00:00 2001 From: Jessica Dussault Date: Thu, 23 Apr 2020 11:20:41 -0500 Subject: [PATCH 14/15] adds clarifying instructions for installation readme --- README.md | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 1a7e39192..2aea9e408 100644 --- a/README.md +++ b/README.md @@ -5,16 +5,31 @@ Welcome to this temporary documentation for Datura, a gem dedicated to transform Looking for information about how to post documents? Check out the [documentation for posting](/docs/3_manage/post.md). -## Install +## Install / Set Up Data Repo +Check that Ruby is installed, preferably 2.7.x or up. -Gemfile: +If your project already has a Gemfile, add the `gem "datura"` line. If not, create a new directory and add a file named `Gemfile` (no extension). ``` -gem "datura", git: "https://github.com/CDRH/data.git", branch: "datura" +source "https://rubygems.org" + +# fill in the latest available release for the tag +gem "datura", git: "https://github.com/CDRH/datura.git", tag: "v0.0.0" ``` -Next, install saxon as a system wide executable. [Saxon setup documentation](docs/4_developers/saxon.md). +If this is the first datura repository on your machine, install saxon as a system wide executable. [Saxon setup documentation](docs/4_developers/saxon.md). + +Then, in the directory with the Gemfile, run the following: + +``` +gem install bundler +bundle install + +bundle exec setup +``` + +The last step should add files and some basic directories. Have a look at the [setup instructions](/docs/1_setup/collection_setup.md) to learn how to add your files and start working with the data! 
## Local Development @@ -31,21 +46,17 @@ Then in your repo you can run: ``` bundle install +# create the gem package if the above doesn't work +gem install --local path/to/local/datura/pkg/datura-0.x.x.gem ``` -If for some reason that is not working, you can instead run the following each time you make a change in datura: +You will need to recreate your gem package for some changes you make in Datura. From the DATURA directory, NOT your data repo directory, run: ``` bundle exec rake install ``` -then from the collection (sub in the correct version): - -``` -gem install --local path/to/local/datura/pkg/datura-0.1.2.gem -``` - -Note: You may need to delete your `scripts/.xslt-datura` folder as well. +Note: You may also need to delete your `scripts/.xslt-datura` folder if you are making changes to the default Datura scripts. ## First Steps From 224bb3e16d9a3fc1a3916edcbb223ee07d417c13 Mon Sep 17 00:00:00 2001 From: Jessica Dussault Date: Thu, 23 Apr 2020 11:20:57 -0500 Subject: [PATCH 15/15] bumps datura version for new release --- lib/datura/version.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/datura/version.rb b/lib/datura/version.rb index 681321e18..4f5a79e19 100644 --- a/lib/datura/version.rb +++ b/lib/datura/version.rb @@ -1,3 +1,3 @@ module Datura - VERSION = "0.1.5" + VERSION = "0.1.6" end