diff --git a/CHANGELOG.md b/CHANGELOG.md index 98283cbd8..0c48d6f1c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,13 @@ Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] Changelog up to date +## [v0.1.6](https://github.com/CDRH/datura/compare/v0.1.5...v0.1.6) - 2020-02-11 - WEBS HTML Object + +### Changed +- FileType Elasticsearch transform now has a swappable component when reading +XML-type files. Web scraping script altered to manipulate HTML instead of +XML object type. + ## [v0.1.5](https://github.com/CDRH/datura/compare/v0.1.4...v0.1.5) - 2020-02-03 - VRA to Solr ### Added diff --git a/lib/datura/common_xml.rb b/lib/datura/common_xml.rb index 02a931508..d4cc8351d 100644 --- a/lib/datura/common_xml.rb +++ b/lib/datura/common_xml.rb @@ -32,11 +32,17 @@ def self.convert_tags_in_string(text) return converted.xpath("//xml").inner_html end + def self.create_html_object(filepath, remove_ns=true) + file_html = File.open(filepath) { |f| Nokogiri::HTML(f, &:noblanks) } + file_html.remove_namespaces! if remove_ns + file_html + end + def self.create_xml_object(filepath, remove_ns=true) file_xml = File.open(filepath) { |f| Nokogiri::XML(f, &:noblanks) } # TODO is this a good idea? file_xml.remove_namespaces! 
if remove_ns - return file_xml + file_xml end # pass in a date and identify whether it should be before or after diff --git a/lib/datura/file_type.rb b/lib/datura/file_type.rb index 38ca0806e..6077dd8f2 100644 --- a/lib/datura/file_type.rb +++ b/lib/datura/file_type.rb @@ -42,6 +42,13 @@ def filename(ext=true) end end + # typically assumed to be an XML file, parsed as XML + # but in some cases (for example, web scraping) this needs + # to be overridden to parse HTML instead + def parse_markup_lang_file + CommonXml.create_xml_object(self.file_location) + end + def post_es(url=nil) url = url || "#{@options["es_path"]}/#{@options["es_index"]}" begin @@ -108,7 +115,7 @@ def print_solr def transform_es es_req = [] begin - file_xml = CommonXml.create_xml_object(self.file_location) + file_xml = parse_markup_lang_file # check if any xpaths hit before continuing results = file_xml.xpath(*subdoc_xpaths.keys) if results.length == 0 diff --git a/lib/datura/file_types/file_webs.rb b/lib/datura/file_types/file_webs.rb index 5fd6c7525..62c8dc5d9 100644 --- a/lib/datura/file_types/file_webs.rb +++ b/lib/datura/file_types/file_webs.rb @@ -10,6 +10,10 @@ def initialize(file_location, options) super(file_location, options) end + def parse_markup_lang_file + CommonXml.create_html_object(self.file_location) + end + def subdoc_xpaths { "/" => WebsToEs } end diff --git a/lib/datura/version.rb b/lib/datura/version.rb index 681321e18..4f5a79e19 100644 --- a/lib/datura/version.rb +++ b/lib/datura/version.rb @@ -1,3 +1,3 @@ module Datura - VERSION = "0.1.5" + VERSION = "0.1.6" end