Skip to content

Commit

Permalink
Merge pull request #627 from sul-dlss/language-extraction
Browse files Browse the repository at this point in the history
Refactor language extraction macros, fixes #617
  • Loading branch information
jmartin-sul authored Nov 19, 2020
2 parents f500178 + d35e758 commit 4acb9b1
Show file tree
Hide file tree
Showing 17 changed files with 62 additions and 271 deletions.
92 changes: 6 additions & 86 deletions lib/macros/language_extraction.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,81 +20,17 @@ module LanguageExtraction
private_constant :TEI_LOWER_PREFIX

# Returns the value extracted by 'to_field' reformatted as a hash with accompanying BCP47 language code.
# Should only be used when metadata is known to be either Arabic in Arabic script or none.
# Any other values will not parse correctly.
# Should only be used to differentiate between an Arabic script language '-Arab' and a Latin script
# language '-Latn'.
# @return [Proc] a proc that traject can call for each record
# @example
# naive_language_extractor => {'ar-Arab': ['من كتب محمد بن محمد الكبسي. لقطة رقم (1).']}
def arabic_or_none
# arabic_script_lang_or_default('ar-Arab', 'en') => {'ar-Arab': ['من كتب محمد بن محمد الكبسي. لقطة رقم (1).']}
def arabic_script_lang_or_default(arabic_script_lang, default)
lambda do |_record, accumulator|
extracted_string = accumulator[0]
if extracted_string
script = extracted_string.match?(/[ضصثقفغعهخحمنتالبيسشظطذدزرو]/) ? 'ar-Arab' : 'none'
accumulator.replace([{ language: script.to_s, values: [extracted_string] }])
end
end
end

# Returns the value extracted by 'to_field' reformatted as a hash with accompanying BCP47 language code.
# Should only be used when metadata is known to be either Arabic in Arabic script or an
# undetermined language in Latin script ('und-Latn').
# Any other values will not parse correctly.
# @return [Proc] a proc that traject can call for each record
# @example
# arabic_or_und_latn => {'ar-Arab': ['من كتب محمد بن محمد الكبسي. لقطة رقم (1).']}
def arabic_or_und_latn
  # Builds the traject step: the first accumulated value is wrapped in a
  # { language:, values: } hash tagged 'ar-Arab' or 'und-Latn'.
  lambda do |_record, accumulator|
    extracted_string = accumulator.first
    # Nothing was extracted: leave the accumulator untouched and return nil.
    next unless extracted_string

    # \p{Arabic} matches any character of the Arabic script, so letters such as
    # ج, ك, ة and ى are detected as well as the more common ones.
    language = extracted_string.match?(/\p{Arabic}/) ? 'ar-Arab' : 'und-Latn'
    # Array#replace mutates in place and returns the accumulator itself.
    accumulator.replace([{ language: language, values: [extracted_string] }])
  end
end

# Returns the value extracted by 'to_field' reformatted as a hash with accompanying BCP47 language code.
# Should only be used when metadata is known to be either Arabic in Arabic script or English.
# Any other values will not parse correctly.
# @return [Proc] a proc that traject can call for each record
# @example
# naive_language_extractor => {'ar-Arab': ['من كتب محمد بن محمد الكبسي. لقطة رقم (1).']}
def naive_language_extractor
  # Builds the traject step: the first accumulated value is wrapped in a
  # { language:, values: } hash tagged 'ar-Arab' or 'en'.
  lambda do |_record, accumulator|
    extracted_string = accumulator.first
    # Nothing was extracted: leave the accumulator untouched and return nil.
    next unless extracted_string

    # \p{Arabic} matches any character of the Arabic script, so letters such as
    # ج, ك, ة and ى are detected as well as the more common ones.
    language = extracted_string.match?(/\p{Arabic}/) ? 'ar-Arab' : 'en'
    # Array#replace mutates in place and returns the accumulator itself.
    accumulator.replace([{ language: language, values: [extracted_string] }])
  end
end

# Returns the value extracted by 'to_field' reformatted as a hash with accompanying BCP47 language code.
# Should only be used when metadata is known to be either Persian in Arabic script or an unpredictable language.
# Any other values will not parse correctly.
# @return [Proc] a proc that traject can call for each record
# @example
# persian_or_none => {'fa-Arab': ['نظامنامۀ مقياسات']}
def persian_or_none
  # Builds the traject step: the first accumulated value is wrapped in a
  # { language:, values: } hash tagged 'fa-Arab' or 'none'.
  lambda do |_record, accumulator|
    extracted_string = accumulator.first
    # Nothing was extracted: leave the accumulator untouched and return nil.
    next unless extracted_string

    # \p{Arabic} matches any character of the Arabic script, which includes the
    # Persian-specific letters پ, چ, ژ and گ as well as the shared Arabic ones.
    language = extracted_string.match?(/\p{Arabic}/) ? 'fa-Arab' : 'none'
    # Array#replace mutates in place and returns the accumulator itself.
    accumulator.replace([{ language: language, values: [extracted_string] }])
  end
end

# Returns the value extracted by 'to_field' reformatted as a hash with accompanying BCP47 language code.
# Should only be used when metadata is known to be either Persian in Arabic script or an unpredictable language.
# Any other values will not parse correctly.
# @return [Proc] a proc that traject can call for each record
# @example
# persian_or_und_latn => {'fa-Arab': ['نظامنامۀ مقياسات']}
def persian_or_und_latn
# Returns a lambda that rewrites the accumulator in place based on its first value.
lambda do |_record, accumulator|
extracted_string = accumulator[0]
# When nothing was extracted the accumulator is left untouched.
if extracted_string
# Tag as 'fa-Arab' when any of the listed Arabic-script letters is present, otherwise 'und-Latn'.
script = extracted_string.match?(/[ضصثقفغعهخحمنتالبيسشظطذدزرو]/) ? 'fa-Arab' : 'und-Latn'
accumulator.replace([{ language: script.to_s, values: [extracted_string] }])
# NOTE(review): `arabic_script_lang` and `default` are neither parameters nor locals of this
# method; the two lines below look like they belong to arabic_script_lang_or_default and were
# merged in here by the diff view — confirm before relying on this block.
lang_code = extracted_string.match?(/[ضصثقفغعهخحمنتالبيسشظطذدزرو]/) ? arabic_script_lang : default
accumulator.replace([{ language: lang_code, values: [extracted_string] }])
end
end
end
Expand All @@ -113,21 +49,5 @@ def tei_lower_resource_language
accumulator.replace([{ language: TO_BCP47[:"#{language}"], values: [extracted_string] }]) if extracted_string
end
end

# Returns the value extracted by 'to_field' reformatted as a hash with accompanying BCP47 language code.
# Should only be used when metadata is known to be either an undetermined language in Arabic script
# ('und-Arab') or Syriac ('syc').
# Any other values will not parse correctly.
# @return [Proc] a proc that traject can call for each record
# @example
# und_arabic_or_syriac => {'und-Arab': ['من كتب محمد بن محمد الكبسي. لقطة رقم (1).']}
def und_arabic_or_syriac
  # Builds the traject step: the first accumulated value is wrapped in a
  # { language:, values: } hash tagged 'und-Arab' or 'syc'.
  lambda do |_record, accumulator|
    extracted_string = accumulator.first
    # Nothing was extracted: leave the accumulator untouched and return nil.
    next unless extracted_string

    # \p{Arabic} matches any character of the Arabic script, so letters such as
    # ج, ك, ة and ى are detected as well as the more common ones. Syriac-script
    # text does not match and therefore falls through to 'syc'.
    language = extracted_string.match?(/\p{Arabic}/) ? 'und-Arab' : 'syc'
    # Array#replace mutates in place and returns the accumulator itself.
    accumulator.replace([{ language: language, values: [extracted_string] }])
  end
end
end
end
31 changes: 0 additions & 31 deletions spec/lib/traject/macros/language_extraction.rb

This file was deleted.

104 changes: 3 additions & 101 deletions spec/lib/traject/macros/language_extraction_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,111 +11,13 @@
end
let(:instance) { klass.new }

describe 'arabic_or_none' do
describe 'arabic_script_lang_or_default' do
context 'when extracted string contains Arabic characters' do
it 'returns the correct language value ar-Arab' do
it 'returns the correct script value arabic_script_lang' do
extracted_string = 'الولايات المتحدة الامريكيه'
callable = instance.arabic_or_none
callable = instance.arabic_script_lang_or_default('ar-Arab', 'en')
expect(callable.call(nil, [extracted_string])).to eq([{ language: 'ar-Arab', values: [extracted_string] }])
end
end

context 'when extracted string does not contain Arabic characters' do
it 'returns the default language of none' do
extracted_string = 'Some extracted string value'
callable = instance.arabic_or_none
expect(callable.call(nil, [extracted_string])).to eq([{ language: 'none', values: [extracted_string] }])
end
end
end

describe 'arabic_or_und_latn' do
context 'when extracted string contains Arabic characters' do
it 'returns the correct language value ar-Arab' do
extracted_string = 'الولايات المتحدة الامريكيه'
callable = instance.arabic_or_und_latn
expect(callable.call(nil, [extracted_string])).to eq([{ language: 'ar-Arab', values: [extracted_string] }])
end
end

context 'when extracted string does not contain Arabic characters' do
it 'returns the default language of und-Latn' do
extracted_string = 'Some extracted string value'
callable = instance.arabic_or_und_latn
expect(callable.call(nil, [extracted_string])).to eq([{ language: 'und-Latn', values: [extracted_string] }])
end
end
end

describe 'naive_language_extractor' do
context 'when extracted string contains Arabic characters' do
it 'returns the correct language value ar-Arab' do
extracted_string = 'الولايات المتحدة الامريكيه'
callable = instance.naive_language_extractor
expect(callable.call(nil, [extracted_string])).to eq([{ language: 'ar-Arab', values: [extracted_string] }])
end
end

context 'when extracted string does not contain Arabic characters' do
it 'returns the default language of en' do
extracted_string = 'Some extracted string value'
callable = instance.naive_language_extractor
expect(callable.call(nil, [extracted_string])).to eq([{ language: 'en', values: [extracted_string] }])
end
end
end

describe 'persian_or_none' do
context 'when extracted string contains Arabic characters' do
it 'returns the correct language value fa-Arab' do
extracted_string = 'رساله معىنيه در علم هيئت'
callable = instance.persian_or_none
expect(callable.call(nil, [extracted_string])).to eq([{ language: 'fa-Arab', values: [extracted_string] }])
end
end

context 'when extracted string does not contain Arabic characters' do
it 'returns the default language of none' do
extracted_string = 'Some extracted string value'
callable = instance.persian_or_none
expect(callable.call(nil, [extracted_string])).to eq([{ language: 'none', values: [extracted_string] }])
end
end
end

describe 'persian_or_und_latn' do
context 'when extracted string contains Arabic characters' do
it 'returns the correct language value fa-Arab' do
extracted_string = 'الولايات المتحدة الامريكيه'
callable = instance.persian_or_und_latn
expect(callable.call(nil, [extracted_string])).to eq([{ language: 'fa-Arab', values: [extracted_string] }])
end
end

context 'when extracted string does not contain Arabic characters' do
it 'returns the default language of und-Latn' do
extracted_string = 'Some extracted string value'
callable = instance.persian_or_und_latn
expect(callable.call(nil, [extracted_string])).to eq([{ language: 'und-Latn', values: [extracted_string] }])
end
end
end

describe 'und_arabic_or_syriac' do
context 'when extracted string contains Arabic characters' do
it 'returns the correct language value und-Arab' do
extracted_string = 'الولايات المتحدة الامريكيه'
callable = instance.und_arabic_or_syriac
expect(callable.call(nil, [extracted_string])).to eq([{ language: 'und-Arab', values: [extracted_string] }])
end
end

context 'when extracted string does not contain Arabic characters' do
it 'returns the default language of syc' do
extracted_string = 'Some extracted string value'
callable = instance.und_arabic_or_syriac
expect(callable.call(nil, [extracted_string])).to eq([{ language: 'syc', values: [extracted_string] }])
end
end
end
end
2 changes: 1 addition & 1 deletion traject_configs/aub_aco_config.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# frozen_string_literal: true

# Cho Required
to_field 'cho_title', xpath_title_or_desc("#{PREFIX}/dc:title", "#{PREFIX}/dc:description[1]"), arabic_or_und_latn, default('Untitled', 'بدون عنوان')
to_field 'cho_title', xpath_title_or_desc("#{PREFIX}/dc:title", "#{PREFIX}/dc:description[1]"), arabic_script_lang_or_default('ar-Arab', 'und-Latn'), default('Untitled', 'بدون عنوان')

# Cho Other
to_field 'cho_contributor', extract_oai('dc:contributor'),
Expand Down
4 changes: 2 additions & 2 deletions traject_configs/aub_poha_config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@

# Cho Required
to_field 'id', extract_xpath('/*/identifier'), strip
to_field 'cho_title', extract_poha('/*/dc:title'), strip, naive_language_extractor
to_field 'cho_title', extract_poha('/*/dc:title[2]'), strip, naive_language_extractor
to_field 'cho_title', extract_poha('/*/dc:title'), strip, arabic_script_lang_or_default('ar-Arab', 'en')
to_field 'cho_title', extract_poha('/*/dc:title[2]'), strip, arabic_script_lang_or_default('ar-Arab', 'en')

# Cho Other
to_field 'cho_contributor', extract_poha('/*/dc:interviewee'), strip, prepend('Interviewee: '), lang('en')
Expand Down
2 changes: 1 addition & 1 deletion traject_configs/bodleian_arabic_config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
to_field 'id', extract_json('.rendering'),
strip,
gsub('https://digital.bodleian.ox.ac.uk/inquire/p/', '')
to_field 'cho_title', extract_json('.title'), strip, arabic_or_und_latn
to_field 'cho_title', extract_json('.title'), strip, arabic_script_lang_or_default('ar-Arab', 'und-Latn')

# Cho Other
to_field 'cho_creator', extract_json('.author'), strip, lang('en')
Expand Down
2 changes: 1 addition & 1 deletion traject_configs/bodleian_persian_config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
to_field 'id', extract_json('.rendering'),
strip,
gsub('https://digital.bodleian.ox.ac.uk/inquire/p/', '')
to_field 'cho_title', extract_json('.title'), strip, persian_or_und_latn
to_field 'cho_title', extract_json('.title'), strip, arabic_script_lang_or_default('fa-Arab', 'und-Latn')

# Cho Other
to_field 'cho_creator', extract_json('.author'), strip
Expand Down
4 changes: 2 additions & 2 deletions traject_configs/loc_el_taher_config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@

# Cho Required
to_field 'id', extract_json('.id')
to_field 'cho_title', extract_json('.item.title'), strip, gsub('/', ''), arabic_or_none
to_field 'cho_title', extract_json('.item.other_title[0]'), strip, gsub('/', ''), arabic_or_none
to_field 'cho_title', extract_json('.item.title'), strip, gsub('/', ''), arabic_script_lang_or_default('ar-Arab', 'und-Latn')
to_field 'cho_title', extract_json('.item.other_title[0]'), strip, gsub('/', ''), arabic_script_lang_or_default('ar-Arab', 'und-Latn')

# Cho Other
to_field 'cho_contributor', extract_json('.item.contributors[0]'), strip, lang('en')
Expand Down
4 changes: 2 additions & 2 deletions traject_configs/loc_persian_config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@

# Cho Required
to_field 'id', extract_json('.id')
to_field 'cho_title', extract_json('.item.title'), strip, persian_or_none
to_field 'cho_title', extract_json('.item.other_title[0]'), strip, persian_or_none
to_field 'cho_title', extract_json('.item.title'), strip, arabic_script_lang_or_default('fa-Arab', 'und-Latn')
to_field 'cho_title', extract_json('.item.other_title[0]'), strip, arabic_script_lang_or_default('fa-Arab', 'und-Latn')

# Cho Other
to_field 'cho_contributor', extract_json('item.contributors[0]'), strip, lang('en')
Expand Down
12 changes: 6 additions & 6 deletions traject_configs/openn_mmw_0001_config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,12 @@
tei_lower_resource_language

# CHO Other
to_field 'cho_creator', extract_tei("#{MS_DESC}/#{MS_CONTENTS}/#{MS_ITEM}/tei:author[1]"), first_only, strip, split(' '), first_only, strip, naive_language_extractor
to_field 'cho_creator', extract_tei("#{MS_DESC}/#{MS_CONTENTS}/#{MS_ITEM}/tei:author[1]"), first_only, strip, split(' '), last, strip, naive_language_extractor
to_field 'cho_creator', extract_tei("#{MS_DESC}/#{MS_CONTENTS}/#{MS_ITEM}/tei:author[2]"), first_only, strip, split(' '), first_only, strip, naive_language_extractor
to_field 'cho_creator', extract_tei("#{MS_DESC}/#{MS_CONTENTS}/#{MS_ITEM}/tei:author[2]"), first_only, strip, split(' '), last, strip, naive_language_extractor
to_field 'cho_creator', extract_tei("#{MS_DESC}/#{MS_CONTENTS}/#{MS_ITEM}/tei:author[3]"), first_only, strip, split(' '), first_only, strip, naive_language_extractor
to_field 'cho_creator', extract_tei("#{MS_DESC}/#{MS_CONTENTS}/#{MS_ITEM}/tei:author[3]"), first_only, strip, split(' '), last, strip, naive_language_extractor
to_field 'cho_creator', extract_tei("#{MS_DESC}/#{MS_CONTENTS}/#{MS_ITEM}/tei:author[1]"), first_only, strip, split(' '), first_only, strip, arabic_script_lang_or_default('ar-Arab', 'en')
to_field 'cho_creator', extract_tei("#{MS_DESC}/#{MS_CONTENTS}/#{MS_ITEM}/tei:author[1]"), first_only, strip, split(' '), last, strip, arabic_script_lang_or_default('ar-Arab', 'en')
to_field 'cho_creator', extract_tei("#{MS_DESC}/#{MS_CONTENTS}/#{MS_ITEM}/tei:author[2]"), first_only, strip, split(' '), first_only, strip, arabic_script_lang_or_default('ar-Arab', 'en')
to_field 'cho_creator', extract_tei("#{MS_DESC}/#{MS_CONTENTS}/#{MS_ITEM}/tei:author[2]"), first_only, strip, split(' '), last, strip, arabic_script_lang_or_default('ar-Arab', 'en')
to_field 'cho_creator', extract_tei("#{MS_DESC}/#{MS_CONTENTS}/#{MS_ITEM}/tei:author[3]"), first_only, strip, split(' '), first_only, strip, arabic_script_lang_or_default('ar-Arab', 'en')
to_field 'cho_creator', extract_tei("#{MS_DESC}/#{MS_CONTENTS}/#{MS_ITEM}/tei:author[3]"), first_only, strip, split(' '), last, strip, arabic_script_lang_or_default('ar-Arab', 'en')
to_field 'cho_date', extract_tei("#{MS_DESC}/#{MS_ORIGIN}/tei:origdate"), strip, lang('en')
to_field 'cho_date_range_norm', openn_gregorian_range
to_field 'cho_date_range_hijri', openn_gregorian_range, hijri_range
Expand Down
12 changes: 6 additions & 6 deletions traject_configs/openn_mmw_0002_config.rb
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,12 @@
tei_lower_resource_language

# CHO Other
to_field 'cho_creator', extract_tei("#{MS_DESC}/#{MS_CONTENTS}/#{MS_ITEM}/tei:author[1]"), first_only, strip, split(' '), first_only, strip, naive_language_extractor
to_field 'cho_creator', extract_tei("#{MS_DESC}/#{MS_CONTENTS}/#{MS_ITEM}/tei:author[1]"), first_only, strip, split(' '), last, strip, naive_language_extractor
to_field 'cho_creator', extract_tei("#{MS_DESC}/#{MS_CONTENTS}/#{MS_ITEM}/tei:author[2]"), first_only, strip, split(' '), first_only, strip, naive_language_extractor
to_field 'cho_creator', extract_tei("#{MS_DESC}/#{MS_CONTENTS}/#{MS_ITEM}/tei:author[2]"), first_only, strip, split(' '), last, strip, naive_language_extractor
to_field 'cho_creator', extract_tei("#{MS_DESC}/#{MS_CONTENTS}/#{MS_ITEM}/tei:author[3]"), first_only, strip, split(' '), first_only, strip, naive_language_extractor
to_field 'cho_creator', extract_tei("#{MS_DESC}/#{MS_CONTENTS}/#{MS_ITEM}/tei:author[3]"), first_only, strip, split(' '), last, strip, naive_language_extractor
to_field 'cho_creator', extract_tei("#{MS_DESC}/#{MS_CONTENTS}/#{MS_ITEM}/tei:author[1]"), first_only, strip, split(' '), first_only, strip, arabic_script_lang_or_default('ar-Arab', 'en')
to_field 'cho_creator', extract_tei("#{MS_DESC}/#{MS_CONTENTS}/#{MS_ITEM}/tei:author[1]"), first_only, strip, split(' '), last, strip, arabic_script_lang_or_default('ar-Arab', 'en')
to_field 'cho_creator', extract_tei("#{MS_DESC}/#{MS_CONTENTS}/#{MS_ITEM}/tei:author[2]"), first_only, strip, split(' '), first_only, strip, arabic_script_lang_or_default('ar-Arab', 'en')
to_field 'cho_creator', extract_tei("#{MS_DESC}/#{MS_CONTENTS}/#{MS_ITEM}/tei:author[2]"), first_only, strip, split(' '), last, strip, arabic_script_lang_or_default('ar-Arab', 'en')
to_field 'cho_creator', extract_tei("#{MS_DESC}/#{MS_CONTENTS}/#{MS_ITEM}/tei:author[3]"), first_only, strip, split(' '), first_only, strip, arabic_script_lang_or_default('ar-Arab', 'en')
to_field 'cho_creator', extract_tei("#{MS_DESC}/#{MS_CONTENTS}/#{MS_ITEM}/tei:author[3]"), first_only, strip, split(' '), last, strip, arabic_script_lang_or_default('ar-Arab', 'en')
to_field 'cho_creator', extract_tei("#{MS_DESC}/#{MS_CONTENTS}/#{MS_ITEM}/tei:author/tei:persname[@type='authority']"), strip, lang('en')
to_field 'cho_creator', extract_tei("#{MS_DESC}/#{MS_CONTENTS}/#{MS_ITEM}/tei:author/tei:persname[@type='vernacular']"), strip, lang('ar-Arab')
to_field 'cho_date', extract_tei("#{MS_DESC}/#{MS_ORIGIN}/tei:origdate"), strip, lang('en')
Expand Down
Loading

0 comments on commit 4acb9b1

Please sign in to comment.