Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable specification of a config file, and generate hocr output if option set #92

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion docsplit.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,6 @@ Gem::Specification.new do |s|

s.files = Dir['build/**/*', 'lib/**/*', 'bin/*', 'vendor/**/*',
'docsplit.gemspec', 'LICENSE', 'README']
end

s.add_dependency "nokogiri"
end
5 changes: 5 additions & 0 deletions lib/docsplit.rb
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,11 @@ def self.clean_text(text)
TextCleaner.new.clean(text)
end

# Utility method to clean OCR'd text in hOCR output format.
def self.clean_hocr(html)
  # Delegate to a fresh TextCleaner; returns whatever its clean_hocr returns.
  cleaner = TextCleaner.new
  cleaner.clean_hocr(html)
end

private

# Normalize a value in an options hash for the command line.
Expand Down
5 changes: 4 additions & 1 deletion lib/docsplit/command_line.rb
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@ def parse_options
opts.on('--[no-]ocr', 'force OCR to be used, or disable OCR') do |o|
@options[:ocr] = o
end
opts.on('-c', '--config [FILE]', 'use the specified config file') do |c|
@options[:config] = c
end
opts.on('--no-clean', 'disable cleaning of OCR\'d text') do |c|
@options[:clean] = false
end
Expand Down Expand Up @@ -119,4 +122,4 @@ def parse_options

end

end
end
36 changes: 27 additions & 9 deletions lib/docsplit/text_cleaner.rb
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
require 'strscan'
require 'nokogiri'

module Docsplit

Expand Down Expand Up @@ -32,16 +33,8 @@ class TextCleaner
REPEATED = /(\b\S{1,2}\s+)(\S{1,3}\s+){5,}(\S{1,2}\s+)/
SINGLETONS = /^[AaIi]$/

# For the time being, `clean` uses the regular StringScanner, and not the
# multibyte-aware version, coercing to ASCII first.
def clean(text)
if String.method_defined?(:encode)
text.encode!('ascii', :invalid => :replace, :undef => :replace, :replace => '?')
else
require 'iconv' unless defined?(Iconv)
text = Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first
end

text = get_conversion_method.call(text)
scanner = StringScanner.new(text)
cleaned = []
spaced = false
Expand All @@ -60,6 +53,31 @@ def clean(text)
end
end

# When cleaning hOCR output, we follow a slightly simplified cleaning
# heuristic. Simply look at the individual word embedded within the
# XML text node that is a child of the XML element with the class
# attribute set to 'ocrx_word'. If it is garbage, delete that node.
def clean_hocr(xhtml)
  # Lambda that coerces a word to ASCII before the garbage check.
  convert = get_conversion_method
  xml = Nokogiri::XML(xhtml)
  xml.css('.ocrx_word').each do |elt|
    # Judge each word element by its OWN text. Re-querying the document and
    # taking `.last` here would test the final word on every iteration, so
    # elements would be kept or dropped based on the wrong word.
    word = elt.xpath(".//text()").text
    elt.remove if garbage(convert.call(word))
  end
  # Return the pruned document serialized back to a string.
  xml.to_s
end

# Build the text-to-ASCII conversion shared by `clean` and `clean_hocr`.
# For the time being, cleaning uses the regular StringScanner, and not the
# multibyte-aware version, so text is coerced to ASCII first.
def get_conversion_method
  # Rubies without String#encode fall back to Iconv, loaded only on demand.
  unless String.method_defined?(:encode)
    require 'iconv' unless defined?(Iconv)
    return lambda { |text| Iconv.iconv('ascii//translit//ignore', 'utf-8', text).first }
  end

  # Non-mutating encode: invalid or unmappable characters become '?'.
  lambda do |text|
    text.encode('ascii', :invalid => :replace, :undef => :replace, :replace => '?')
  end
end

# Is a given word OCR garbage?
def garbage(w)
acronym = w =~ ACRONYM
Expand Down
93 changes: 84 additions & 9 deletions lib/docsplit/text_extractor.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
require 'nokogiri'

module Docsplit

# Delegates to **pdftotext** and **tesseract** in order to extract text from
Expand All @@ -21,6 +23,10 @@ class TextExtractor

MIN_TEXT_PER_PAGE = 100 # in bytes

# hOCR nesting levels, outermost first. Each entry pairs the CSS selector
# for that level with the separator emitted between sibling sections when
# the plain-text output is generated. Frozen so the shared constant cannot
# be modified by accident.
HOCR_SECTIONS = [
  ['.ocr_par',   "\n\n"],
  ['.ocr_line',  "\n"],
  ['.ocrx_word', " "]
].freeze

# Create an extractor with an empty @pages_to_ocr list.
# NOTE(review): presumably this collects the pages that still need OCR; the
# code that fills it is not visible in this diff -- confirm.
def initialize
@pages_to_ocr = []
end
Expand Down Expand Up @@ -66,16 +72,20 @@ def extract_from_ocr(pdf, pages)
escaped_tiff = ESCAPE[tiff]
file = "#{base_path}_#{page}"
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle +adjoin #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf}[#{page - 1}] #{escaped_tiff} 2>&1"
run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} 2>&1"
clean_text(file + '.txt') if @clean_ocr
run "tesseract #{escaped_tiff} #{ESCAPE[file]} -l #{@language} #{@config} 2>&1"
run "cp #{escaped_tiff} #{base_path}_#{page}.tif" if @gen_hocr
clean_ocr(file) if @clean_ocr
generate_text_and_annotate(file) if @gen_hocr
FileUtils.remove_entry_secure tiff
end
else
tiff = "#{tempdir}/#{@pdf_name}.tif"
escaped_tiff = ESCAPE[tiff]
run "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert -despeckle #{MEMORY_ARGS} #{OCR_FLAGS} #{escaped_pdf} #{escaped_tiff} 2>&1"
run "tesseract #{escaped_tiff} #{base_path} -l #{@language} 2>&1"
clean_text(base_path + '.txt') if @clean_ocr
run "tesseract #{escaped_tiff} #{base_path} -l #{@language} #{@config} 2>&1"
run "cp #{escaped_tiff} #{base_path}.tif" if @gen_hocr
clean_ocr(base_path) if @clean_ocr
generate_text_and_annotate(base_path) if @gen_hocr
end
ensure
FileUtils.remove_entry_secure tempdir if File.exists?(tempdir)
Expand All @@ -84,15 +94,69 @@ def extract_from_ocr(pdf, pages)

private

def clean_text(file)
File.open(file, 'r+') do |f|
text = f.read
def clean_ocr(basename)
  # Clean the OCR output for `basename` in place: the '.html' file via
  # Docsplit.clean_hocr when hOCR output is enabled, otherwise the '.txt'
  # file via Docsplit.clean_text.
  hocr = @gen_hocr
  path = basename + (hocr ? '.html' : '.txt')
  File.open(path, 'r+') do |f|
    content = f.read
    # Run the cleaner BEFORE truncating, so that an exception raised while
    # cleaning leaves the original OCR output on disk instead of an empty file.
    cleaned = hocr ? Docsplit.clean_hocr(content) : Docsplit.clean_text(content)
    f.truncate(0)
    f.rewind
    f.write(cleaned)
  end
end

# When generating hOCR output, tesseract doesn't generate text output.
# This method will generate the text output, and also add the corresponding
# character position of the words back into the hOCR file as HTML data
# attributes.
def generate_text_and_annotate(basename)
  text_path = basename + '.txt'
  hocr_path = basename + '.html'

  File.open(text_path, 'w') do |text_out|
    File.open(hocr_path, 'r+') do |hocr|
      doc = Nokogiri::XML(hocr.read)

      generate_text_position(doc) do |chunk, offset, node|
        # Every chunk (word or separator) goes into the plain-text file.
        text_out.write(chunk)

        # Separators arrive without an element; only words get annotated
        # with their character range in the text output.
        next unless node
        node['data-start'] = offset
        node['data-stop' ] = offset + chunk.size
      end

      # Replace the hOCR file's contents with the annotated document.
      hocr.truncate(0)
      hocr.rewind
      hocr.write(doc.to_xml)
    end
  end
end

def generate_text_position(root, index=0, pos=0, &block)
  raise RuntimeError, "bad section list" if index >= HOCR_SECTIONS.size

  selector, separator = HOCR_SECTIONS[index]
  word_level = index == HOCR_SECTIONS.size - 1

  # Sections found under `root` at the current nesting level.
  nodes = root.css(selector)
  nodes.each do |node|
    if word_level
      # Innermost level (a word): emit its text together with the element
      # so the caller can annotate it.
      word = node.text
      block.call(word, pos, node) if block
      pos += word.size
    else
      # Not the innermost level yet, so descend one level.
      pos = generate_text_position(node, index + 1, pos, &block)
    end

    # Sections are 'joined' with this level's separator. The separator is
    # emitted without an element, since it is just generated text.
    if node != nodes.last
      block.call(separator, pos, nil) if block
      pos += separator.size
    end
  end

  # Character position just past everything emitted so far.
  pos
end

# Run an external process and raise an exception if it fails.
def run(command)
result = `#{command}`
Expand Down Expand Up @@ -123,8 +187,19 @@ def extract_options(options)
@forbid_ocr = options[:ocr] == false
@clean_ocr = !(options[:clean] == false)
@language = options[:language] || 'eng'
@gen_hocr = check_tesseract_config(options[:config])
@config = options[:config] || ''
end

def check_tesseract_config(config)
  # Decide whether the tesseract config file at `config` enables hOCR output.
  # Returns false when no config is given or the file has no
  # `tessedit_create_hocr` line; otherwise the LAST such line wins and
  # enables hOCR unless its value is "0".
  return false unless config

  # File.readlines closes the file itself; a bare File.open(...).grep
  # leaves the handle open until it is garbage-collected.
  settings = File.readlines(config).grep(/tessedit_create_hocr/)
  return false if settings.empty?

  settings.last.split[1] != "0"
end

end

end
end
28 changes: 28 additions & 0 deletions test/unit/test_extract_text.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
here = File.expand_path(File.dirname(__FILE__))
require File.join(here, '..', 'test_helper')
require 'fileutils'
require 'tmpdir'

class ExtractTextTest < Test::Unit::TestCase
Expand Down Expand Up @@ -38,6 +39,33 @@ def test_ocr_extraction
end
end

def test_hocr_extraction
  config = "#{OUTPUT}/config"
  FileUtils.mkdir_p(OUTPUT)

  begin
    # Create a config that enables hOCR output
    File.write(config, "tessedit_create_hocr 1")
    Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT, :config => config)
  ensure
    # Remove the config even when extraction raises, so it never pollutes
    # the tests below.
    FileUtils.rm_f(config)
  end

  files = []
  4.times do |i|
    file = "corrosion_#{i + 1}.txt"
    files.push(file)
    assert File.read(File.join(OUTPUT, file)).size > 1, "Expected that file with extracted text should have reasonable size"
    # This page does not need OCR, so no hOCR or TIFF output is expected for it.
    next if i == 2
    file = "corrosion_#{i + 1}.html"
    files.push(file)
    assert File.read(File.join(OUTPUT, file)).size > 1, "Expected that file with annotated html should have reasonable size"
    file = "corrosion_#{i + 1}.tif"
    files.push(file)
    assert File.read(File.join(OUTPUT, file)).size > 1, "Expected that tif file should have reasonable size"
  end
  assert_directory_contains(OUTPUT, files)
end

def test_ocr_extraction_in_mock_language
exception = assert_raise(Docsplit::ExtractionFailed) {Docsplit.extract_text('test/fixtures/corrosion.pdf', :pages => 'all', :output => OUTPUT, :language => "mock")}
assert exception.message.match("tessdata/mock"), "Expected problem with loading data for language 'mock'"
Expand Down