From e90840dfa594dd3afa8dc40596d4fbfc9f5c495c Mon Sep 17 00:00:00 2001 From: eastxing Date: Wed, 14 Aug 2013 21:44:17 +0800 Subject: [PATCH 1/4] Extract images on windows platform bugfix Use set command to set "gm" environment variables on windows platform. --- lib/docsplit/image_extractor.rb | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb index 8c29bbc..9d91b8b 100755 --- a/lib/docsplit/image_extractor.rb +++ b/lib/docsplit/image_extractor.rb @@ -8,6 +8,12 @@ class ImageExtractor DEFAULT_FORMAT = :png DEFAULT_DENSITY = '150' + # Helper function to determine the OS + HOST_OS = (defined?("RbConfig") ? RbConfig : Config)::CONFIG['host_os'] + def windows? + !!HOST_OS.match(/mswin|msys|mingw|cygwin|bccwin|wince|emc/i) + end + # Extract a list of PDFs as rasterized page images, according to the # configuration in options. def extract(pdfs, options) @@ -37,12 +43,22 @@ def convert(pdf, size, format, previous=nil) common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}" if previous FileUtils.cp(Dir[directory_for(previous) + '/*'], directory) - result = `MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1`.chomp + if windows? + cmd = 'set MAGICK_TMPDIR=#{tempdir} & set OMP_NUM_THREADS=2 & gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1'.chomp + else + cmd = 'MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1'.chomp + end + result = `#{cmd}`.chomp raise ExtractionFailed, result if $? != 0 else page_list(pages).each do |page| - out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")] - cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp + if windows? + out_file = File.join(directory, "#{basename}_#{page}.#{format}") + cmd = "set MAGICK_TMPDIR=#{tempdir} & set OMP_NUM_THREADS=2 & gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] \"#{out_file}\" 2>&1".chomp + else + out_file = ESCAPE[File.join(directory, "#{basename}_#{page}.#{format}")] + cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] #{out_file} 2>&1".chomp + end result = `#{cmd}`.chomp raise ExtractionFailed, result if $? != 0 end From 91f7ddfb0fe7114cafb38159aad8bdafc2e37b00 Mon Sep 17 00:00:00 2001 From: jiangdongjin Date: Thu, 15 Aug 2013 12:06:33 +0800 Subject: [PATCH 2/4] Extract pdf on windows platform bugfix: windows platform detect improvement. libreoffice detect improvement. quote paths to avoid wrong path issue on windows. --- lib/docsplit/pdf_extractor.rb | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/lib/docsplit/pdf_extractor.rb b/lib/docsplit/pdf_extractor.rb index 10fd5fc..139630f 100644 --- a/lib/docsplit/pdf_extractor.rb +++ b/lib/docsplit/pdf_extractor.rb @@ -7,7 +7,7 @@ class PdfExtractor # Provide a set of helper functions to determine the OS. HOST_OS = (defined?("RbConfig") ? RbConfig : Config)::CONFIG['host_os'] def windows? - !!HOST_OS.match(/mswin|windows|cygwin/i) + !!HOST_OS.match(/mswin|msys|mingw|cygwin|bccwin|wince|emc/i) end def osx? !!HOST_OS.match(/darwin/i) @@ -22,7 +22,13 @@ def version_string @@help ||= `#{office_executable} -h 2>&1`.split("\n").first end def libre_office? - !!version_string.match(/^LibreOffice/) + if windows? + # on windows platform we can't get version string by 'version_string' func, + # so we simply match the executable path + !!office_executable.match(/libreOffice/i) + else + !!version_string.match(/^LibreOffice/) + end end def open_office? !!version_string.match(/^OpenOffice.org/) @@ -116,7 +122,14 @@ def extract(docs, opts) ENV['SYSUSERCONFIG']="file://#{File.expand_path(escaped_out)}" options = "--headless --invisible --norestore --nolockcheck --convert-to pdf --outdir #{escaped_out} #{escaped_doc}" - cmd = "#{office_executable} #{options} 2>&1" + + # quote path on windows platform to avoid wrong path issue + if windows? + cmd = "\"#{office_executable}\" #{options} 2>&1" + else + cmd = "#{office_executable} #{options} 2>&1" + end + result = `#{cmd}`.chomp raise ExtractionFailed, result if $? != 0 true @@ -141,7 +154,14 @@ def run_jod(command, pdfs, opts, return_output=false) pdfs = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ') office = osx? ? "-Doffice.home=#{office_path}" : office_path - cmd = "java #{HEADLESS} #{LOGGING} #{office} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1" + + # quote path on windows platform to avoid wrong path issue + if windows? + cmd = "java #{HEADLESS} #{LOGGING} \"#{office}\" -cp #{CLASSPATH} #{command} #{pdfs} 2>&1" + else + cmd = "java #{HEADLESS} #{LOGGING} #{office} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1" + end + result = `#{cmd}`.chomp raise ExtractionFailed, result if $? != 0 return return_output ? (result.empty? ? nil : result) : true From 52d7cc267d5fcd95e1b2d2f6ebe5645b3097fcf1 Mon Sep 17 00:00:00 2001 From: jiangdongjin Date: Fri, 23 Aug 2013 12:40:53 +0800 Subject: [PATCH 3/4] Only copy image files, skip other files such as Thumbs.db under windows platform when extract multi-sizes images. --- lib/docsplit/image_extractor.rb | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb index 9d91b8b..85f2c9e 100755 --- a/lib/docsplit/image_extractor.rb +++ b/lib/docsplit/image_extractor.rb @@ -42,11 +42,13 @@ def convert(pdf, size, format, previous=nil) FileUtils.mkdir_p(directory) unless File.exists?(directory) common = "#{MEMORY_ARGS} -density #{@density} #{resize_arg(size)} #{quality_arg(format)}" if previous - FileUtils.cp(Dir[directory_for(previous) + '/*'], directory) + # Only copy image files, skip other files such as Thumbs.db under windows platform + imageFiles = File.join(directory_for(previous), '*.' + format) + FileUtils.cp(Dir.glob(imageFiles), directory) if windows? - cmd = 'set MAGICK_TMPDIR=#{tempdir} & set OMP_NUM_THREADS=2 & gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1'.chomp + cmd = "set MAGICK_TMPDIR=#{tempdir} & set OMP_NUM_THREADS=2 & gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1".chomp else - cmd = 'MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1'.chomp + cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1".chomp end result = `#{cmd}`.chomp raise ExtractionFailed, result if $? != 0 From d27377befe34b6d9fd0cd2ee4bfe16a84e54f128 Mon Sep 17 00:00:00 2001 From: jiangdongjin Date: Mon, 26 Aug 2013 20:09:32 +0800 Subject: [PATCH 4/4] windows platform path escape bugfix. --- lib/docsplit.rb | 5 ++++- lib/docsplit/image_extractor.rb | 10 ++-------- lib/docsplit/pdf_extractor.rb | 2 +- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/lib/docsplit.rb b/lib/docsplit.rb index 5001413..0a8f91b 100755 --- a/lib/docsplit.rb +++ b/lib/docsplit.rb @@ -7,7 +7,10 @@ module Docsplit VERSION = '0.7.2' # Keep in sync with gemspec. - ESCAPE = lambda {|x| Shellwords.shellescape(x) } + HOST_OS = (defined?("RbConfig") ? RbConfig : Config)::CONFIG['host_os'] + IS_WIN = !!HOST_OS.match(/mswin|msys|mingw|cygwin|bccwin|wince|emc/i) + + ESCAPE = IS_WIN ? lambda {|x| "\"#{x}\"" } : lambda {|x| Shellwords.shellescape(x) } ROOT = File.expand_path(File.dirname(__FILE__) + '/..') ESCAPED_ROOT = ESCAPE[ROOT] diff --git a/lib/docsplit/image_extractor.rb b/lib/docsplit/image_extractor.rb index 85f2c9e..fbdf167 100755 --- a/lib/docsplit/image_extractor.rb +++ b/lib/docsplit/image_extractor.rb @@ -8,12 +8,6 @@ class ImageExtractor DEFAULT_FORMAT = :png DEFAULT_DENSITY = '150' - # Helper function to determine the OS - HOST_OS = (defined?("RbConfig") ? RbConfig : Config)::CONFIG['host_os'] - def windows? - !!HOST_OS.match(/mswin|msys|mingw|cygwin|bccwin|wince|emc/i) - end - # Extract a list of PDFs as rasterized page images, according to the # configuration in options. def extract(pdfs, options) @@ -45,7 +39,7 @@ def convert(pdf, size, format, previous=nil) # Only copy image files, skip other files such as Thumbs.db under windows platform imageFiles = File.join(directory_for(previous), '*.' + format) FileUtils.cp(Dir.glob(imageFiles), directory) - if windows? + if IS_WIN cmd = "set MAGICK_TMPDIR=#{tempdir} & set OMP_NUM_THREADS=2 & gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1".chomp else cmd = "MAGICK_TMPDIR=#{tempdir} OMP_NUM_THREADS=2 gm mogrify #{common} -unsharp 0x0.5+0.75 \"#{directory}/*.#{format}\" 2>&1".chomp @@ -54,7 +48,7 @@ def convert(pdf, size, format, previous=nil) raise ExtractionFailed, result if $? != 0 else page_list(pages).each do |page| - if windows? + if IS_WIN out_file = File.join(directory, "#{basename}_#{page}.#{format}") cmd = "set MAGICK_TMPDIR=#{tempdir} & set OMP_NUM_THREADS=2 & gm convert +adjoin -define pdf:use-cropbox=true #{common} #{escaped_pdf}[#{page - 1}] \"#{out_file}\" 2>&1".chomp else diff --git a/lib/docsplit/pdf_extractor.rb b/lib/docsplit/pdf_extractor.rb index 139630f..9b89764 100644 --- a/lib/docsplit/pdf_extractor.rb +++ b/lib/docsplit/pdf_extractor.rb @@ -7,7 +7,7 @@ class PdfExtractor # Provide a set of helper functions to determine the OS. HOST_OS = (defined?("RbConfig") ? RbConfig : Config)::CONFIG['host_os'] def windows? - !!HOST_OS.match(/mswin|msys|mingw|cygwin|bccwin|wince|emc/i) + IS_WIN end def osx? !!HOST_OS.match(/darwin/i)