From a843217e90048710e4ea1c614746057a81dd7dd4 Mon Sep 17 00:00:00 2001 From: Eitan Frachtenberg Date: Wed, 29 Nov 2017 10:29:09 -0800 Subject: [PATCH 1/2] Fix issue #64: unconvertible images When iText.text.pdf.parser fails to parse/convert an image, CERMINE fails to catch the exception and crashes. This patch catches the exception, reports it, and skips the page with the unconvertible image (continuing to parse the rest of the pages). See issue #64: https://github.com/CeON/CERMINE/issues/64 --- .../edu/icm/cermine/structure/ITextCharacterExtractor.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cermine-impl/src/main/java/pl/edu/icm/cermine/structure/ITextCharacterExtractor.java b/cermine-impl/src/main/java/pl/edu/icm/cermine/structure/ITextCharacterExtractor.java index f465bd18..649736a8 100644 --- a/cermine-impl/src/main/java/pl/edu/icm/cermine/structure/ITextCharacterExtractor.java +++ b/cermine-impl/src/main/java/pl/edu/icm/cermine/structure/ITextCharacterExtractor.java @@ -109,7 +109,12 @@ public BxDocument extractCharacters(InputStream stream) throws AnalysisException processAlternativeColorSpace(resources); processor.reset(); - processor.processContent(ContentByteUtils.getContentBytesForPage(reader, pageNumber), resources); + try { + processor.processContent(ContentByteUtils.getContentBytesForPage(reader, pageNumber), resources); + } catch (com.itextpdf.text.ExceptionConverter ex) { + System.out.println("Failed to parse page" + pageNumber + " ... skipping page!"); + continue; + } TimeoutRegister.get().check(); } From dff8f62aa7d7f3afd96b86da7f289704f165b21c Mon Sep 17 00:00:00 2001 From: Eitan Frachtenberg Date: Thu, 30 Nov 2017 12:52:05 -0800 Subject: [PATCH 2/2] Bypass issue with illegal color space. 
Catch the following exception and skip parsing the page: Exception in thread "main" java.lang.IllegalArgumentException: Unexpected color space /CS1 at com.itextpdf.text.pdf.parser.InlineImageUtils.getComponentsPerPixel(InlineImageUtils.java:250) at com.itextpdf.text.pdf.parser.InlineImageUtils.computeBytesPerRow(InlineImageUtils.java:263) at com.itextpdf.text.pdf.parser.InlineImageUtils.parseUnfilteredSamples(InlineImageUtils.java:292) --- .../edu/icm/cermine/structure/ITextCharacterExtractor.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cermine-impl/src/main/java/pl/edu/icm/cermine/structure/ITextCharacterExtractor.java b/cermine-impl/src/main/java/pl/edu/icm/cermine/structure/ITextCharacterExtractor.java index 649736a8..7f320c3c 100644 --- a/cermine-impl/src/main/java/pl/edu/icm/cermine/structure/ITextCharacterExtractor.java +++ b/cermine-impl/src/main/java/pl/edu/icm/cermine/structure/ITextCharacterExtractor.java @@ -112,7 +112,10 @@ public BxDocument extractCharacters(InputStream stream) throws AnalysisException try { processor.processContent(ContentByteUtils.getContentBytesForPage(reader, pageNumber), resources); } catch (com.itextpdf.text.ExceptionConverter ex) { - System.out.println("Failed to parse page" + pageNumber + " ... skipping page!"); + System.out.println("Failed to parse page " + pageNumber + " with error: '" + ex.getMessage() + "' ... skipping page!"); + continue; + } catch(IllegalArgumentException ex) { + System.out.println("Failed to parse page " + pageNumber + " with error: '" + ex.getMessage() + "' ... skipping page!"); continue; } TimeoutRegister.get().check();