From b7e13a33bea539a87f2a12bd07bada11bf383f85 Mon Sep 17 00:00:00 2001 From: Jozef Jankaj Date: Tue, 5 Sep 2023 14:55:55 +0200 Subject: [PATCH 01/13] feat: faster CSV library using SimpleFlatMapper --- pom.xml | 6 ++ .../dataio/iterators/CSVWSourceIterator.java | 73 ++++++++----------- .../iterators/csvw/CSVWConfiguration.java | 15 ++-- 3 files changed, 42 insertions(+), 52 deletions(-) diff --git a/pom.xml b/pom.xml index 0d9f8f3..c734bb1 100644 --- a/pom.xml +++ b/pom.xml @@ -245,6 +245,12 @@ rxjava 3.1.5 + + + org.simpleflatmapper + sfm-csv + 8.2.3 + diff --git a/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java b/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java index 69dba9d..3b3977c 100644 --- a/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java +++ b/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java @@ -1,19 +1,19 @@ + package be.ugent.idlab.knows.dataio.iterators; import be.ugent.idlab.knows.dataio.access.Access; import be.ugent.idlab.knows.dataio.iterators.csvw.CSVWConfiguration; import be.ugent.idlab.knows.dataio.source.CSVSource; import be.ugent.idlab.knows.dataio.source.Source; -import com.opencsv.CSVReader; -import com.opencsv.CSVReaderBuilder; -import com.opencsv.enums.CSVReaderNullFieldIndicator; -import com.opencsv.exceptions.CsvValidationException; +import org.simpleflatmapper.lightningcsv.CsvParser; import java.io.IOException; import java.io.InputStreamReader; import java.io.ObjectInputStream; +import java.io.Reader; import java.sql.SQLException; import java.util.Arrays; +import java.util.Iterator; import java.util.Map; import java.util.NoSuchElementException; @@ -24,7 +24,8 @@ public class CSVWSourceIterator extends SourceIterator { private final CSVWConfiguration config; private transient String[] header; private transient String[] next; - private transient CSVReader reader; + private transient Reader inputReader; + private transient Iterator iterator; public CSVWSourceIterator(Access access, CSVWConfiguration config) throws SQLException, IOException { this.access = access; @@ -38,62 +39,50 @@ private void readObject(ObjectInputStream inputStream) throws IOException, Class } private void bootstrap() throws SQLException, IOException { - this.reader = new CSVReaderBuilder(new InputStreamReader(access.getInputStream(), config.getEncoding())) - .withFieldAsNull(CSVReaderNullFieldIndicator.EMPTY_SEPARATORS) - .withCSVParser(this.config.getParser()) - .withSkipLines(this.config.isSkipHeader() ? 1 : 0) - .build(); + this.inputReader = new InputStreamReader(access.getInputStream(), config.getEncoding()); + + CsvParser.DSL parser = config.getParser(); + this.iterator = parser.iterator(this.inputReader); if (this.config.isSkipHeader()) { this.header = config.getHeader().toArray(new String[0]); } else { - this.header = readLine(); + this.header = nextLine(); if (header == null) { throw new IllegalStateException("Unable to read the file!"); } } - this.next = readLine(); + this.next = nextLine(); if (this.next == null) { throw new IllegalStateException("No further data could be read from the file!"); } } - private String[] readLine() throws IOException { - String[] line; - do { - try { - line = this.reader.readNext(); - - if (line == null) { - return null; - } - } catch (CsvValidationException e) { - throw new IllegalArgumentException(String.format("File does not conform to configuration! Offending line: %s", Arrays.toString(this.reader.peek()))); + private String[] nextLine() { + if (this.iterator.hasNext()) { + String[] r = this.iterator.next(); + // go over the lines till uncommented line found + while (r[0].startsWith(config.getCommentPrefix()) && this.iterator.hasNext()) { + r = this.iterator.next(); } - } while (invalidLine(line)); - return line; - } + if (r[0].startsWith(config.getCommentPrefix())) { + return null; + } - /** - * Checks if the passed line corresponds to the filters set - * - * @param line line to be checked - * @return true if the line passes all checks - */ - private boolean invalidLine(String[] line) { - return Arrays.stream(line).allMatch(s -> s.length() == 0) || // all of the parts are not empty strings - line[0].startsWith(this.config.getCommentPrefix()); // line does not start with a comment prefix + return r; + } + return null; } /** * Checks if @record has a string value which is in the nulls list, if so sets this value to null in the data map. * - * @param record - * @return + * @param record record to be checked + * @return checked and possibly changed record */ public CSVSource replaceNulls(CSVSource record) { Map data = record.getData(); @@ -139,13 +128,9 @@ public Source next() { if (this.next == null) { throw new NoSuchElementException(); } - String[] line = this.next; - try { - this.next = readLine(); - } catch (IOException e) { - throw new RuntimeException(e); - } + + this.next = nextLine(); if (!config.getTrim().equals("false")) { line = applyTrimArray(line, config.getTrim()); @@ -161,6 +146,6 @@ public boolean hasNext() { @Override public void close() throws IOException { - this.reader.close(); + this.inputReader.close(); } } diff --git a/src/main/java/be/ugent/idlab/knows/dataio/iterators/csvw/CSVWConfiguration.java b/src/main/java/be/ugent/idlab/knows/dataio/iterators/csvw/CSVWConfiguration.java index bd2aa39..c26e1d8 100644 --- a/src/main/java/be/ugent/idlab/knows/dataio/iterators/csvw/CSVWConfiguration.java +++ b/src/main/java/be/ugent/idlab/knows/dataio/iterators/csvw/CSVWConfiguration.java @@ -1,7 +1,7 @@ package be.ugent.idlab.knows.dataio.iterators.csvw; -import com.opencsv.CSVParser; -import com.opencsv.CSVParserBuilder; + +import org.simpleflatmapper.lightningcsv.CsvParser; import java.io.Serializable; import java.util.List; @@ -89,11 +89,10 @@ public String getEncoding() { return encoding; } - public CSVParser getParser() { - return new CSVParserBuilder() - .withSeparator(this.delimiter) - .withEscapeChar(this.escapeCharacter) - .withQuoteChar(this.quoteCharacter) - .build(); + public CsvParser.DSL getParser() { + return CsvParser + .separator(this.delimiter) + .escape(this.escapeCharacter) + .quote(this.quoteCharacter); } } From 0b795b97a23bf135917f18fd57f6b2b6d87f148c Mon Sep 17 00:00:00 2001 From: Gerald Haesendonck Date: Fri, 22 Sep 2023 11:11:15 +0200 Subject: [PATCH 02/13] Fix CSVWSourceIterator in `boostrap()`: don't throw IllegalStateExcsption when `this.next` is NULL, because that just means the next line is empty, causing the `hasNext()` method to return false, which is desired behaviour. Throwing this exception likely causes a consumer to crash because it doesn't expect an exception upon reading an empty line. --- .../idlab/knows/dataio/iterators/CSVWSourceIterator.java | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java b/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java index 3b3977c..ceb0689 100644 --- a/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java +++ b/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java @@ -55,10 +55,6 @@ private void bootstrap() throws SQLException, IOException { } this.next = nextLine(); - - if (this.next == null) { - throw new IllegalStateException("No further data could be read from the file!"); - } } private String[] nextLine() { From 4ce6db44fbca97296df7c90ebd70b1f7cee57abe Mon Sep 17 00:00:00 2001 From: Gerald Haesendonck Date: Fri, 22 Sep 2023 11:28:56 +0200 Subject: [PATCH 03/13] CSVWSourceIterator: fix: Also an empty header doesn't do harm: then the iterator just returns `false` on `hasNext()`. --- .../idlab/knows/dataio/iterators/CSVWSourceIterator.java | 4 ---- .../java/be/ugent/idlab/knows/dataio/cores/TestCore.java | 4 ++++ .../idlab/knows/dataio/iterator/CSVIteratorTest.java | 8 ++++++++ src/test/resources/csv/empty.csv | 0 4 files changed, 12 insertions(+), 4 deletions(-) create mode 100644 src/test/resources/csv/empty.csv diff --git a/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java b/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java index ceb0689..31eab54 100644 --- a/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java +++ b/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java @@ -48,10 +48,6 @@ private void bootstrap() throws SQLException, IOException { this.header = config.getHeader().toArray(new String[0]); } else { this.header = nextLine(); - - if (header == null) { - throw new IllegalStateException("Unable to read the file!"); - } } this.next = nextLine(); diff --git a/src/test/java/be/ugent/idlab/knows/dataio/cores/TestCore.java b/src/test/java/be/ugent/idlab/knows/dataio/cores/TestCore.java index a748751..8d2c430 100644 --- a/src/test/java/be/ugent/idlab/knows/dataio/cores/TestCore.java +++ b/src/test/java/be/ugent/idlab/knows/dataio/cores/TestCore.java @@ -110,6 +110,10 @@ public boolean evaluate_1001_header_short(Iterator iterator) { return compareIterator(iterator, List.of(expected1, expected2, expected3)); } + public boolean evaluate_empty(Iterator iterator) { + return !iterator.hasNext(); + } + public boolean compareIterator(Iterator iterator, Set expectedSources) { int counter = 0; while (iterator.hasNext()) { diff --git a/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVIteratorTest.java b/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVIteratorTest.java index 9060f0d..ea467fc 100644 --- a/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVIteratorTest.java +++ b/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVIteratorTest.java @@ -60,4 +60,12 @@ public void evaluate_0002_BOM_CSV() throws SQLException, IOException { assertTrue(evaluate_0002_BOM(iterator)); } } + + @Test + public void evaluate_empty_CSV() throws SQLException, IOException { + Access access = makeLocalAccess("/csv/empty.csv", "", "csv", "utf-8"); + try (CSVSourceIterator iterator = new CSVSourceIterator(access)) { + assertTrue(evaluate_empty(iterator)); + } + } } diff --git a/src/test/resources/csv/empty.csv b/src/test/resources/csv/empty.csv new file mode 100644 index 0000000..e69de29 From fc4129a90f30bd88f01c838a76f8e7a823fc5955 Mon Sep 17 00:00:00 2001 From: Jozef Jankaj Date: Wed, 18 Oct 2023 22:12:32 +0200 Subject: [PATCH 04/13] chore: add null injector --- .../dataio/iterators/CSVWSourceIterator.java | 3 +- .../knows/dataio/utils/CSVNullInjector.java | 89 +++++++++++++++++++ .../dataio/utility/CSVNullInjectorTest.java | 64 +++++++++++++ 3 files changed, 154 insertions(+), 2 deletions(-) create mode 100644 src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java create mode 100644 src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java diff --git a/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java b/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java index ed0f2a0..5d3276a 100644 --- a/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java +++ b/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java @@ -36,8 +36,7 @@ private void readObject(ObjectInputStream inputStream) throws Exception { } private void bootstrap() throws Exception { - this.inputReader = new InputStreamReader(access.getInputStream(), config.getEncoding()); - + this.inputReader = access.getInputStreamReader(); CsvParser.DSL parser = config.getSFMParser(); this.iterator = parser.iterator(this.inputReader); diff --git a/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java b/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java new file mode 100644 index 0000000..884402b --- /dev/null +++ b/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java @@ -0,0 +1,89 @@ +package be.ugent.idlab.knows.dataio.utils; + +import java.io.FilterInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.util.ArrayDeque; +import java.util.Deque; + +/** + * Injects a known NULL value between two commas in CSV. + * Inspired by this answer on SO + */ +public class CSVNullInjector extends InputStream { + + private static final byte[] NULL_VALUE = "DATAIO_INJECTED_NULL_VALUE".getBytes(Charset.defaultCharset()); + private final InputStream inputStream; + private final Deque backBuffer = new ArrayDeque<>(); + private final byte[] readBuffer = new byte[NULL_VALUE.length]; + private final char delimiter; + private final char quoteCharacter; + private boolean quoteMode = false; + + public CSVNullInjector(InputStream in, char delimiter, char quoteCharacter) { + this.inputStream = in; + this.delimiter = delimiter; + this.quoteCharacter = quoteCharacter; + } + + /** + * Constructor with default delimiter and quote character for CSV + * + * @param in inputstream to read from + */ + public CSVNullInjector(InputStream in) { + this(in, ',', '"'); + } + + @Override + public int read() throws IOException { + if (!backBuffer.isEmpty()) { + char value = (char) (int) backBuffer.pop(); + if (value == quoteCharacter) { + quoteMode = !quoteMode; + } + return value; + } + + int first = this.inputStream.read(); + + if (first == quoteCharacter) { + quoteMode = !quoteMode; + return first; + } + + if (first == delimiter) { + if (quoteMode) { // if in quote mode, do not inject anything + return first; + } + int second = this.inputStream.read(); + + if (second != delimiter) { + + backBuffer.push((byte) (char) second); + } else { + peekAndReplace(); + } + } + return first; + } + + private void peekAndReplace() throws IOException { + int read = super.read(readBuffer, 0, NULL_VALUE.length); + // fill in backbuffer + for (int i = read - 1; i >= 0; i--) { + backBuffer.push(readBuffer[i]); + } + backBuffer.push((byte) this.delimiter); + + for (int i = 0; i < NULL_VALUE.length; i++) { + if (read != NULL_VALUE.length || readBuffer[i] != NULL_VALUE[i]) { + for (int j = NULL_VALUE.length - 1; j >= 0; j--) { + backBuffer.push(NULL_VALUE[j]); + } + return; + } + } + } +} diff --git a/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java b/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java new file mode 100644 index 0000000..348ddb4 --- /dev/null +++ b/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java @@ -0,0 +1,64 @@ +package be.ugent.idlab.knows.dataio.utility; + +import be.ugent.idlab.knows.dataio.utils.CSVNullInjector; +import org.junit.jupiter.api.Test; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class CSVNullInjectorTest { + + /** + * Simple test to replace the value in between two commas + */ + @Test + public void testInsertion() throws IOException { + String testString = "ID,,Foo"; + InputStream input = new ByteArrayInputStream(testString.getBytes()); + CSVNullInjector injector = new CSVNullInjector(input); + + // read out the injector + String output = new String(injector.readAllBytes()); + assertEquals("ID,DATAIO_INJECTED_NULL_VALUE,Foo", output); + } + + @Test + public void customDelimiter() throws IOException { + String testString = "ID;;Foo"; + InputStream input = new ByteArrayInputStream(testString.getBytes()); + CSVNullInjector injector = new CSVNullInjector(input, ';', '"'); + + // read out the injector + String output = new String(injector.readAllBytes()); + assertEquals("ID;DATAIO_INJECTED_NULL_VALUE;Foo", output); + } + + /** + * Tests ignoring quoted separators. + */ + @Test + public void ignoreQuotedSeparator() throws IOException { + String testString = "ID,\",, ,\",Foo"; + InputStream input = new ByteArrayInputStream(testString.getBytes()); + CSVNullInjector injector = new CSVNullInjector(input); + + String output = new String(injector.readAllBytes()); + assertEquals( testString, output); + } + + /** + * Tests correct injection for escaped quotes + */ + @Test + public void escapedQuote() throws IOException { + String testString = "\"aaa\",\"b\"\",,bb\",,\"ccc\""; + InputStream input = new ByteArrayInputStream(testString.getBytes()); + CSVNullInjector injector = new CSVNullInjector(input); + + String output = new String(injector.readAllBytes()); + assertEquals("\"aaa\",\"b\"\",,bb\",DATAIO_INJECTED_NULL_VALUE,\"ccc\"", output); + } +} From aa597f855d128dd42721c43bc56d988ca3570bb1 Mon Sep 17 00:00:00 2001 From: Jozef Jankaj Date: Thu, 19 Oct 2023 17:29:07 +0200 Subject: [PATCH 05/13] chore: second version of NullInjector Second version of NullInjector that uses a buffer in the preprocessing instead of reading byte per byte. From limited testing, 16 KiB buffer seems to perform best. --- .../dataio/iterators/CSVWSourceIterator.java | 7 +- .../iterators/csvw/CSVWConfiguration.java | 12 +- .../idlab/knows/dataio/record/CSVRecord.java | 2 +- .../knows/dataio/utils/CSVNullInjector.java | 52 +++--- .../knows/dataio/utils/CSVNullInjector2.java | 149 ++++++++++++++++++ .../dataio/iterator/CSVWIteratorTest.java | 37 +++++ .../dataio/utility/CSVNullInjectorTest.java | 23 ++- 7 files changed, 249 insertions(+), 33 deletions(-) create mode 100644 src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector2.java diff --git a/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java b/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java index 5d3276a..8a0d59c 100644 --- a/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java +++ b/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java @@ -4,6 +4,8 @@ import be.ugent.idlab.knows.dataio.iterators.csvw.CSVWConfiguration; import be.ugent.idlab.knows.dataio.record.CSVRecord; import be.ugent.idlab.knows.dataio.record.Record; +import be.ugent.idlab.knows.dataio.utils.CSVNullInjector; +import be.ugent.idlab.knows.dataio.utils.CSVNullInjector2; import org.simpleflatmapper.lightningcsv.CsvParser; import java.io.IOException; @@ -36,8 +38,9 @@ private void readObject(ObjectInputStream inputStream) throws Exception { } private void bootstrap() throws Exception { - this.inputReader = access.getInputStreamReader(); - CsvParser.DSL parser = config.getSFMParser(); + this.inputReader = new InputStreamReader(new CSVNullInjector2(access.getInputStream(), this.config.getDelimiter(), this.config.getQuoteCharacter())); +// this.inputReader = access.getInputStreamReader(); + CsvParser.DSL parser = config.getSFMParser(1024 * 16); this.iterator = parser.iterator(this.inputReader); if (this.config.isSkipHeader()) { diff --git a/src/main/java/be/ugent/idlab/knows/dataio/iterators/csvw/CSVWConfiguration.java b/src/main/java/be/ugent/idlab/knows/dataio/iterators/csvw/CSVWConfiguration.java index 9a34c08..cfa612a 100644 --- a/src/main/java/be/ugent/idlab/knows/dataio/iterators/csvw/CSVWConfiguration.java +++ b/src/main/java/be/ugent/idlab/knows/dataio/iterators/csvw/CSVWConfiguration.java @@ -7,6 +7,7 @@ import com.opencsv.enums.CSVReaderNullFieldIndicator; import java.io.Serializable; +import java.util.ArrayList; import java.util.List; /** @@ -49,7 +50,11 @@ public final class CSVWConfiguration implements Serializable { this.skipHeader = skipHeader; this.commentPrefix = commentPrefix; this.header = header; - this.nulls = nulls; + + List nullValues = new ArrayList<>(nulls); + nullValues.add("DATAIO_INJECTED_NULL_VALUE"); // add our special null value + + this.nulls = nullValues; this.encoding = encoding; } @@ -93,11 +98,12 @@ public String getEncoding() { return encoding; } - public CsvParser.DSL getSFMParser() { + public CsvParser.DSL getSFMParser(int bufferSize) { return CsvParser .separator(this.delimiter) .escape(this.escapeCharacter) - .quote(this.quoteCharacter); + .quote(this.quoteCharacter) + .bufferSize(bufferSize); } public CSVParser getOpenCSVParser() { diff --git a/src/main/java/be/ugent/idlab/knows/dataio/record/CSVRecord.java b/src/main/java/be/ugent/idlab/knows/dataio/record/CSVRecord.java index b3be755..ef3d664 100644 --- a/src/main/java/be/ugent/idlab/knows/dataio/record/CSVRecord.java +++ b/src/main/java/be/ugent/idlab/knows/dataio/record/CSVRecord.java @@ -26,7 +26,7 @@ public CSVRecord(String[] header, String[] data, Map datatypes) if (i < data.length) { this.data.put(header[i], data[i]); } else { - this.data.put(header[i], ""); + this.data.put(header[i], null); } } this.datatypes = datatypes; diff --git a/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java b/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java index 884402b..c730a1c 100644 --- a/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java +++ b/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java @@ -1,9 +1,12 @@ package be.ugent.idlab.knows.dataio.utils; -import java.io.FilterInputStream; +import com.drew.lang.annotations.NotNull; +import io.reactivex.rxjava3.annotations.NonNull; + import java.io.IOException; import java.io.InputStream; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.util.ArrayDeque; import java.util.Deque; @@ -13,13 +16,13 @@ */ public class CSVNullInjector extends InputStream { - private static final byte[] NULL_VALUE = "DATAIO_INJECTED_NULL_VALUE".getBytes(Charset.defaultCharset()); + private static final byte[] NULL_VALUE = "DATAIO_INJECTED_NULL_VALUE".getBytes(StandardCharsets.UTF_8); private final InputStream inputStream; private final Deque backBuffer = new ArrayDeque<>(); - private final byte[] readBuffer = new byte[NULL_VALUE.length]; private final char delimiter; private final char quoteCharacter; private boolean quoteMode = false; + private boolean firstRead = true; public CSVNullInjector(InputStream in, char delimiter, char quoteCharacter) { this.inputStream = in; @@ -48,6 +51,11 @@ public int read() throws IOException { int first = this.inputStream.read(); + if (first == '\n') { + firstRead = true; + return first; + } + if (first == quoteCharacter) { quoteMode = !quoteMode; return first; @@ -57,33 +65,31 @@ public int read() throws IOException { if (quoteMode) { // if in quote mode, do not inject anything return first; } - int second = this.inputStream.read(); - - if (second != delimiter) { - backBuffer.push((byte) (char) second); + if (firstRead) { + backBuffer.push((byte) this.delimiter); + addNullToBackBuffer(); + firstRead = false; + return (int) backBuffer.pop(); } else { - peekAndReplace(); + int second = this.inputStream.read(); + + if (second != delimiter) { + backBuffer.push((byte) (char) second); + } else { + backBuffer.push((byte) this.delimiter); + addNullToBackBuffer(); + } } } + firstRead = false; + return first; } - private void peekAndReplace() throws IOException { - int read = super.read(readBuffer, 0, NULL_VALUE.length); - // fill in backbuffer - for (int i = read - 1; i >= 0; i--) { - backBuffer.push(readBuffer[i]); - } - backBuffer.push((byte) this.delimiter); - - for (int i = 0; i < NULL_VALUE.length; i++) { - if (read != NULL_VALUE.length || readBuffer[i] != NULL_VALUE[i]) { - for (int j = NULL_VALUE.length - 1; j >= 0; j--) { - backBuffer.push(NULL_VALUE[j]); - } - return; - } + private void addNullToBackBuffer() { + for (int i = NULL_VALUE.length - 1; i >= 0; i--) { + backBuffer.push(NULL_VALUE[i]); } } } diff --git a/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector2.java b/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector2.java new file mode 100644 index 0000000..d253650 --- /dev/null +++ b/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector2.java @@ -0,0 +1,149 @@ +package be.ugent.idlab.knows.dataio.utils; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayDeque; +import java.util.Deque; + +public class CSVNullInjector2 extends CSVNullInjector { + private static final byte[] NULL_VALUE = "DATAIO_INJECTED_NULL_VALUE".getBytes(); + private static final int BUFFER_SIZE = 1024 * 16; // 16 KiB + private final InputStream inputStream; + private final char delimiter; + private final char quoteCharacter; + private final Buffer buffer; + private final Deque backBuffer = new ArrayDeque<>(NULL_VALUE.length); + private boolean quoteMode; + private boolean newLine = true; + + public CSVNullInjector2(InputStream inputStream, char delimiter, char quoteCharacter) { + super(inputStream, delimiter, quoteCharacter); + this.inputStream = inputStream; + this.delimiter = delimiter; + this.quoteCharacter = quoteCharacter; + + this.buffer = new Buffer(inputStream, BUFFER_SIZE); // 8 KiB + } + + public CSVNullInjector2(InputStream inputStream) { + this(inputStream, ',', '"'); + } + + @Override + public int read() throws IOException { + return this.inputStream.read(); + } + + @Override + public int read(byte[] b, int off, int len) throws IOException { + int i = off; + for (; i < len; i++) { + if (!backBuffer.isEmpty()) { + b[i] = backBuffer.pop(); + } else { + byte b1 = buffer.pop(); + if (b1 == -1) { + // nothing further could be read, abort + if (i == off) { // if no data was read at all + return -1; + } + break; + } + + if (newLine && b1 == this.delimiter) { + backBuffer.push((byte) this.delimiter); + addNullToBackBuffer(); + b[i] = backBuffer.pop(); + } else { + b[i] = b1; + + // check for a new line + if (b1 == '\n') { + newLine = true; + } else if (newLine) { + newLine = false; + } + + if (b1 == quoteCharacter) { + quoteMode = !quoteMode; + } else if (b1 == delimiter) { + if (!quoteMode) { + if (newLine) { + backBuffer.push((byte) this.delimiter); + addNullToBackBuffer(); + newLine = false; + b[i] = backBuffer.pop(); + } else { + byte second = buffer.peek(); + if (second == this.delimiter) { + addNullToBackBuffer(); + } + } + } + } + } + } + } + + return i - off; + } + + private void addNullToBackBuffer() { + for (int i = NULL_VALUE.length - 1; i >= 0; i--) { + backBuffer.push(NULL_VALUE[i]); + } + } + + /** + * Class serving as a buffer of bytes + */ + private static class Buffer { + InputStream inputStream; + byte[] buffer; // buffer to hold the data + int cursor; // cursor in the buffer + int actualSize; // end of the buffer + + public Buffer(InputStream inputStream, int bufferSize) { + this.inputStream = inputStream; + this.buffer = new byte[bufferSize]; + this.cursor = 0; + } + + /** + * Pop a byte off the buffer when requested + * + * @return -1 if nothing more is available, otherwise the byte from the top of the buffer + */ + public byte pop() throws IOException { + if (this.cursor >= this.actualSize) { // if end of current buffer + if (!loadNewData()) { + return -1; + } + } + + byte out = this.buffer[this.cursor]; + cursor++; + return out; + } + + public byte peek() throws IOException { + if (this.cursor >= this.actualSize) { + if (!loadNewData()) { + return -1; + } + } + + return this.buffer[this.cursor]; + } + + private boolean loadNewData() throws IOException { + this.actualSize = this.inputStream.read(this.buffer, 0, buffer.length); + if (this.actualSize == -1) { // if nothing further was read from the input stream + return false; + } + + this.cursor = 0; + return true; + } + } +} diff --git a/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVWIteratorTest.java b/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVWIteratorTest.java index 17d87df..fbe5255 100644 --- a/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVWIteratorTest.java +++ b/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVWIteratorTest.java @@ -4,14 +4,19 @@ import be.ugent.idlab.knows.dataio.access.LocalFileAccess; import be.ugent.idlab.knows.dataio.cores.TestCore; import be.ugent.idlab.knows.dataio.iterators.CSVWSourceIterator; +import be.ugent.idlab.knows.dataio.iterators.CSVWSourceIterator2; +import be.ugent.idlab.knows.dataio.iterators.SourceIterator; import be.ugent.idlab.knows.dataio.iterators.csvw.CSVWConfiguration; import be.ugent.idlab.knows.dataio.record.CSVRecord; +import be.ugent.idlab.knows.dataio.record.Record; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; import java.nio.charset.StandardCharsets; import java.util.List; import java.util.Set; +import java.util.function.Consumer; import static org.junit.jupiter.api.Assertions.*; @@ -244,4 +249,36 @@ public void evaluate_0000_trim_bogus() throws Exception { } } } + + private long getExecTime(SourceIterator iterator) { + long start = System.currentTimeMillis(); + + iterator.forEachRemaining(new Consumer<>() { + int count = 0; + + @Override + public void accept(Record source) { + if (count < 10) { + System.out.println(((CSVRecord) source).getData()); + } + count++; + } + }); + + long end = System.currentTimeMillis(); + return end - start; + } + + + @Test + @Disabled + public void execTime() throws Exception { + Access access = new LocalFileAccess("/home/messik/Work/large_files/large_csv/taxonmappings/joined.tsv", "", "tsv", "UTF-8"); + CSVWConfiguration config = CSVWConfiguration.builder().withDelimiter('\t').build(); + + try(SourceIterator iterator = new CSVWSourceIterator(access, config)) { + long execTime = getExecTime(iterator); + System.out.printf("Execution took %f seconds", execTime / 1000.0); + } + } } diff --git a/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java b/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java index 348ddb4..4e95230 100644 --- a/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java +++ b/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java @@ -1,6 +1,7 @@ package be.ugent.idlab.knows.dataio.utility; import be.ugent.idlab.knows.dataio.utils.CSVNullInjector; +import be.ugent.idlab.knows.dataio.utils.CSVNullInjector2; import org.junit.jupiter.api.Test; import java.io.ByteArrayInputStream; @@ -18,7 +19,7 @@ public class CSVNullInjectorTest { public void testInsertion() throws IOException { String testString = "ID,,Foo"; InputStream input = new ByteArrayInputStream(testString.getBytes()); - CSVNullInjector injector = new CSVNullInjector(input); + CSVNullInjector injector = new CSVNullInjector2(input); // read out the injector String output = new String(injector.readAllBytes()); @@ -29,7 +30,7 @@ public void testInsertion() throws IOException { public void customDelimiter() throws IOException { String testString = "ID;;Foo"; InputStream input = new ByteArrayInputStream(testString.getBytes()); - CSVNullInjector injector = new CSVNullInjector(input, ';', '"'); + CSVNullInjector injector = new CSVNullInjector2(input, ';', '"'); // read out the injector String output = new String(injector.readAllBytes()); @@ -43,7 +44,7 @@ public void customDelimiter() throws IOException { public void ignoreQuotedSeparator() throws IOException { String testString = "ID,\",, ,\",Foo"; InputStream input = new ByteArrayInputStream(testString.getBytes()); - CSVNullInjector injector = new CSVNullInjector(input); + CSVNullInjector injector = new CSVNullInjector2(input); String output = new String(injector.readAllBytes()); assertEquals( testString, output); @@ -56,9 +57,23 @@ public void ignoreQuotedSeparator() throws IOException { public void escapedQuote() throws IOException { String testString = "\"aaa\",\"b\"\",,bb\",,\"ccc\""; InputStream input = new ByteArrayInputStream(testString.getBytes()); - CSVNullInjector injector = new CSVNullInjector(input); + CSVNullInjector injector = new CSVNullInjector2(input); String output = new String(injector.readAllBytes()); assertEquals("\"aaa\",\"b\"\",,bb\",DATAIO_INJECTED_NULL_VALUE,\"ccc\"", output); } + + /** + * Tests injection of null value at the start of the string + */ + @Test + public void emptyStart() throws IOException { + String testString = ",Foo,Bar"; + InputStream input = new ByteArrayInputStream(testString.getBytes()); + CSVNullInjector injector = new CSVNullInjector2(input); + + String output = new String(injector.readAllBytes()); + assertEquals("DATAIO_INJECTED_NULL_VALUE,Foo,Bar", output); + + } } From 88a31e3b95503d1b14eba856d28ccb5358825f83 Mon Sep 17 00:00:00 2001 From: Jozef Jankaj Date: Mon, 23 Oct 2023 20:03:57 +0200 Subject: [PATCH 06/13] chore: reimplement injector with Java native buffer --- .../dataio/iterators/CSVWSourceIterator.java | 7 +- .../dataio/iterators/CSVWSourceIterator2.java | 162 ------------------ .../iterators/csvw/CSVWConfiguration.java | 15 +- .../idlab/knows/dataio/record/CSVRecord.java | 2 +- .../knows/dataio/utils/CSVNullInjector.java | 161 +++++++++++------ .../knows/dataio/utils/CSVNullInjector2.java | 149 ---------------- .../dataio/iterator/CSVWIteratorTest.java | 1 - .../dataio/utility/CSVNullInjectorTest.java | 99 +++++++---- 8 files changed, 178 insertions(+), 418 deletions(-) delete mode 100644 src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator2.java delete mode 100644 src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector2.java diff --git a/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java b/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java index 8a0d59c..fe87415 100644 --- a/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java +++ b/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java @@ -5,7 +5,6 @@ import be.ugent.idlab.knows.dataio.record.CSVRecord; import be.ugent.idlab.knows.dataio.record.Record; import be.ugent.idlab.knows.dataio.utils.CSVNullInjector; -import be.ugent.idlab.knows.dataio.utils.CSVNullInjector2; import org.simpleflatmapper.lightningcsv.CsvParser; import java.io.IOException; @@ -38,9 +37,9 @@ private void readObject(ObjectInputStream inputStream) throws Exception { } private void bootstrap() throws Exception { - this.inputReader = new InputStreamReader(new CSVNullInjector2(access.getInputStream(), this.config.getDelimiter(), this.config.getQuoteCharacter())); -// this.inputReader = access.getInputStreamReader(); - CsvParser.DSL parser = config.getSFMParser(1024 * 16); + int bufferSize = 1024 * 128; // 128 KiB + this.inputReader = new InputStreamReader(new CSVNullInjector(this.access.getInputStream(), bufferSize, this.config.getDelimiter(), this.config.getQuoteCharacter())); + CsvParser.DSL parser = config.getSFMParser(bufferSize); this.iterator = parser.iterator(this.inputReader); if (this.config.isSkipHeader()) { diff --git a/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator2.java b/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator2.java deleted file mode 100644 index a888dce..0000000 --- a/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator2.java +++ /dev/null @@ -1,162 +0,0 @@ -package be.ugent.idlab.knows.dataio.iterators; - -import be.ugent.idlab.knows.dataio.access.Access; -import be.ugent.idlab.knows.dataio.iterators.csvw.CSVWConfiguration; -import be.ugent.idlab.knows.dataio.record.CSVRecord; -import be.ugent.idlab.knows.dataio.record.Record; -import com.opencsv.CSVReader; -import com.opencsv.CSVReaderBuilder; -import com.opencsv.exceptions.CsvValidationException; - -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.ObjectInputStream; -import java.util.Arrays; -import java.util.Map; -import java.util.NoSuchElementException; - -public class CSVWSourceIterator2 extends SourceIterator { - private static final long serialVersionUID = -5824558388620967495L; - private final Access access; - private final CSVWConfiguration config; - private transient String[] header; - private transient String[] next; - private transient CSVReader reader; - - public CSVWSourceIterator2(Access access, CSVWConfiguration config) throws Exception { - this.access = access; - this.config = config; - this.bootstrap(); - } - - private void readObject(ObjectInputStream inputStream) throws Exception { - inputStream.defaultReadObject(); - this.bootstrap(); - } - - /** - * Instantiates transient fields. This code needs to be run both at construction time and after deserialization - */ - private void bootstrap() throws Exception { - this.reader = new CSVReaderBuilder(new InputStreamReader(access.getInputStream(), config.getEncoding())) - .withCSVParser(this.config.getOpenCSVParser()) - .withSkipLines(this.config.isSkipHeader() ? 1 : 0) - .build(); - - if (this.config.isSkipHeader()) { - this.header = config.getHeader().toArray(new String[0]); - } else { - this.header = readLine(); - - if (header == null) { - throw new IllegalStateException("Unable to read the file!"); - } - } - - this.next = readLine(); - } - - private String[] readLine() throws IOException { - String[] line; - do { - try { - line = this.reader.readNext(); - - if (line == null) { - return null; - } - } catch (CsvValidationException e) { - throw new IllegalArgumentException(String.format("File does not conform to configuration! Offending line: %s", Arrays.toString(this.reader.peek()))); - } - } while (invalidLine(line)); - - return line; - } - - /** - * Checks if the passed line corresponds to the filters set - * A line is considered valid if it doesn't start with the comment prefix - * If the first value is null, the line is accepted - * - * @param line line to be checked - * @return true if the line passes all checks - */ - private boolean invalidLine(String[] line) { - return line[0] != null && line[0].startsWith(this.config.getCommentPrefix()); - } - - /** - * Checks if @record has a string value which is in the nulls list, if so sets this value to null in the data map. - * - * @param record record to be checked - * @return - */ - public CSVRecord replaceNulls(CSVRecord record) { - Map data = record.getData(); - data.forEach((key, value) -> { - if (value != null && this.config.getNulls().contains(value)) { - data.put(key, null); - } - }); - return record; - } - - public String[] applyTrimArray(String[] arr, String trim) { - return Arrays.stream(arr) - .map(item -> applyTrim(item, trim)) - .toArray(String[]::new); - } - - public String applyTrim(String item, boolean trim) { - if (trim) { - return item.trim(); - } - - return item; - } - - public String applyTrim(String item, String trim) { - switch (trim) { - case "true": - return item.trim(); - case "false": - return item; - case "start": - return item.stripLeading(); - case "end": - return item.stripTrailing(); - default: - throw new IllegalArgumentException("Unrecognized value for flag \"trim\""); - } - } - - @Override - public Record next() { - if (this.next == null) { - throw new NoSuchElementException(); - } - - String[] line = this.next; - try { - this.next = readLine(); - } catch (IOException e) { - throw new RuntimeException(e); - } - - if (!config.getTrim().equals("false")) { - line = applyTrimArray(line, config.getTrim()); - } - - return replaceNulls(new CSVRecord(header, line, this.access.getDataTypes())); - } - - @Override - public boolean hasNext() { - return this.next != null; - } - - @Override - public void close() throws IOException { - this.reader.close(); - } -} diff --git a/src/main/java/be/ugent/idlab/knows/dataio/iterators/csvw/CSVWConfiguration.java b/src/main/java/be/ugent/idlab/knows/dataio/iterators/csvw/CSVWConfiguration.java index cfa612a..2bbc9d7 100644 --- a/src/main/java/be/ugent/idlab/knows/dataio/iterators/csvw/CSVWConfiguration.java +++ b/src/main/java/be/ugent/idlab/knows/dataio/iterators/csvw/CSVWConfiguration.java @@ -1,10 +1,8 @@ package be.ugent.idlab.knows.dataio.iterators.csvw; +import be.ugent.idlab.knows.dataio.utils.CSVNullInjector; import org.simpleflatmapper.lightningcsv.CsvParser; -import com.opencsv.CSVParser; -import com.opencsv.CSVParserBuilder; -import com.opencsv.enums.CSVReaderNullFieldIndicator; import java.io.Serializable; import java.util.ArrayList; @@ -52,7 +50,7 @@ public final class CSVWConfiguration implements Serializable { this.header = header; List nullValues = new ArrayList<>(nulls); - nullValues.add("DATAIO_INJECTED_NULL_VALUE"); // add our special null value + nullValues.add(CSVNullInjector.NULL_VALUE); // add our special null value this.nulls = nullValues; this.encoding = encoding; @@ -105,13 +103,4 @@ public CsvParser.DSL getSFMParser(int bufferSize) { .quote(this.quoteCharacter) .bufferSize(bufferSize); } - - public CSVParser getOpenCSVParser() { - return new CSVParserBuilder() - .withSeparator(this.delimiter) - .withEscapeChar(this.escapeCharacter) - .withQuoteChar(this.quoteCharacter) - .withFieldAsNull(CSVReaderNullFieldIndicator.EMPTY_SEPARATORS) - .build(); - } } diff --git a/src/main/java/be/ugent/idlab/knows/dataio/record/CSVRecord.java b/src/main/java/be/ugent/idlab/knows/dataio/record/CSVRecord.java index ef3d664..7ba180f 100644 --- a/src/main/java/be/ugent/idlab/knows/dataio/record/CSVRecord.java +++ b/src/main/java/be/ugent/idlab/knows/dataio/record/CSVRecord.java @@ -15,7 +15,7 @@ public class CSVRecord extends Record { private final Map datatypes; public CSVRecord(String[] header, String[] data, Map datatypes) { - this.data = new HashMap<>(); + this.data = new HashMap<>(header.length); if (header.length > data.length) { logger.warn("Header has more columns than this row"); } diff --git a/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java b/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java index c730a1c..ae2718b 100644 --- a/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java +++ b/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java @@ -1,95 +1,144 @@ package be.ugent.idlab.knows.dataio.utils; -import com.drew.lang.annotations.NotNull; -import io.reactivex.rxjava3.annotations.NonNull; - import java.io.IOException; import java.io.InputStream; -import java.nio.charset.Charset; -import java.nio.charset.StandardCharsets; -import java.util.ArrayDeque; -import java.util.Deque; +import java.nio.ByteBuffer; /** * Injects a known NULL value between two commas in CSV. - * Inspired by this answer on SO + * Inspired by this answer on SO, written with Java's native buffers. */ public class CSVNullInjector extends InputStream { - - private static final byte[] NULL_VALUE = "DATAIO_INJECTED_NULL_VALUE".getBytes(StandardCharsets.UTF_8); + public static final String NULL_VALUE = "DATAIO_NULL"; + private static final int BUFFER_SIZE = 1024 * 128; + private final ByteBuffer nullBuffer; + private final ByteBuffer inputBuffer; private final InputStream inputStream; - private final Deque backBuffer = new ArrayDeque<>(); private final char delimiter; private final char quoteCharacter; private boolean quoteMode = false; - private boolean firstRead = true; + private boolean newLine = true; - public CSVNullInjector(InputStream in, char delimiter, char quoteCharacter) { - this.inputStream = in; + public CSVNullInjector(InputStream inputStream, int bufferSize, char delimiter, char quoteCharacter) throws IOException { + this.nullBuffer = ByteBuffer.allocate(NULL_VALUE.length()); + this.inputBuffer = ByteBuffer.allocate(BUFFER_SIZE); + this.inputStream = inputStream; this.delimiter = delimiter; this.quoteCharacter = quoteCharacter; + + // initialise null buffer + this.nullBuffer.put(NULL_VALUE.getBytes()); + + // initialise input buffer + byte[] bytes = this.inputStream.readNBytes(BUFFER_SIZE); + this.inputBuffer.put(bytes); + this.inputBuffer.flip(); } - /** - * Constructor with default delimiter and quote character for CSV - * - * @param in inputstream to read from - */ - public CSVNullInjector(InputStream in) { - this(in, ',', '"'); + public CSVNullInjector(InputStream inputStream, int bufferSize) throws IOException { + this(inputStream, bufferSize, ',', '"'); } @Override public int read() throws IOException { - if (!backBuffer.isEmpty()) { - char value = (char) (int) backBuffer.pop(); - if (value == quoteCharacter) { - quoteMode = !quoteMode; + return getNextByte() & 0xFF; + } + + @Override + public int read(byte[] b) throws IOException { + return this.read(b, 0, b.length); + } + + @Override + public int read(byte[] b, int off, int len) throws IOException { + int i = off; + for (; i < len; i++) { + byte b1 = getNextByte(); + if (b1 == -1) { + if (i == off) { + return -1; // return -1 to let the caller known no more data is available + } + break; } - return value; + b[i] = b1; } + return i - off; + } - int first = this.inputStream.read(); + /** + * Fetches next byte to be returned in by the injector. + * This byte could come from either the nullBuffer or the inputBuffer, depending on the state of the injector + * + * @return the next byte + */ + private byte getNextByte() throws IOException { + if (this.nullBuffer.hasRemaining()) { + return this.nullBuffer.get(); + } - if (first == '\n') { - firstRead = true; - return first; + if (!this.ensureInput()) { + // nothing more in the inputBuffer + return -1; } - if (first == quoteCharacter) { - quoteMode = !quoteMode; - return first; + byte b = this.inputBuffer.get(); + + // specific case when we're on a new line and first character is the delimiter + // -> there's a missing null value that must be injected + if (this.newLine && b == this.delimiter) { + this.inputBuffer.position(this.inputBuffer.position() - 1); + this.nullBuffer.flip(); + this.newLine = false; + return this.nullBuffer.get(); } - if (first == delimiter) { - if (quoteMode) { // if in quote mode, do not inject anything - return first; - } + this.newLine = false; - if (firstRead) { - backBuffer.push((byte) this.delimiter); - addNullToBackBuffer(); - firstRead = false; - return (int) backBuffer.pop(); - } else { - int second = this.inputStream.read(); - - if (second != delimiter) { - backBuffer.push((byte) (char) second); - } else { - backBuffer.push((byte) this.delimiter); - addNullToBackBuffer(); - } + if (b == this.quoteCharacter) { + // toggle quote mode + this.quoteMode = !this.quoteMode; + } + + if (b == '\n') { + this.newLine = true; + return b; + } + + if (quoteMode) { // if in quote mode, immediately return + return b; + } + + if (b == this.delimiter) { + // look for second delimiter + if (!this.ensureInput()) { + // last byte of the input is a delimiter, add one last null value + this.nullBuffer.flip(); + return b; + } + // not the last byte, check the next + byte b1 = this.inputBuffer.get(this.inputBuffer.position()); + if (b1 == this.delimiter) { + // two delimiters, add a null value + this.nullBuffer.flip(); + return b; // return the original } } - firstRead = false; - return first; + return b; } - private void addNullToBackBuffer() { - for (int i = NULL_VALUE.length - 1; i >= 0; i--) { - backBuffer.push(NULL_VALUE[i]); + private boolean ensureInput() throws IOException { + if (this.inputBuffer.hasRemaining()) { + return true; } + + int count = this.inputStream.read(this.inputBuffer.array()); + if (count < 1) { // no bytes available + return false; + } + + this.inputBuffer.flip(); + + return true; } } diff --git a/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector2.java b/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector2.java deleted file mode 100644 index d253650..0000000 --- a/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector2.java +++ /dev/null @@ -1,149 +0,0 @@ -package be.ugent.idlab.knows.dataio.utils; - -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayDeque; -import java.util.Deque; - -public class CSVNullInjector2 extends CSVNullInjector { - private static final byte[] NULL_VALUE = "DATAIO_INJECTED_NULL_VALUE".getBytes(); - private static final int BUFFER_SIZE = 1024 * 16; // 16 KiB - private final InputStream inputStream; - private final char delimiter; - private final char quoteCharacter; - private final Buffer buffer; - private final Deque backBuffer = new ArrayDeque<>(NULL_VALUE.length); - private boolean quoteMode; - private boolean newLine = true; - - public CSVNullInjector2(InputStream inputStream, char delimiter, char quoteCharacter) { - super(inputStream, delimiter, quoteCharacter); - this.inputStream = inputStream; - this.delimiter = delimiter; - this.quoteCharacter = quoteCharacter; - - this.buffer = new Buffer(inputStream, BUFFER_SIZE); // 8 KiB - } - - public CSVNullInjector2(InputStream inputStream) { - this(inputStream, ',', '"'); - } - - @Override - public int read() throws IOException { - return this.inputStream.read(); - } - - @Override - public int read(byte[] b, int off, int len) throws IOException { - int i = off; - for (; i < len; i++) { - if (!backBuffer.isEmpty()) { - b[i] = backBuffer.pop(); - } else { - byte b1 = buffer.pop(); - if (b1 == -1) { - // nothing further could be read, abort - if (i == off) { // if no data was read at all - return -1; - } - break; - } - - if (newLine && b1 == this.delimiter) { - backBuffer.push((byte) this.delimiter); - addNullToBackBuffer(); - b[i] = backBuffer.pop(); - } else { - b[i] = b1; - - // check for a new line - if (b1 == '\n') { - newLine = true; - } else if (newLine) { - newLine = false; - } - - if (b1 == quoteCharacter) { - quoteMode = !quoteMode; - } else if (b1 == delimiter) { - if (!quoteMode) { - if (newLine) { - backBuffer.push((byte) this.delimiter); - addNullToBackBuffer(); - newLine = false; - b[i] = backBuffer.pop(); - } else { - byte second = buffer.peek(); - if (second == this.delimiter) { - addNullToBackBuffer(); - } - } - } - } - } - } - } - - return i - off; - } - - private void addNullToBackBuffer() { - for (int i = NULL_VALUE.length - 1; i >= 0; i--) { - backBuffer.push(NULL_VALUE[i]); - } - } - - /** - * Class serving as a buffer of bytes - */ - private static class Buffer { - InputStream inputStream; - byte[] buffer; // buffer to hold the data - int cursor; // cursor in the buffer - int actualSize; // end of the buffer - - public Buffer(InputStream inputStream, int bufferSize) { - this.inputStream = inputStream; - this.buffer = new byte[bufferSize]; - this.cursor = 0; - } - - /** - * Pop a byte off the buffer when requested - * - * @return -1 if nothing more is available, otherwise the byte from the top of the buffer - */ - public byte pop() throws IOException { - if (this.cursor >= this.actualSize) { // if end of current buffer - if (!loadNewData()) { - return -1; - } - } - - byte out = this.buffer[this.cursor]; - cursor++; - return out; - } - - public byte peek() throws IOException { - if (this.cursor >= this.actualSize) { - if (!loadNewData()) { - return -1; - } - } - - return this.buffer[this.cursor]; - } - - private boolean loadNewData() throws IOException { - this.actualSize = this.inputStream.read(this.buffer, 0, buffer.length); - if (this.actualSize == -1) { // if nothing further was read from the input stream - return false; - } - - this.cursor = 0; - return true; - } - } -} diff --git a/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVWIteratorTest.java b/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVWIteratorTest.java index fbe5255..b8ac2df 100644 --- a/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVWIteratorTest.java +++ b/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVWIteratorTest.java @@ -4,7 +4,6 @@ import be.ugent.idlab.knows.dataio.access.LocalFileAccess; import be.ugent.idlab.knows.dataio.cores.TestCore; import be.ugent.idlab.knows.dataio.iterators.CSVWSourceIterator; -import be.ugent.idlab.knows.dataio.iterators.CSVWSourceIterator2; import be.ugent.idlab.knows.dataio.iterators.SourceIterator; import be.ugent.idlab.knows.dataio.iterators.csvw.CSVWConfiguration; import be.ugent.idlab.knows.dataio.record.CSVRecord; diff --git a/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java b/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java index 4e95230..b36c7fa 100644 --- a/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java +++ b/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java @@ -1,7 +1,6 @@ package be.ugent.idlab.knows.dataio.utility; import be.ugent.idlab.knows.dataio.utils.CSVNullInjector; -import be.ugent.idlab.knows.dataio.utils.CSVNullInjector2; import org.junit.jupiter.api.Test; import java.io.ByteArrayInputStream; @@ -11,69 +10,105 @@ import static org.junit.jupiter.api.Assertions.assertEquals; public class CSVNullInjectorTest { + private String getProcessedString(String inputString) throws IOException { + InputStream input = new ByteArrayInputStream(inputString.getBytes()); + return new String(new CSVNullInjector(input, 1024 * 128, ',', '"').readAllBytes()); + } /** - * Simple test to replace the value in between two commas + * Tests a simple insertion in between two delimiters + * @throws IOException */ @Test public void testInsertion() throws IOException { String testString = "ID,,Foo"; - InputStream input = new ByteArrayInputStream(testString.getBytes()); - CSVNullInjector injector = new CSVNullInjector2(input); - - // read out the injector - String output = new String(injector.readAllBytes()); - assertEquals("ID,DATAIO_INJECTED_NULL_VALUE,Foo", output); + String output = getProcessedString(testString); + String expected = "ID,%s,Foo".replaceAll("%s", CSVNullInjector.NULL_VALUE); + assertEquals(expected, output); } + /** + * Tests an insertion between two custom delimiters + * @throws IOException + */ @Test public void customDelimiter() throws IOException { String testString = "ID;;Foo"; - InputStream input = new ByteArrayInputStream(testString.getBytes()); - CSVNullInjector injector = new CSVNullInjector2(input, ';', '"'); - - // read out the injector + InputStream in = new ByteArrayInputStream(testString.getBytes()); + CSVNullInjector injector = new CSVNullInjector(in, 1024 * 128,';', '"'); String output = new String(injector.readAllBytes()); - assertEquals("ID;DATAIO_INJECTED_NULL_VALUE;Foo", output); + String expected = "ID;%s;Foo".replaceAll("%s", CSVNullInjector.NULL_VALUE); + assertEquals(expected, output); + } + + /** + * Tests injection of null value at the start of the string + * @throws IOException + */ + @Test + public void emptyStart() throws IOException { + String testString = ",Foo,Bar"; + String output = getProcessedString(testString); + String expected = "%s,Foo,Bar".replaceAll("%s", CSVNullInjector.NULL_VALUE); + assertEquals(expected, output); + } + + /** + * Tests insertion of null value at the end of the string + * @throws IOException + */ + @Test + public void emptyEnd() throws IOException { + String testString = "Foo,Bar,"; + String output = getProcessedString(testString); + String expected = "Foo,Bar,%s".replaceAll("%s", CSVNullInjector.NULL_VALUE); + assertEquals(expected, output); } /** - * Tests ignoring quoted separators. + * Tests ignoring of quoted separators */ @Test public void ignoreQuotedSeparator() throws IOException { String testString = "ID,\",, ,\",Foo"; - InputStream input = new ByteArrayInputStream(testString.getBytes()); - CSVNullInjector injector = new CSVNullInjector2(input); - - String output = new String(injector.readAllBytes()); - assertEquals( testString, output); + String output = getProcessedString(testString); + assertEquals(testString, output); } /** * Tests correct injection for escaped quotes + * Input: "aaa","b"",,bb",,"ccc" + * Output: "aaa","b"",,bb",${nullValue},"ccc" */ @Test public void escapedQuote() throws IOException { String testString = "\"aaa\",\"b\"\",,bb\",,\"ccc\""; - InputStream input = new ByteArrayInputStream(testString.getBytes()); - CSVNullInjector injector = new CSVNullInjector2(input); - - String output = new String(injector.readAllBytes()); - assertEquals("\"aaa\",\"b\"\",,bb\",DATAIO_INJECTED_NULL_VALUE,\"ccc\"", output); + String output = getProcessedString(testString); + String expected = "\"aaa\",\"b\"\",,bb\",%s,\"ccc\"".replaceAll("%s", CSVNullInjector.NULL_VALUE); + assertEquals(expected, output); } /** - * Tests injection of null value at the start of the string + * Tests the injector's correct recognition of Linux newlines. + * @throws IOException */ @Test - public void emptyStart() throws IOException { - String testString = ",Foo,Bar"; - InputStream input = new ByteArrayInputStream(testString.getBytes()); - CSVNullInjector injector = new CSVNullInjector2(input); - - String output = new String(injector.readAllBytes()); - assertEquals("DATAIO_INJECTED_NULL_VALUE,Foo,Bar", output); + public void unixNewLine() throws IOException { + String testString = "Foo,,Bar\n,B"; + String output = getProcessedString(testString); + String expected = "Foo,%s,Bar\n%s,B".replaceAll("%s", CSVNullInjector.NULL_VALUE); + assertEquals(expected, output); + } + /** + * Tests the injector's correct recognition of Windows newlines. + * @throws IOException + */ + @Test + public void windowsNewLine() throws IOException { + String testString = "Foo,,Bar\r\n,B"; + String output = getProcessedString(testString); + String expected = "Foo,%s,Bar\r\n%s,B".replaceAll("%s", CSVNullInjector.NULL_VALUE); + assertEquals(expected, output); } } From a247fdfe18339860b4b1e278e2f7529366f3e943 Mon Sep 17 00:00:00 2001 From: Jozef Jankaj Date: Wed, 25 Oct 2023 15:13:50 +0200 Subject: [PATCH 07/13] chore: fix encoding issues NullInjector's way of reporting no more bytes was through reporting a -1. This however clashed with characters that are longer than what Java's byte can represent. A new record ReadingResult is used to report back both the success or failure of the reading and the byte itself. In case of reading failure, the result field is by default set to -1. --- pom.xml | 4 +- .../dataio/iterators/CSVWSourceIterator.java | 7 ++- .../knows/dataio/utils/CSVNullInjector.java | 63 ++++++++++++------- .../dataio/utility/CSVNullInjectorTest.java | 5 +- 4 files changed, 48 insertions(+), 31 deletions(-) diff --git a/pom.xml b/pom.xml index 5beb2b0..aac6761 100644 --- a/pom.xml +++ b/pom.xml @@ -282,8 +282,8 @@ maven-compiler-plugin 3.10.1 - 11 - 11 + 17 + 17 diff --git a/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java b/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java index fe87415..2d01d98 100644 --- a/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java +++ b/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java @@ -18,6 +18,7 @@ public class CSVWSourceIterator extends SourceIterator { private static final long serialVersionUID = -5824558388620967495L; + private static final int BUFFER_SIZE = 1024 * 128; // 128 KiB private final Access access; private final CSVWConfiguration config; private transient String[] header; @@ -37,9 +38,9 @@ private void readObject(ObjectInputStream inputStream) throws Exception { } private void bootstrap() throws Exception { - int bufferSize = 1024 * 128; // 128 KiB - this.inputReader = new InputStreamReader(new CSVNullInjector(this.access.getInputStream(), bufferSize, this.config.getDelimiter(), this.config.getQuoteCharacter())); - CsvParser.DSL parser = config.getSFMParser(bufferSize); + CSVNullInjector injector = new CSVNullInjector(this.access.getInputStream(), BUFFER_SIZE, this.config.getDelimiter(), this.config.getQuoteCharacter()); + this.inputReader = new InputStreamReader(injector, this.config.getEncoding()); + CsvParser.DSL parser = config.getSFMParser(BUFFER_SIZE); this.iterator = parser.iterator(this.inputReader); if (this.config.isSkipHeader()) { diff --git a/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java b/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java index ae2718b..81a3218 100644 --- a/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java +++ b/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java @@ -10,7 +10,6 @@ */ public class CSVNullInjector extends InputStream { public static final String NULL_VALUE = "DATAIO_NULL"; - private static final int BUFFER_SIZE = 1024 * 128; private final ByteBuffer nullBuffer; private final ByteBuffer inputBuffer; private final InputStream inputStream; @@ -18,10 +17,11 @@ public class CSVNullInjector extends InputStream { private final char quoteCharacter; private boolean quoteMode = false; private boolean newLine = true; + private byte lastByte = 0; public CSVNullInjector(InputStream inputStream, int bufferSize, char delimiter, char quoteCharacter) throws IOException { this.nullBuffer = ByteBuffer.allocate(NULL_VALUE.length()); - this.inputBuffer = ByteBuffer.allocate(BUFFER_SIZE); + this.inputBuffer = ByteBuffer.allocate(bufferSize); this.inputStream = inputStream; this.delimiter = delimiter; this.quoteCharacter = quoteCharacter; @@ -30,9 +30,10 @@ public CSVNullInjector(InputStream inputStream, int bufferSize, char delimiter, this.nullBuffer.put(NULL_VALUE.getBytes()); // initialise input buffer - byte[] bytes = this.inputStream.readNBytes(BUFFER_SIZE); - this.inputBuffer.put(bytes); - this.inputBuffer.flip(); + int count = this.inputStream.read(this.inputBuffer.array()); + if (count > 0) { + this.inputBuffer.limit(count); + } } public CSVNullInjector(InputStream inputStream, int bufferSize) throws IOException { @@ -41,7 +42,11 @@ public CSVNullInjector(InputStream inputStream, int bufferSize) throws IOExcepti @Override public int read() throws IOException { - return getNextByte() & 0xFF; + ReadingResult rr = getNextByte(); + if (rr.valid) { + return rr.result & 0xFF; // sign extend the byte + } + return -1; // nothing could be read } @Override @@ -53,14 +58,14 @@ public int read(byte[] b) throws IOException { public int read(byte[] b, int off, int len) throws IOException { int i = off; for (; i < len; i++) { - byte b1 = getNextByte(); - if (b1 == -1) { + ReadingResult rr = getNextByte(); + if (!rr.valid()) { if (i == off) { return -1; // return -1 to let the caller known no more data is available } break; } - b[i] = b1; + b[i] = rr.result(); } return i - off; } @@ -71,74 +76,84 @@ public int read(byte[] b, int off, int len) throws IOException { * * @return the next byte */ - private byte getNextByte() throws IOException { + private ReadingResult getNextByte() throws IOException { if (this.nullBuffer.hasRemaining()) { - return this.nullBuffer.get(); + this.lastByte = this.nullBuffer.get(); + return new ReadingResult(true, this.lastByte); } if (!this.ensureInput()) { // nothing more in the inputBuffer - return -1; + return new ReadingResult(false, (byte) -1); } - byte b = this.inputBuffer.get(); + byte currentByte = this.inputBuffer.get(); // specific case when we're on a new line and first character is the delimiter // -> there's a missing null value that must be injected - if (this.newLine && b == this.delimiter) { + if (this.newLine && currentByte == this.delimiter) { this.inputBuffer.position(this.inputBuffer.position() - 1); this.nullBuffer.flip(); this.newLine = false; - return this.nullBuffer.get(); + return new ReadingResult(true, this.nullBuffer.get()); } this.newLine = false; - if (b == this.quoteCharacter) { + if (currentByte == this.quoteCharacter) { // toggle quote mode this.quoteMode = !this.quoteMode; } - if (b == '\n') { + if (currentByte == '\n') { this.newLine = true; - return b; + return new ReadingResult(true, currentByte); } if (quoteMode) { // if in quote mode, immediately return - return b; + return new ReadingResult(true, currentByte); } - if (b == this.delimiter) { + if (currentByte == this.delimiter) { // look for second delimiter if (!this.ensureInput()) { // last byte of the input is a delimiter, add one last null value this.nullBuffer.flip(); - return b; + return new ReadingResult(true, currentByte); } // not the last byte, check the next byte b1 = this.inputBuffer.get(this.inputBuffer.position()); if (b1 == this.delimiter) { // two delimiters, add a null value this.nullBuffer.flip(); - return b; // return the original + return new ReadingResult(true, currentByte); // return the original } } - return b; + return new ReadingResult(true, currentByte); } private boolean ensureInput() throws IOException { if (this.inputBuffer.hasRemaining()) { return true; } - int count = this.inputStream.read(this.inputBuffer.array()); if (count < 1) { // no bytes available return false; } + this.inputBuffer.limit(count); + this.inputBuffer.flip(); return true; } + + @Override + public void close() throws IOException { + this.inputStream.close(); + } + + private record ReadingResult(boolean valid, byte result) { + } } diff --git a/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java b/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java index b36c7fa..a3442f9 100644 --- a/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java +++ b/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java @@ -6,6 +6,7 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; +import java.io.InputStreamReader; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -34,8 +35,8 @@ public void testInsertion() throws IOException { @Test public void customDelimiter() throws IOException { String testString = "ID;;Foo"; - InputStream in = new ByteArrayInputStream(testString.getBytes()); - CSVNullInjector injector = new CSVNullInjector(in, 1024 * 128,';', '"'); + InputStream input = new ByteArrayInputStream(testString.getBytes()); + CSVNullInjector injector = new CSVNullInjector(input, 1024 * 128,';', '"'); String output = new String(injector.readAllBytes()); String expected = "ID;%s;Foo".replaceAll("%s", CSVNullInjector.NULL_VALUE); assertEquals(expected, output); From 78cd486265da60663f95fbb64591290e98be87a9 Mon Sep 17 00:00:00 2001 From: Jozef Jankaj Date: Wed, 25 Oct 2023 16:55:41 +0200 Subject: [PATCH 08/13] chore: additional extensions for full Mapper support SFM is fast, but also very barebones: doesn't perform any processing. A string """Foo""" will simply be passed along and we need to perform the processing into "Foo" ourselves. A bug regarding repeated reading into the buffer was resolved. --- .../knows/dataio/access/COMPRESSION.java | 5 ++++ .../dataio/iterators/CSVWSourceIterator.java | 25 +++++++++--------- .../knows/dataio/utils/CSVNullInjector.java | 16 +++++++----- .../knows/dataio/access/LocalAccessTest.java | 3 ++- .../dataio/iterator/CSVIteratorTest.java | 26 +++++++++++++------ .../dataio/utility/CSVNullInjectorTest.java | 21 ++++++++++++--- src/test/resources/csv/danglingSeparator.csv | 3 +++ src/test/resources/csv/tripleQuotes.csv | 4 +++ 8 files changed, 71 insertions(+), 32 deletions(-) create mode 100644 src/main/java/be/ugent/idlab/knows/dataio/access/COMPRESSION.java create mode 100644 src/test/resources/csv/danglingSeparator.csv create mode 100644 src/test/resources/csv/tripleQuotes.csv diff --git a/src/main/java/be/ugent/idlab/knows/dataio/access/COMPRESSION.java b/src/main/java/be/ugent/idlab/knows/dataio/access/COMPRESSION.java new file mode 100644 index 0000000..50dea6f --- /dev/null +++ b/src/main/java/be/ugent/idlab/knows/dataio/access/COMPRESSION.java @@ -0,0 +1,5 @@ +package be.ugent.idlab.knows.dataio.access; + +public class COMPRESSION { + public static final String GZIP = "gzip"; +} \ No newline at end of file diff --git a/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java b/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java index 4fbc490..086c7ca 100644 --- a/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java +++ b/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java @@ -63,23 +63,24 @@ private String[] nextLine() { return null; } + // replace any occurrence of an escaped quote with a single quote + for (int i = 0; i < r.length; i++) { + String s = r[i]; + // trim the string that is quoted + if (s.startsWith("\"") && s.endsWith("\"")) { + s = s.substring(1, s.length() - 1); + } + + s = s.replaceAll("\"\"", "\""); + + r[i] = s; + } + return r; } return null; } - /** - * Checks if the passed line corresponds to the filters set - * A line is considered valid if it doesn't start with the comment prefix - * If the first value is null, the line is accepted - * - * @param line line to be checked - * @return true if the line passes all checks - */ - private boolean invalidLine(String[] line) { - return line[0] != null && line[0].startsWith(this.config.getCommentPrefix()); - } - /** * Checks if @record has a string value which is in the nulls list, if so sets this value to null in the data map. * diff --git a/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java b/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java index 81a3218..fca1105 100644 --- a/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java +++ b/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java @@ -17,7 +17,6 @@ public class CSVNullInjector extends InputStream { private final char quoteCharacter; private boolean quoteMode = false; private boolean newLine = true; - private byte lastByte = 0; public CSVNullInjector(InputStream inputStream, int bufferSize, char delimiter, char quoteCharacter) throws IOException { this.nullBuffer = ByteBuffer.allocate(NULL_VALUE.length()); @@ -78,8 +77,7 @@ public int read(byte[] b, int off, int len) throws IOException { */ private ReadingResult getNextByte() throws IOException { if (this.nullBuffer.hasRemaining()) { - this.lastByte = this.nullBuffer.get(); - return new ReadingResult(true, this.lastByte); + return new ReadingResult(true, this.nullBuffer.get()); } if (!this.ensureInput()) { @@ -123,8 +121,8 @@ private ReadingResult getNextByte() throws IOException { } // not the last byte, check the next byte b1 = this.inputBuffer.get(this.inputBuffer.position()); - if (b1 == this.delimiter) { - // two delimiters, add a null value + if (b1 == this.delimiter || b1 == '\n') { + // two delimiters or a newline => dangling delimiter, add a null value this.nullBuffer.flip(); return new ReadingResult(true, currentByte); // return the original } @@ -142,9 +140,8 @@ private boolean ensureInput() throws IOException { return false; } - this.inputBuffer.limit(count); - this.inputBuffer.flip(); + this.inputBuffer.limit(count); return true; } @@ -154,6 +151,11 @@ public void close() throws IOException { this.inputStream.close(); } + /** + * Record to communicate the result of the byte read and it's success + * @param valid true if the reading of the byte was successful and the result byte is usable, false otherwise + * @param result the result of reading + */ private record ReadingResult(boolean valid, byte result) { } } diff --git a/src/test/java/be/ugent/idlab/knows/dataio/access/LocalAccessTest.java b/src/test/java/be/ugent/idlab/knows/dataio/access/LocalAccessTest.java index 61b794e..0698249 100644 --- a/src/test/java/be/ugent/idlab/knows/dataio/access/LocalAccessTest.java +++ b/src/test/java/be/ugent/idlab/knows/dataio/access/LocalAccessTest.java @@ -9,6 +9,7 @@ import java.io.File; import java.io.FileNotFoundException; +import java.nio.file.NoSuchFileException; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.nio.file.Path; @@ -65,7 +66,7 @@ public void relativeToBase() throws IOException, SQLException { @Test public void nonExistentFile() { Access access = new LocalFileAccess("", "not_existing_file.csv", "csv", "utf-8"); - assertThrows(FileNotFoundException.class, () -> access.getInputStream()); + assertThrows(NoSuchFileException.class, access::getInputStream); } @EnabledOnOs(OS.WINDOWS) diff --git a/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVIteratorTest.java b/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVIteratorTest.java index fb3594c..134f18c 100644 --- a/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVIteratorTest.java +++ b/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVIteratorTest.java @@ -18,7 +18,7 @@ public class CSVIteratorTest extends TestCore { @Test public void evaluate_0000_CSV() throws Exception { - Access access = makeLocalAccess("/csv/0000.csv", "", "csv", "utf-8"); + Access access = makeLocalAccess("/csv/0000.csv", "", "csv", "UTF-8"); try (CSVSourceIterator iterator = new CSVSourceIterator(access)) { assertTrue(evaluate_0000(iterator)); } @@ -26,7 +26,7 @@ public void evaluate_0000_CSV() throws Exception { @Test public void evaluate_0001_CSV() throws Exception { - Access access = makeLocalAccess("/csv/0001.csv", "", "csv", "utf-8"); + Access access = makeLocalAccess("/csv/0001.csv", "", "csv", "UTF-8"); try (CSVSourceIterator iterator = new CSVSourceIterator(access)) { assertTrue(evaluate_0001(iterator)); } @@ -35,7 +35,7 @@ public void evaluate_0001_CSV() throws Exception { @Test @Disabled public void evaluate_1001_header_col_missing_CSV() throws Exception { - Access access = makeLocalAccess("/csv/1001_header_col_missing.csv", "", "csv", "utf-8"); + Access access = makeLocalAccess("/csv/1001_header_col_missing.csv", "", "csv", "UTF-8"); try (CSVSourceIterator csvSourceIterator = new CSVSourceIterator(access)) { //TODO should fail, check if it does } @@ -43,7 +43,7 @@ public void evaluate_1001_header_col_missing_CSV() throws Exception { @Test public void evaluate_1001_header_long_CSV() throws Exception { - Access access = makeLocalAccess("/csv/1001_header_long.csv", "", "csv", "utf-8"); + Access access = makeLocalAccess("/csv/1001_header_long.csv", "", "csv", "UTF-8"); try (CSVSourceIterator iterator = new CSVSourceIterator(access)) { assertTrue(evaluate_1001_header_long(iterator)); } @@ -51,7 +51,7 @@ public void evaluate_1001_header_long_CSV() throws Exception { @Test public void evaluate_1001_header_short_CSV() throws Exception { - Access access = makeLocalAccess("/csv/1001_header_short.csv", "", "csv", "utf-8"); + Access access = makeLocalAccess("/csv/1001_header_short.csv", "", "csv", "UTF-8"); try (CSVSourceIterator iterator = new CSVSourceIterator(access)) { assertTrue(evaluate_1001_header_short(iterator)); } @@ -59,7 +59,7 @@ public void evaluate_1001_header_short_CSV() throws Exception { @Test public void evaluate_0002_BOM_CSV() throws Exception { - Access access = makeLocalAccess("/csv/0002_BOM.csv", "", "csv", "utf-8"); + Access access = makeLocalAccess("/csv/0002_BOM.csv", "", "csv", "UTF-8"); try (CSVSourceIterator iterator = new CSVSourceIterator(access)) { assertTrue(evaluate_0002_BOM(iterator)); } @@ -67,7 +67,7 @@ public void evaluate_0002_BOM_CSV() throws Exception { @Test public void evaluate_empty_CSV() throws Exception { - Access access = makeLocalAccess("/csv/empty.csv", "", "csv", "utf-8"); + Access access = makeLocalAccess("/csv/empty.csv", "", "csv", "UTF-8"); try (CSVSourceIterator iterator = new CSVSourceIterator(access)) { assertTrue(evaluate_empty(iterator)); } @@ -75,7 +75,7 @@ public void evaluate_empty_CSV() throws Exception { @Test public void evaluateSparseInput() throws Exception { - Access access = makeLocalAccess("/csv/sparseInput.csv", "", "csv", "utf-8"); + Access access = makeLocalAccess("/csv/sparseInput.csv", "", "csv", "UTF-8"); try (CSVSourceIterator iterator = new CSVSourceIterator(access)) { assertTrue(iterator.hasNext()); @@ -114,4 +114,14 @@ public void test_missing_values() throws Exception { } } } + + @Test + public void test_triple_quotes() throws Exception { + Access access = new LocalFileAccess("csv/tripleQuotes.csv", "src/test/resources", "csv"); + try (CSVSourceIterator iterator = new CSVSourceIterator(access)) { + CSVRecord record = (CSVRecord) iterator.next(); + + assertEquals("BO", record.get("\"ISO 3166\"").get(0)); + } + } } diff --git a/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java b/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java index a3442f9..01262ef 100644 --- a/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java +++ b/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java @@ -3,10 +3,7 @@ import be.ugent.idlab.knows.dataio.utils.CSVNullInjector; import org.junit.jupiter.api.Test; -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; +import java.io.*; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -66,6 +63,22 @@ public void emptyEnd() throws IOException { assertEquals(expected, output); } + @Test + public void danglingSeparator() throws IOException { + String testString = """ + "ID","Name","DateOfBirth" + "1","Alice", + "2","Bob","September, 2010" + """; + String expected = """ + "ID","Name","DateOfBirth" + "1","Alice",%s + "2","Bob","September, 2010" + """.replaceAll("%s", CSVNullInjector.NULL_VALUE); + String actual = getProcessedString(testString); + assertEquals(expected, actual); + } + /** * Tests ignoring of quoted separators */ diff --git a/src/test/resources/csv/danglingSeparator.csv b/src/test/resources/csv/danglingSeparator.csv new file mode 100644 index 0000000..88c7904 --- /dev/null +++ b/src/test/resources/csv/danglingSeparator.csv @@ -0,0 +1,3 @@ +"ID","Name","DateOfBirth" +"1","Alice", +"2","Bob","September, 2010" \ No newline at end of file diff --git a/src/test/resources/csv/tripleQuotes.csv b/src/test/resources/csv/tripleQuotes.csv new file mode 100644 index 0000000..559d014 --- /dev/null +++ b/src/test/resources/csv/tripleQuotes.csv @@ -0,0 +1,4 @@ +Country Code,Name,"""ISO 3166""" +1,"Bolivia, Plurinational State of",BO +2,Ireland,IE +3,Saint Martin (French part),MF From f7e1867bd1b84492edcba1142efee25bab8e9ae0 Mon Sep 17 00:00:00 2001 From: Jozef Jankaj Date: Thu, 26 Oct 2023 14:10:38 +0200 Subject: [PATCH 09/13] update CHANGELOG --- CHANGELOG.md | 1 + pom.xml | 17 +++++------------ 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 301e4d5..0a91b91 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Require Java 17 (or more recent) +- Use SFM for CSV parsing ### Fixed - Updated Maven Surefire plugin to 3.1.2 diff --git a/pom.xml b/pom.xml index 5bd47c9..1387ff7 100644 --- a/pom.xml +++ b/pom.xml @@ -71,6 +71,11 @@ opencsv 5.8 + + org.simpleflatmapper + sfm-csv + 8.2.3 + @@ -279,18 +284,6 @@ ${fuseki.version} test - - - io.reactivex.rxjava3 - rxjava - 3.1.5 - - - - org.simpleflatmapper - sfm-csv - 8.2.3 - org.apache.jena jena-fuseki-main From d6e92f0694c35b1c6474a011d723c8f5e6e4e38e Mon Sep 17 00:00:00 2001 From: Jozef Jankaj Date: Thu, 26 Oct 2023 15:44:13 +0200 Subject: [PATCH 10/13] chore: various code quality improvements - explicitly set NullInjector arguments type to byte - drop nullity check in CSVWSourceIterator::replaceNulls, as it's no longer necessary --- .../idlab/knows/dataio/iterators/CSVWSourceIterator.java | 5 ++--- .../ugent/idlab/knows/dataio/utils/CSVNullInjector.java | 8 ++++---- .../idlab/knows/dataio/utility/CSVNullInjectorTest.java | 4 ++-- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java b/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java index 086c7ca..6b487e1 100644 --- a/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java +++ b/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java @@ -37,7 +37,7 @@ private void readObject(ObjectInputStream inputStream) throws Exception { } private void bootstrap() throws Exception { - CSVNullInjector injector = new CSVNullInjector(this.access.getInputStream(), BUFFER_SIZE, this.config.getDelimiter(), this.config.getQuoteCharacter()); + CSVNullInjector injector = new CSVNullInjector(this.access.getInputStream(), BUFFER_SIZE, (byte) this.config.getDelimiter(), (byte) this.config.getQuoteCharacter()); this.inputReader = new InputStreamReader(injector, this.config.getEncoding()); CsvParser.DSL parser = config.getSFMParser(BUFFER_SIZE); this.iterator = parser.iterator(this.inputReader); @@ -72,7 +72,6 @@ private String[] nextLine() { } s = s.replaceAll("\"\"", "\""); - r[i] = s; } @@ -90,7 +89,7 @@ private String[] nextLine() { public CSVRecord replaceNulls(CSVRecord record) { Map data = record.getData(); data.forEach((key, value) -> { - if (value != null && this.config.getNulls().contains(value)) { + if (this.config.getNulls().contains(value)) { data.put(key, null); } }); diff --git a/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java b/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java index fca1105..9d34185 100644 --- a/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java +++ b/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java @@ -13,12 +13,12 @@ public class CSVNullInjector extends InputStream { private final ByteBuffer nullBuffer; private final ByteBuffer inputBuffer; private final InputStream inputStream; - private final char delimiter; - private final char quoteCharacter; + private final byte delimiter; + private final byte quoteCharacter; private boolean quoteMode = false; private boolean newLine = true; - public CSVNullInjector(InputStream inputStream, int bufferSize, char delimiter, char quoteCharacter) throws IOException { + public CSVNullInjector(InputStream inputStream, int bufferSize, byte delimiter, byte quoteCharacter) throws IOException { this.nullBuffer = ByteBuffer.allocate(NULL_VALUE.length()); this.inputBuffer = ByteBuffer.allocate(bufferSize); this.inputStream = inputStream; @@ -36,7 +36,7 @@ public CSVNullInjector(InputStream inputStream, int bufferSize, char delimiter, } public CSVNullInjector(InputStream inputStream, int bufferSize) throws IOException { - this(inputStream, bufferSize, ',', '"'); + this(inputStream, bufferSize, (byte) ',', (byte) '"'); } @Override diff --git a/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java b/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java index 01262ef..b8847d1 100644 --- a/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java +++ b/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java @@ -10,7 +10,7 @@ public class CSVNullInjectorTest { private String getProcessedString(String inputString) throws IOException { InputStream input = new ByteArrayInputStream(inputString.getBytes()); - return new String(new CSVNullInjector(input, 1024 * 128, ',', '"').readAllBytes()); + return new String(new CSVNullInjector(input, 1024 * 128).readAllBytes()); } /** @@ -33,7 +33,7 @@ public void testInsertion() throws IOException { public void customDelimiter() throws IOException { String testString = "ID;;Foo"; InputStream input = new ByteArrayInputStream(testString.getBytes()); - CSVNullInjector injector = new CSVNullInjector(input, 1024 * 128,';', '"'); + CSVNullInjector injector = new CSVNullInjector(input, 1024 * 128,(byte) ';', (byte) '"'); String output = new String(injector.readAllBytes()); String expected = "ID;%s;Foo".replaceAll("%s", CSVNullInjector.NULL_VALUE); assertEquals(expected, output); From 5ff6c4b6b9d04f4c7d3dbf22b4474a10cbf689d3 Mon Sep 17 00:00:00 2001 From: Jozef Jankaj Date: Fri, 27 Oct 2023 15:08:48 +0200 Subject: [PATCH 11/13] chore: rewrite NullInjector with CharBuffer Replaces ByteBuffer to avoid issues with specific encodings --- .../dataio/iterators/CSVWSourceIterator.java | 9 +- .../knows/dataio/utils/CSVNullInjector.java | 133 +++++++++++------- .../dataio/utility/CSVNullInjectorTest.java | 20 +-- 3 files changed, 101 insertions(+), 61 deletions(-) diff --git a/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java b/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java index 6b487e1..4fa2d06 100644 --- a/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java +++ b/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java @@ -21,7 +21,7 @@ public class CSVWSourceIterator extends SourceIterator { private final CSVWConfiguration config; private transient String[] header; private transient String[] next; - private transient Reader inputReader; + private transient InputStreamReader inputReader; private transient Iterator iterator; public CSVWSourceIterator(Access access, CSVWConfiguration config) throws Exception { @@ -37,10 +37,11 @@ private void readObject(ObjectInputStream inputStream) throws Exception { } private void bootstrap() throws Exception { - CSVNullInjector injector = new CSVNullInjector(this.access.getInputStream(), BUFFER_SIZE, (byte) this.config.getDelimiter(), (byte) this.config.getQuoteCharacter()); - this.inputReader = new InputStreamReader(injector, this.config.getEncoding()); + this.inputReader = new InputStreamReader(access.getInputStream(), this.config.getEncoding()); + CSVNullInjector injector = new CSVNullInjector(inputReader, BUFFER_SIZE, this.config.getDelimiter(), this.config.getQuoteCharacter()); + CsvParser.DSL parser = config.getSFMParser(BUFFER_SIZE); - this.iterator = parser.iterator(this.inputReader); + this.iterator = parser.iterator(injector.reader()); if (this.config.isSkipHeader()) { this.header = config.getHeader().toArray(new String[0]); diff --git a/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java b/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java index 9d34185..3d36121 100644 --- a/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java +++ b/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java @@ -2,50 +2,72 @@ import java.io.IOException; import java.io.InputStream; -import java.nio.ByteBuffer; +import java.io.InputStreamReader; +import java.io.UnsupportedEncodingException; +import java.nio.CharBuffer; +import java.nio.charset.StandardCharsets; /** * Injects a known NULL value between two commas in CSV. + * CSVNullInjector will inject a specific null value (defined below in NULL_VALUE) between two delimiters and between a delimiter and a newline. * Inspired by this answer on SO, written with Java's native buffers. */ public class CSVNullInjector extends InputStream { public static final String NULL_VALUE = "DATAIO_NULL"; - private final ByteBuffer nullBuffer; - private final ByteBuffer inputBuffer; - private final InputStream inputStream; - private final byte delimiter; - private final byte quoteCharacter; + private final CharBuffer nullBuffer; + private final CharBuffer inputBuffer; + private final InputStreamReader reader; + private final char delimiter; + private final char quoteCharacter; private boolean quoteMode = false; private boolean newLine = true; - public CSVNullInjector(InputStream inputStream, int bufferSize, byte delimiter, byte quoteCharacter) throws IOException { - this.nullBuffer = ByteBuffer.allocate(NULL_VALUE.length()); - this.inputBuffer = ByteBuffer.allocate(bufferSize); - this.inputStream = inputStream; + /** + * Constructor for CSVNullInjector + * Will initialise buffers and read the first amount of chars from the reader. + * + * @param reader InputStreamReader containing the stream to consume + * @param bufferSize buffer size to pre-allocate for the inputBuffer and keep during reading + * @param delimiter used delimiter + * @param quoteCharacter used quote character + * @throws IOException when an I/O error occurs + */ + public CSVNullInjector(InputStreamReader reader, int bufferSize, char delimiter, char quoteCharacter) throws IOException { + this.nullBuffer = CharBuffer.allocate(NULL_VALUE.length()); + this.inputBuffer = CharBuffer.allocate(bufferSize); + this.reader = reader; this.delimiter = delimiter; this.quoteCharacter = quoteCharacter; // initialise null buffer - this.nullBuffer.put(NULL_VALUE.getBytes()); + this.nullBuffer.put(NULL_VALUE); // initialise input buffer - int count = this.inputStream.read(this.inputBuffer.array()); + int count = this.reader.read(this.inputBuffer.array()); + this.inputBuffer.flip(); if (count > 0) { this.inputBuffer.limit(count); } } - public CSVNullInjector(InputStream inputStream, int bufferSize) throws IOException { - this(inputStream, bufferSize, (byte) ',', (byte) '"'); + /** + * Constructor with default values for CSV + * + * @param reader reader to consume + * @param bufferSize size of the buffer to keep + * @throws IOException when an I/O error occurs + */ + public CSVNullInjector(InputStreamReader reader, int bufferSize) throws IOException { + this(reader, bufferSize, ',', '"'); } @Override public int read() throws IOException { ReadingResult rr = getNextByte(); - if (rr.valid) { - return rr.result & 0xFF; // sign extend the byte + if (rr.valid()) { + return rr.result(); } - return -1; // nothing could be read + return -1; } @Override @@ -56,7 +78,7 @@ public int read(byte[] b) throws IOException { @Override public int read(byte[] b, int off, int len) throws IOException { int i = off; - for (; i < len; i++) { + while (i < len) { ReadingResult rr = getNextByte(); if (!rr.valid()) { if (i == off) { @@ -64,32 +86,37 @@ public int read(byte[] b, int off, int len) throws IOException { } break; } - b[i] = rr.result(); + byte[] bytes = String.valueOf(rr.result()).getBytes(); + for (byte b1 : bytes) { + b[i] = b1; + i++; + } } return i - off; } /** - * Fetches next byte to be returned in by the injector. - * This byte could come from either the nullBuffer or the inputBuffer, depending on the state of the injector + * Fetches the next character to be returned by the injector. + * This character could come from either the nullBuffer or the inputBuffer, depending on the state of the injector * - * @return the next byte + * @return the next character */ private ReadingResult getNextByte() throws IOException { if (this.nullBuffer.hasRemaining()) { return new ReadingResult(true, this.nullBuffer.get()); } - if (!this.ensureInput()) { + if (this.noMoreInput()) { // nothing more in the inputBuffer - return new ReadingResult(false, (byte) -1); + return new ReadingResult(false, 'f'); } - byte currentByte = this.inputBuffer.get(); + char currentChar = this.inputBuffer.get(); // specific case when we're on a new line and first character is the delimiter // -> there's a missing null value that must be injected - if (this.newLine && currentByte == this.delimiter) { + if (this.newLine && currentChar == this.delimiter) { + // move the inputBuffer back to original position this.inputBuffer.position(this.inputBuffer.position() - 1); this.nullBuffer.flip(); this.newLine = false; @@ -98,64 +125,74 @@ private ReadingResult getNextByte() throws IOException { this.newLine = false; - if (currentByte == this.quoteCharacter) { + if (currentChar == this.quoteCharacter) { // toggle quote mode this.quoteMode = !this.quoteMode; } - if (currentByte == '\n') { - this.newLine = true; - return new ReadingResult(true, currentByte); + if (quoteMode) { // if in quote mode, immediately return + return new ReadingResult(true, currentChar); } - if (quoteMode) { // if in quote mode, immediately return - return new ReadingResult(true, currentByte); + if (currentChar == '\n') { // encountered end of line, return + this.newLine = true; + return new ReadingResult(true, currentChar); } - if (currentByte == this.delimiter) { + if (currentChar == this.delimiter) { // look for second delimiter - if (!this.ensureInput()) { + if (this.noMoreInput()) { // last byte of the input is a delimiter, add one last null value this.nullBuffer.flip(); - return new ReadingResult(true, currentByte); + return new ReadingResult(true, currentChar); } // not the last byte, check the next - byte b1 = this.inputBuffer.get(this.inputBuffer.position()); + char b1 = this.inputBuffer.get(this.inputBuffer.position()); if (b1 == this.delimiter || b1 == '\n') { // two delimiters or a newline => dangling delimiter, add a null value this.nullBuffer.flip(); - return new ReadingResult(true, currentByte); // return the original + return new ReadingResult(true, currentChar); // return the original } } - return new ReadingResult(true, currentByte); + return new ReadingResult(true, currentChar); } - private boolean ensureInput() throws IOException { + private boolean noMoreInput() throws IOException { if (this.inputBuffer.hasRemaining()) { - return true; + return false; } - int count = this.inputStream.read(this.inputBuffer.array()); + int count = this.reader.read(this.inputBuffer); if (count < 1) { // no bytes available - return false; + return true; } this.inputBuffer.flip(); this.inputBuffer.limit(count); - return true; + return false; } @Override public void close() throws IOException { - this.inputStream.close(); + this.reader.close(); } /** - * Record to communicate the result of the byte read and it's success - * @param valid true if the reading of the byte was successful and the result byte is usable, false otherwise - * @param result the result of reading + * A convenience method for getting an InputStreamReader + * + * @return an InputStreamReader that consumes this null injector + */ + public InputStreamReader reader() { + return new InputStreamReader(this); + } + + /** + * Record to communicate the result of the byte read and its success + * + * @param valid true if the reading of the byte was successful and the result byte is usable, false otherwise + * @param result the character produced by the read. If valid == false, its value does not matter */ - private record ReadingResult(boolean valid, byte result) { + private record ReadingResult(boolean valid, char result) { } } diff --git a/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java b/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java index b8847d1..2f40fc6 100644 --- a/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java +++ b/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java @@ -3,19 +3,22 @@ import be.ugent.idlab.knows.dataio.utils.CSVNullInjector; import org.junit.jupiter.api.Test; -import java.io.*; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; import static org.junit.jupiter.api.Assertions.assertEquals; public class CSVNullInjectorTest { private String getProcessedString(String inputString) throws IOException { InputStream input = new ByteArrayInputStream(inputString.getBytes()); - return new String(new CSVNullInjector(input, 1024 * 128).readAllBytes()); + InputStreamReader reader = new InputStreamReader(input); + return new String(new CSVNullInjector(reader, 1024 * 128).readAllBytes()); } /** * Tests a simple insertion in between two delimiters - * @throws IOException */ @Test public void testInsertion() throws IOException { @@ -27,13 +30,13 @@ public void testInsertion() throws IOException { /** * Tests an insertion between two custom delimiters - * @throws IOException */ @Test public void customDelimiter() throws IOException { String testString = "ID;;Foo"; InputStream input = new ByteArrayInputStream(testString.getBytes()); - CSVNullInjector injector = new CSVNullInjector(input, 1024 * 128,(byte) ';', (byte) '"'); + InputStreamReader reader = new InputStreamReader(input); + CSVNullInjector injector = new CSVNullInjector(reader, 1024 * 128, ';', '"'); String output = new String(injector.readAllBytes()); String expected = "ID;%s;Foo".replaceAll("%s", CSVNullInjector.NULL_VALUE); assertEquals(expected, output); @@ -41,7 +44,6 @@ public void customDelimiter() throws IOException { /** * Tests injection of null value at the start of the string - * @throws IOException */ @Test public void emptyStart() throws IOException { @@ -53,7 +55,6 @@ public void emptyStart() throws IOException { /** * Tests insertion of null value at the end of the string - * @throws IOException */ @Test public void emptyEnd() throws IOException { @@ -63,6 +64,9 @@ public void emptyEnd() throws IOException { assertEquals(expected, output); } + /** + * Tests the insertion in between a delimiter and a newline + */ @Test public void danglingSeparator() throws IOException { String testString = """ @@ -104,7 +108,6 @@ public void escapedQuote() throws IOException { /** * Tests the injector's correct recognition of Linux newlines. - * @throws IOException */ @Test public void unixNewLine() throws IOException { @@ -116,7 +119,6 @@ public void unixNewLine() throws IOException { /** * Tests the injector's correct recognition of Windows newlines. - * @throws IOException */ @Test public void windowsNewLine() throws IOException { From 75c91aaaaae8c2da5ae559e5a3103d568ff343e8 Mon Sep 17 00:00:00 2001 From: Jozef Jankaj Date: Mon, 30 Oct 2023 11:18:53 +0100 Subject: [PATCH 12/13] chore: documentation --- .../ugent/idlab/knows/dataio/utils/CSVNullInjector.java | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java b/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java index 3d36121..b44a428 100644 --- a/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java +++ b/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java @@ -3,9 +3,7 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; -import java.io.UnsupportedEncodingException; import java.nio.CharBuffer; -import java.nio.charset.StandardCharsets; /** * Injects a known NULL value between two commas in CSV. @@ -158,6 +156,13 @@ private ReadingResult getNextByte() throws IOException { return new ReadingResult(true, currentChar); } + /** + * Method for checking if there's input to be consumed. + * Input will be read into inputBuffer should the buffer be empty. + * + * @return true if the input is exhausted, false otherwise + * @throws IOException when an I/O error occurs + */ private boolean noMoreInput() throws IOException { if (this.inputBuffer.hasRemaining()) { return false; From ab587d3693c2723ea1cab83bce499018844d804f Mon Sep 17 00:00:00 2001 From: Jozef Jankaj Date: Mon, 30 Oct 2023 11:23:14 +0100 Subject: [PATCH 13/13] chore: remove time test --- .../dataio/iterator/CSVWIteratorTest.java | 32 ------------------- 1 file changed, 32 deletions(-) diff --git a/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVWIteratorTest.java b/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVWIteratorTest.java index b8ac2df..fb07e7f 100644 --- a/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVWIteratorTest.java +++ b/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVWIteratorTest.java @@ -248,36 +248,4 @@ public void evaluate_0000_trim_bogus() throws Exception { } } } - - private long getExecTime(SourceIterator iterator) { - long start = System.currentTimeMillis(); - - iterator.forEachRemaining(new Consumer<>() { - int count = 0; - - @Override - public void accept(Record source) { - if (count < 10) { - System.out.println(((CSVRecord) source).getData()); - } - count++; - } - }); - - long end = System.currentTimeMillis(); - return end - start; - } - - - @Test - @Disabled - public void execTime() throws Exception { - Access access = new LocalFileAccess("/home/messik/Work/large_files/large_csv/taxonmappings/joined.tsv", "", "tsv", "UTF-8"); - CSVWConfiguration config = CSVWConfiguration.builder().withDelimiter('\t').build(); - - try(SourceIterator iterator = new CSVWSourceIterator(access, config)) { - long execTime = getExecTime(iterator); - System.out.printf("Execution took %f seconds", execTime / 1000.0); - } - } }