diff --git a/CHANGELOG.md b/CHANGELOG.md index 301e4d5..0a91b91 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Changed - Require Java 17 (or more recent) +- Use SFM for CSV parsing ### Fixed - Updated Maven Surefire plugin to 3.1.2 diff --git a/pom.xml b/pom.xml index 575b55f..1387ff7 100644 --- a/pom.xml +++ b/pom.xml @@ -71,6 +71,11 @@ opencsv 5.8 + + org.simpleflatmapper + sfm-csv + 8.2.3 + diff --git a/src/main/java/be/ugent/idlab/knows/dataio/access/COMPRESSION.java b/src/main/java/be/ugent/idlab/knows/dataio/access/COMPRESSION.java new file mode 100644 index 0000000..50dea6f --- /dev/null +++ b/src/main/java/be/ugent/idlab/knows/dataio/access/COMPRESSION.java @@ -0,0 +1,5 @@ +package be.ugent.idlab.knows.dataio.access; + +public class COMPRESSION { + public static final String GZIP = "gzip"; +} \ No newline at end of file diff --git a/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java b/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java index f0f84fb..4fa2d06 100644 --- a/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java +++ b/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java @@ -4,23 +4,25 @@ import be.ugent.idlab.knows.dataio.iterators.csvw.CSVWConfiguration; import be.ugent.idlab.knows.dataio.record.CSVRecord; import be.ugent.idlab.knows.dataio.record.Record; -import com.opencsv.CSVReader; -import com.opencsv.CSVReaderBuilder; -import com.opencsv.exceptions.CsvValidationException; +import be.ugent.idlab.knows.dataio.utils.CSVNullInjector; +import org.simpleflatmapper.lightningcsv.CsvParser; import java.io.*; import java.util.Arrays; +import java.util.Iterator; import java.util.Map; import java.util.NoSuchElementException; public class CSVWSourceIterator extends SourceIterator { @Serial private static final long serialVersionUID = -5824558388620967495L; + private static final int BUFFER_SIZE = 1024 * 128; // 128 KiB private final Access access; private final CSVWConfiguration config; private transient String[] header; private transient String[] next; - private transient CSVReader reader; + private transient InputStreamReader inputReader; + private transient Iterator iterator; public CSVWSourceIterator(Access access, CSVWConfiguration config) throws Exception { this.access = access; @@ -34,55 +36,49 @@ private void readObject(ObjectInputStream inputStream) throws Exception { this.bootstrap(); } - /** - * Instantiates transient fields. This code needs to be run both at construction time and after deserialization - */ private void bootstrap() throws Exception { - this.reader = new CSVReaderBuilder(new InputStreamReader(access.getInputStream(), config.getEncoding())) - .withCSVParser(this.config.getParser()) - .withSkipLines(this.config.isSkipHeader() ? 1 : 0) - .build(); + this.inputReader = new InputStreamReader(access.getInputStream(), this.config.getEncoding()); + CSVNullInjector injector = new CSVNullInjector(inputReader, BUFFER_SIZE, this.config.getDelimiter(), this.config.getQuoteCharacter()); + + CsvParser.DSL parser = config.getSFMParser(BUFFER_SIZE); + this.iterator = parser.iterator(injector.reader()); if (this.config.isSkipHeader()) { this.header = config.getHeader().toArray(new String[0]); } else { - this.header = readLine(); - - if (header == null) { - throw new IllegalStateException("Unable to read the file!"); - } + this.header = nextLine(); } - this.next = readLine(); + this.next = nextLine(); } - private String[] readLine() throws IOException { - String[] line; - do { - try { - line = this.reader.readNext(); + private String[] nextLine() { + if (this.iterator.hasNext()) { + String[] r = this.iterator.next(); + // go over the lines till uncommented line found + while (r[0].startsWith(config.getCommentPrefix()) && this.iterator.hasNext()) { + r = this.iterator.next(); + } - if (line == null) { - return null; - } - } catch (CsvValidationException e) { - throw new IllegalArgumentException(String.format("File does not conform to configuration! Offending line: %s", Arrays.toString(this.reader.peek()))); + if (r[0].startsWith(config.getCommentPrefix())) { + return null; } - } while (invalidLine(line)); - return line; - } + // replace any occurrence of an escaped quote with a single quote + for (int i = 0; i < r.length; i++) { + String s = r[i]; + // trim the string that is quoted + if (s.startsWith("\"") && s.endsWith("\"")) { + s = s.substring(1, s.length() - 1); + } - /** - * Checks if the passed line corresponds to the filters set - * A line is considered valid if it doesn't start with the comment prefix - * If the first value is null, the line is accepted - * - * @param line line to be checked - * @return true if the line passes all checks - */ - private boolean invalidLine(String[] line) { - return line[0] != null && line[0].startsWith(this.config.getCommentPrefix()); + s = s.replaceAll("\"\"", "\""); + r[i] = s; + } + + return r; + } + return null; } /** @@ -94,7 +90,7 @@ private boolean invalidLine(String[] line) { public CSVRecord replaceNulls(CSVRecord record) { Map data = record.getData(); data.forEach((key, value) -> { - if (value != null && this.config.getNulls().contains(value)) { + if (this.config.getNulls().contains(value)) { data.put(key, null); } }); @@ -122,13 +118,9 @@ public Record next() { if (this.next == null) { throw new NoSuchElementException(); } - String[] line = this.next; - try { - this.next = readLine(); - } catch (IOException e) { - throw new RuntimeException(e); - } + + this.next = nextLine(); if (!config.getTrim().equals("false")) { line = applyTrimArray(line, config.getTrim()); @@ -144,6 +136,6 @@ public boolean hasNext() { @Override public void close() throws IOException { - this.reader.close(); + this.inputReader.close(); } } diff --git a/src/main/java/be/ugent/idlab/knows/dataio/iterators/csvw/CSVWConfiguration.java b/src/main/java/be/ugent/idlab/knows/dataio/iterators/csvw/CSVWConfiguration.java index 6d265fc..9320765 100644 --- a/src/main/java/be/ugent/idlab/knows/dataio/iterators/csvw/CSVWConfiguration.java +++ b/src/main/java/be/ugent/idlab/knows/dataio/iterators/csvw/CSVWConfiguration.java @@ -1,11 +1,12 @@ package be.ugent.idlab.knows.dataio.iterators.csvw; -import com.opencsv.CSVParser; -import com.opencsv.CSVParserBuilder; -import com.opencsv.enums.CSVReaderNullFieldIndicator; + +import be.ugent.idlab.knows.dataio.utils.CSVNullInjector; +import org.simpleflatmapper.lightningcsv.CsvParser; import java.io.Serial; import java.io.Serializable; +import java.util.ArrayList; import java.util.List; /** @@ -49,7 +50,11 @@ public final class CSVWConfiguration implements Serializable { this.skipHeader = skipHeader; this.commentPrefix = commentPrefix; this.header = header; - this.nulls = nulls; + + List nullValues = new ArrayList<>(nulls); + nullValues.add(CSVNullInjector.NULL_VALUE); // add our special null value + + this.nulls = nullValues; this.encoding = encoding; } @@ -93,12 +98,11 @@ public String getEncoding() { return encoding; } - public CSVParser getParser() { - return new CSVParserBuilder() - .withSeparator(this.delimiter) - .withEscapeChar(this.escapeCharacter) - .withQuoteChar(this.quoteCharacter) - .withFieldAsNull(CSVReaderNullFieldIndicator.EMPTY_SEPARATORS) - .build(); + public CsvParser.DSL getSFMParser(int bufferSize) { + return CsvParser + .separator(this.delimiter) + .escape(this.escapeCharacter) + .quote(this.quoteCharacter) + .bufferSize(bufferSize); } } diff --git a/src/main/java/be/ugent/idlab/knows/dataio/record/CSVRecord.java b/src/main/java/be/ugent/idlab/knows/dataio/record/CSVRecord.java index ff455aa..ac9b843 100644 --- a/src/main/java/be/ugent/idlab/knows/dataio/record/CSVRecord.java +++ b/src/main/java/be/ugent/idlab/knows/dataio/record/CSVRecord.java @@ -15,7 +15,7 @@ public class CSVRecord extends Record { private final Map datatypes; public CSVRecord(String[] header, String[] data, Map datatypes) { - this.data = new HashMap<>(); + this.data = new HashMap<>(header.length); if (header.length > data.length) { logger.warn("Header has more columns than this row"); } @@ -26,7 +26,7 @@ public CSVRecord(String[] header, String[] data, Map datatypes) if (i < data.length) { this.data.put(header[i], data[i]); } else { - this.data.put(header[i], ""); + this.data.put(header[i], null); } } this.datatypes = datatypes; diff --git a/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java b/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java new file mode 100644 index 0000000..b44a428 --- /dev/null +++ b/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java @@ -0,0 +1,203 @@ +package be.ugent.idlab.knows.dataio.utils; + +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.CharBuffer; + +/** + * Injects a known NULL value between two commas in CSV. + * CSVNullInjector will inject a specific null value (defined below in NULL_VALUE) between two delimiters and between a delimiter and a newline. + * Inspired by this answer on SO, written with Java's native buffers. + */ +public class CSVNullInjector extends InputStream { + public static final String NULL_VALUE = "DATAIO_NULL"; + private final CharBuffer nullBuffer; + private final CharBuffer inputBuffer; + private final InputStreamReader reader; + private final char delimiter; + private final char quoteCharacter; + private boolean quoteMode = false; + private boolean newLine = true; + + /** + * Constructor for CSVNullInjector + * Will initialise buffers and read the first amount of chars from the reader. + * + * @param reader InputStreamReader containing the stream to consume + * @param bufferSize buffer size to pre-allocate for the inputBuffer and keep during reading + * @param delimiter used delimiter + * @param quoteCharacter used quote character + * @throws IOException when an I/O error occurs + */ + public CSVNullInjector(InputStreamReader reader, int bufferSize, char delimiter, char quoteCharacter) throws IOException { + this.nullBuffer = CharBuffer.allocate(NULL_VALUE.length()); + this.inputBuffer = CharBuffer.allocate(bufferSize); + this.reader = reader; + this.delimiter = delimiter; + this.quoteCharacter = quoteCharacter; + + // initialise null buffer + this.nullBuffer.put(NULL_VALUE); + + // initialise input buffer + int count = this.reader.read(this.inputBuffer.array()); + this.inputBuffer.flip(); + if (count > 0) { + this.inputBuffer.limit(count); + } + } + + /** + * Constructor with default values for CSV + * + * @param reader reader to consume + * @param bufferSize size of the buffer to keep + * @throws IOException when an I/O error occurs + */ + public CSVNullInjector(InputStreamReader reader, int bufferSize) throws IOException { + this(reader, bufferSize, ',', '"'); + } + + @Override + public int read() throws IOException { + ReadingResult rr = getNextByte(); + if (rr.valid()) { + return rr.result(); + } + return -1; + } + + @Override + public int read(byte[] b) throws IOException { + return this.read(b, 0, b.length); + } + + @Override + public int read(byte[] b, int off, int len) throws IOException { + int i = off; + while (i < len) { + ReadingResult rr = getNextByte(); + if (!rr.valid()) { + if (i == off) { + return -1; // return -1 to let the caller known no more data is available + } + break; + } + byte[] bytes = String.valueOf(rr.result()).getBytes(); + for (byte b1 : bytes) { + b[i] = b1; + i++; + } + } + return i - off; + } + + /** + * Fetches the next character to be returned by the injector. + * This character could come from either the nullBuffer or the inputBuffer, depending on the state of the injector + * + * @return the next character + */ + private ReadingResult getNextByte() throws IOException { + if (this.nullBuffer.hasRemaining()) { + return new ReadingResult(true, this.nullBuffer.get()); + } + + if (this.noMoreInput()) { + // nothing more in the inputBuffer + return new ReadingResult(false, 'f'); + } + + char currentChar = this.inputBuffer.get(); + + // specific case when we're on a new line and first character is the delimiter + // -> there's a missing null value that must be injected + if (this.newLine && currentChar == this.delimiter) { + // move the inputBuffer back to original position + this.inputBuffer.position(this.inputBuffer.position() - 1); + this.nullBuffer.flip(); + this.newLine = false; + return new ReadingResult(true, this.nullBuffer.get()); + } + + this.newLine = false; + + if (currentChar == this.quoteCharacter) { + // toggle quote mode + this.quoteMode = !this.quoteMode; + } + + if (quoteMode) { // if in quote mode, immediately return + return new ReadingResult(true, currentChar); + } + + if (currentChar == '\n') { // encountered end of line, return + this.newLine = true; + return new ReadingResult(true, currentChar); + } + + if (currentChar == this.delimiter) { + // look for second delimiter + if (this.noMoreInput()) { + // last byte of the input is a delimiter, add one last null value + this.nullBuffer.flip(); + return new ReadingResult(true, currentChar); + } + // not the last byte, check the next + char b1 = this.inputBuffer.get(this.inputBuffer.position()); + if (b1 == this.delimiter || b1 == '\n') { + // two delimiters or a newline => dangling delimiter, add a null value + this.nullBuffer.flip(); + return new ReadingResult(true, currentChar); // return the original + } + } + + return new ReadingResult(true, currentChar); + } + + /** + * Method for checking if there's input to be consumed. + * Input will be read into inputBuffer should the buffer be empty. + * + * @return true if the input is exhausted, false otherwise + * @throws IOException when an I/O error occurs + */ + private boolean noMoreInput() throws IOException { + if (this.inputBuffer.hasRemaining()) { + return false; + } + int count = this.reader.read(this.inputBuffer); + if (count < 1) { // no bytes available + return true; + } + + this.inputBuffer.flip(); + this.inputBuffer.limit(count); + + return false; + } + + @Override + public void close() throws IOException { + this.reader.close(); + } + + /** + * A convenience method for getting an InputStreamReader + * + * @return an InputStreamReader that consumes this null injector + */ + public InputStreamReader reader() { + return new InputStreamReader(this); + } + + /** + * Record to communicate the result of the byte read and its success + * + * @param valid true if the reading of the byte was successful and the result byte is usable, false otherwise + * @param result the character produced by the read. If valid == false, its value does not matter + */ + private record ReadingResult(boolean valid, char result) { + } +} diff --git a/src/test/java/be/ugent/idlab/knows/dataio/access/LocalAccessTest.java b/src/test/java/be/ugent/idlab/knows/dataio/access/LocalAccessTest.java index 61b794e..0698249 100644 --- a/src/test/java/be/ugent/idlab/knows/dataio/access/LocalAccessTest.java +++ b/src/test/java/be/ugent/idlab/knows/dataio/access/LocalAccessTest.java @@ -9,6 +9,7 @@ import java.io.File; import java.io.FileNotFoundException; +import java.nio.file.NoSuchFileException; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.nio.file.Path; @@ -65,7 +66,7 @@ public void relativeToBase() throws IOException, SQLException { @Test public void nonExistentFile() { Access access = new LocalFileAccess("", "not_existing_file.csv", "csv", "utf-8"); - assertThrows(FileNotFoundException.class, () -> access.getInputStream()); + assertThrows(NoSuchFileException.class, access::getInputStream); } @EnabledOnOs(OS.WINDOWS) diff --git a/src/test/java/be/ugent/idlab/knows/dataio/cores/TestCore.java b/src/test/java/be/ugent/idlab/knows/dataio/cores/TestCore.java index 02f867b..f86e0e6 100644 --- a/src/test/java/be/ugent/idlab/knows/dataio/cores/TestCore.java +++ b/src/test/java/be/ugent/idlab/knows/dataio/cores/TestCore.java @@ -110,6 +110,10 @@ public boolean evaluate_1001_header_short(Iterator iterator) { return compareIterator(iterator, List.of(expected1, expected2, expected3)); } + public boolean evaluate_empty(Iterator iterator) { + return !iterator.hasNext(); + } + public boolean compareIterator(Iterator iterator, Set expectedRecords) { int counter = 0; while (iterator.hasNext()) { diff --git a/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVIteratorTest.java b/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVIteratorTest.java index 0f76fe2..134f18c 100644 --- a/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVIteratorTest.java +++ b/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVIteratorTest.java @@ -18,7 +18,7 @@ public class CSVIteratorTest extends TestCore { @Test public void evaluate_0000_CSV() throws Exception { - Access access = makeLocalAccess("/csv/0000.csv", "", "csv", "utf-8"); + Access access = makeLocalAccess("/csv/0000.csv", "", "csv", "UTF-8"); try (CSVSourceIterator iterator = new CSVSourceIterator(access)) { assertTrue(evaluate_0000(iterator)); } @@ -26,7 +26,7 @@ public void evaluate_0000_CSV() throws Exception { @Test public void evaluate_0001_CSV() throws Exception { - Access access = makeLocalAccess("/csv/0001.csv", "", "csv", "utf-8"); + Access access = makeLocalAccess("/csv/0001.csv", "", "csv", "UTF-8"); try (CSVSourceIterator iterator = new CSVSourceIterator(access)) { assertTrue(evaluate_0001(iterator)); } @@ -35,7 +35,7 @@ public void evaluate_0001_CSV() throws Exception { @Test @Disabled public void evaluate_1001_header_col_missing_CSV() throws Exception { - Access access = makeLocalAccess("/csv/1001_header_col_missing.csv", "", "csv", "utf-8"); + Access access = makeLocalAccess("/csv/1001_header_col_missing.csv", "", "csv", "UTF-8"); try (CSVSourceIterator csvSourceIterator = new CSVSourceIterator(access)) { //TODO should fail, check if it does } @@ -43,7 +43,7 @@ public void evaluate_1001_header_col_missing_CSV() throws Exception { @Test public void evaluate_1001_header_long_CSV() throws Exception { - Access access = makeLocalAccess("/csv/1001_header_long.csv", "", "csv", "utf-8"); + Access access = makeLocalAccess("/csv/1001_header_long.csv", "", "csv", "UTF-8"); try (CSVSourceIterator iterator = new CSVSourceIterator(access)) { assertTrue(evaluate_1001_header_long(iterator)); } @@ -51,7 +51,7 @@ public void evaluate_1001_header_long_CSV() throws Exception { @Test public void evaluate_1001_header_short_CSV() throws Exception { - Access access = makeLocalAccess("/csv/1001_header_short.csv", "", "csv", "utf-8"); + Access access = makeLocalAccess("/csv/1001_header_short.csv", "", "csv", "UTF-8"); try (CSVSourceIterator iterator = new CSVSourceIterator(access)) { assertTrue(evaluate_1001_header_short(iterator)); } @@ -59,15 +59,23 @@ public void evaluate_1001_header_short_CSV() throws Exception { @Test public void evaluate_0002_BOM_CSV() throws Exception { - Access access = makeLocalAccess("/csv/0002_BOM.csv", "", "csv", "utf-8"); + Access access = makeLocalAccess("/csv/0002_BOM.csv", "", "csv", "UTF-8"); try (CSVSourceIterator iterator = new CSVSourceIterator(access)) { assertTrue(evaluate_0002_BOM(iterator)); } } + @Test + public void evaluate_empty_CSV() throws Exception { + Access access = makeLocalAccess("/csv/empty.csv", "", "csv", "UTF-8"); + try (CSVSourceIterator iterator = new CSVSourceIterator(access)) { + assertTrue(evaluate_empty(iterator)); + } + } + @Test public void evaluateSparseInput() throws Exception { - Access access = makeLocalAccess("/csv/sparseInput.csv", "", "csv", "utf-8"); + Access access = makeLocalAccess("/csv/sparseInput.csv", "", "csv", "UTF-8"); try (CSVSourceIterator iterator = new CSVSourceIterator(access)) { assertTrue(iterator.hasNext()); @@ -106,4 +114,14 @@ public void test_missing_values() throws Exception { } } } + + @Test + public void test_triple_quotes() throws Exception { + Access access = new LocalFileAccess("csv/tripleQuotes.csv", "src/test/resources", "csv"); + try (CSVSourceIterator iterator = new CSVSourceIterator(access)) { + CSVRecord record = (CSVRecord) iterator.next(); + + assertEquals("BO", record.get("\"ISO 3166\"").get(0)); + } + } } diff --git a/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVWIteratorTest.java b/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVWIteratorTest.java index 17d87df..fb07e7f 100644 --- a/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVWIteratorTest.java +++ b/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVWIteratorTest.java @@ -4,14 +4,18 @@ import be.ugent.idlab.knows.dataio.access.LocalFileAccess; import be.ugent.idlab.knows.dataio.cores.TestCore; import be.ugent.idlab.knows.dataio.iterators.CSVWSourceIterator; +import be.ugent.idlab.knows.dataio.iterators.SourceIterator; import be.ugent.idlab.knows.dataio.iterators.csvw.CSVWConfiguration; import be.ugent.idlab.knows.dataio.record.CSVRecord; +import be.ugent.idlab.knows.dataio.record.Record; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; import java.nio.charset.StandardCharsets; import java.util.List; import java.util.Set; +import java.util.function.Consumer; import static org.junit.jupiter.api.Assertions.*; diff --git a/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java b/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java new file mode 100644 index 0000000..2f40fc6 --- /dev/null +++ b/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java @@ -0,0 +1,130 @@ +package be.ugent.idlab.knows.dataio.utility; + +import be.ugent.idlab.knows.dataio.utils.CSVNullInjector; +import org.junit.jupiter.api.Test; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class CSVNullInjectorTest { + private String getProcessedString(String inputString) throws IOException { + InputStream input = new ByteArrayInputStream(inputString.getBytes()); + InputStreamReader reader = new InputStreamReader(input); + return new String(new CSVNullInjector(reader, 1024 * 128).readAllBytes()); + } + + /** + * Tests a simple insertion in between two delimiters + */ + @Test + public void testInsertion() throws IOException { + String testString = "ID,,Foo"; + String output = getProcessedString(testString); + String expected = "ID,%s,Foo".replaceAll("%s", CSVNullInjector.NULL_VALUE); + assertEquals(expected, output); + } + + /** + * Tests an insertion between two custom delimiters + */ + @Test + public void customDelimiter() throws IOException { + String testString = "ID;;Foo"; + InputStream input = new ByteArrayInputStream(testString.getBytes()); + InputStreamReader reader = new InputStreamReader(input); + CSVNullInjector injector = new CSVNullInjector(reader, 1024 * 128, ';', '"'); + String output = new String(injector.readAllBytes()); + String expected = "ID;%s;Foo".replaceAll("%s", CSVNullInjector.NULL_VALUE); + assertEquals(expected, output); + } + + /** + * Tests injection of null value at the start of the string + */ + @Test + public void emptyStart() throws IOException { + String testString = ",Foo,Bar"; + String output = getProcessedString(testString); + String expected = "%s,Foo,Bar".replaceAll("%s", CSVNullInjector.NULL_VALUE); + assertEquals(expected, output); + } + + /** + * Tests insertion of null value at the end of the string + */ + @Test + public void emptyEnd() throws IOException { + String testString = "Foo,Bar,"; + String output = getProcessedString(testString); + String expected = "Foo,Bar,%s".replaceAll("%s", CSVNullInjector.NULL_VALUE); + assertEquals(expected, output); + } + + /** + * Tests the insertion in between a delimiter and a newline + */ + @Test + public void danglingSeparator() throws IOException { + String testString = """ + "ID","Name","DateOfBirth" + "1","Alice", + "2","Bob","September, 2010" + """; + String expected = """ + "ID","Name","DateOfBirth" + "1","Alice",%s + "2","Bob","September, 2010" + """.replaceAll("%s", CSVNullInjector.NULL_VALUE); + String actual = getProcessedString(testString); + assertEquals(expected, actual); + } + + /** + * Tests ignoring of quoted separators + */ + @Test + public void ignoreQuotedSeparator() throws IOException { + String testString = "ID,\",, ,\",Foo"; + String output = getProcessedString(testString); + assertEquals(testString, output); + } + + /** + * Tests correct injection for escaped quotes + * Input: "aaa","b"",,bb",,"ccc" + * Output: "aaa","b"",,bb",${nullValue},"ccc" + */ + @Test + public void escapedQuote() throws IOException { + String testString = "\"aaa\",\"b\"\",,bb\",,\"ccc\""; + String output = getProcessedString(testString); + String expected = "\"aaa\",\"b\"\",,bb\",%s,\"ccc\"".replaceAll("%s", CSVNullInjector.NULL_VALUE); + assertEquals(expected, output); + } + + /** + * Tests the injector's correct recognition of Linux newlines. + */ + @Test + public void unixNewLine() throws IOException { + String testString = "Foo,,Bar\n,B"; + String output = getProcessedString(testString); + String expected = "Foo,%s,Bar\n%s,B".replaceAll("%s", CSVNullInjector.NULL_VALUE); + assertEquals(expected, output); + } + + /** + * Tests the injector's correct recognition of Windows newlines. + */ + @Test + public void windowsNewLine() throws IOException { + String testString = "Foo,,Bar\r\n,B"; + String output = getProcessedString(testString); + String expected = "Foo,%s,Bar\r\n%s,B".replaceAll("%s", CSVNullInjector.NULL_VALUE); + assertEquals(expected, output); + } +} diff --git a/src/test/resources/csv/danglingSeparator.csv b/src/test/resources/csv/danglingSeparator.csv new file mode 100644 index 0000000..88c7904 --- /dev/null +++ b/src/test/resources/csv/danglingSeparator.csv @@ -0,0 +1,3 @@ +"ID","Name","DateOfBirth" +"1","Alice", +"2","Bob","September, 2010" \ No newline at end of file diff --git a/src/test/resources/csv/empty.csv b/src/test/resources/csv/empty.csv new file mode 100644 index 0000000..e69de29 diff --git a/src/test/resources/csv/tripleQuotes.csv b/src/test/resources/csv/tripleQuotes.csv new file mode 100644 index 0000000..559d014 --- /dev/null +++ b/src/test/resources/csv/tripleQuotes.csv @@ -0,0 +1,4 @@ +Country Code,Name,"""ISO 3166""" +1,"Bolivia, Plurinational State of",BO +2,Ireland,IE +3,Saint Martin (French part),MF