diff --git a/CHANGELOG.md b/CHANGELOG.md
index 301e4d5..0a91b91 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed
- Require Java 17 (or more recent)
+- Use SimpleFlatMapper (SFM) for CSV parsing
### Fixed
- Updated Maven Surefire plugin to 3.1.2
diff --git a/pom.xml b/pom.xml
index 575b55f..1387ff7 100644
--- a/pom.xml
+++ b/pom.xml
@@ -71,6 +71,11 @@
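             <artifactId>opencsv</artifactId>
             <version>5.8</version>
         </dependency>
+        <dependency>
+            <groupId>org.simpleflatmapper</groupId>
+            <artifactId>sfm-csv</artifactId>
+            <version>8.2.3</version>
+        </dependency>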
diff --git a/src/main/java/be/ugent/idlab/knows/dataio/access/COMPRESSION.java b/src/main/java/be/ugent/idlab/knows/dataio/access/COMPRESSION.java
new file mode 100644
index 0000000..50dea6f
--- /dev/null
+++ b/src/main/java/be/ugent/idlab/knows/dataio/access/COMPRESSION.java
@@ -0,0 +1,13 @@
+package be.ugent.idlab.knows.dataio.access;
+
+/**
+ * Constants naming compression formats.
+ */
+public final class COMPRESSION {
+    /** Identifier of the gzip compression format. */
+    public static final String GZIP = "gzip";
+
+    private COMPRESSION() {
+        // constants holder, not meant to be instantiated
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java b/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java
index f0f84fb..4fa2d06 100644
--- a/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java
+++ b/src/main/java/be/ugent/idlab/knows/dataio/iterators/CSVWSourceIterator.java
@@ -4,23 +4,25 @@
import be.ugent.idlab.knows.dataio.iterators.csvw.CSVWConfiguration;
import be.ugent.idlab.knows.dataio.record.CSVRecord;
import be.ugent.idlab.knows.dataio.record.Record;
-import com.opencsv.CSVReader;
-import com.opencsv.CSVReaderBuilder;
-import com.opencsv.exceptions.CsvValidationException;
+import be.ugent.idlab.knows.dataio.utils.CSVNullInjector;
+import org.simpleflatmapper.lightningcsv.CsvParser;
import java.io.*;
import java.util.Arrays;
+import java.util.Iterator;
import java.util.Map;
import java.util.NoSuchElementException;
public class CSVWSourceIterator extends SourceIterator {
@Serial
private static final long serialVersionUID = -5824558388620967495L;
+ private static final int BUFFER_SIZE = 1024 * 128; // 128 KiB
private final Access access;
private final CSVWConfiguration config;
private transient String[] header;
private transient String[] next;
- private transient CSVReader reader;
+ private transient InputStreamReader inputReader;
+ private transient Iterator iterator;
public CSVWSourceIterator(Access access, CSVWConfiguration config) throws Exception {
this.access = access;
@@ -34,55 +36,60 @@ private void readObject(ObjectInputStream inputStream) throws Exception {
this.bootstrap();
}
/**
* Instantiates transient fields. This code needs to be run both at construction time and after deserialization
*/
private void bootstrap() throws Exception {
- this.reader = new CSVReaderBuilder(new InputStreamReader(access.getInputStream(), config.getEncoding()))
- .withCSVParser(this.config.getParser())
- .withSkipLines(this.config.isSkipHeader() ? 1 : 0)
- .build();
+        this.inputReader = new InputStreamReader(access.getInputStream(), this.config.getEncoding());
+        CSVNullInjector injector = new CSVNullInjector(inputReader, BUFFER_SIZE, this.config.getDelimiter(), this.config.getQuoteCharacter());
+
+        CsvParser.DSL parser = config.getSFMParser(BUFFER_SIZE);
+        this.iterator = parser.iterator(injector.reader());
+        // NOTE(review): the OpenCSV reader skipped the first line when skipHeader was set (withSkipLines);
+        // the first line is now returned as data in that case - confirm this is intended
if (this.config.isSkipHeader()) {
this.header = config.getHeader().toArray(new String[0]);
} else {
- this.header = readLine();
-
- if (header == null) {
- throw new IllegalStateException("Unable to read the file!");
- }
+            this.header = nextLine();
}
- this.next = readLine();
+        this.next = nextLine();
}
- private String[] readLine() throws IOException {
- String[] line;
- do {
- try {
- line = this.reader.readNext();
+    /**
+     * Reads the next line that is not a comment.
+     * Cells that are still wrapped in double quotes are unwrapped and doubled quotes are collapsed.
+     *
+     * @return the cells of the next uncommented line, or null if there is none left
+     */
+    private String[] nextLine() {
+        if (this.iterator.hasNext()) {
+            String[] r = this.iterator.next();
+            // go over the lines till uncommented line found
+            while (r[0].startsWith(config.getCommentPrefix()) && this.iterator.hasNext()) {
+                r = this.iterator.next();
+            }
- if (line == null) {
- return null;
- }
- } catch (CsvValidationException e) {
- throw new IllegalArgumentException(String.format("File does not conform to configuration! Offending line: %s", Arrays.toString(this.reader.peek())));
+            if (r[0].startsWith(config.getCommentPrefix())) {
+                return null;
}
- } while (invalidLine(line));
- return line;
- }
+            // replace any occurrence of an escaped quote with a single quote
+            for (int i = 0; i < r.length; i++) {
+                String s = r[i];
+                // trim the string that is quoted; a lone quote character is not a quoted string
+                if (s.length() > 1 && s.startsWith("\"") && s.endsWith("\"")) {
+                    s = s.substring(1, s.length() - 1);
+                }
- /**
- * Checks if the passed line corresponds to the filters set
- * A line is considered valid if it doesn't start with the comment prefix
- * If the first value is null, the line is accepted
- *
- * @param line line to be checked
- * @return true if the line passes all checks
- */
- private boolean invalidLine(String[] line) {
- return line[0] != null && line[0].startsWith(this.config.getCommentPrefix());
+                s = s.replace("\"\"", "\"");
+                r[i] = s;
+            }
+
+            return r;
+        }
+        return null;
}
/**
@@ -94,7 +90,7 @@ private boolean invalidLine(String[] line) {
public CSVRecord replaceNulls(CSVRecord record) {
Map data = record.getData();
data.forEach((key, value) -> {
- if (value != null && this.config.getNulls().contains(value)) {
+ if (this.config.getNulls().contains(value)) {
data.put(key, null);
}
});
@@ -122,13 +118,9 @@ public Record next() {
if (this.next == null) {
throw new NoSuchElementException();
}
-
String[] line = this.next;
- try {
- this.next = readLine();
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
+
+ this.next = nextLine();
if (!config.getTrim().equals("false")) {
line = applyTrimArray(line, config.getTrim());
@@ -144,6 +136,6 @@ public boolean hasNext() {
@Override
public void close() throws IOException {
- this.reader.close();
+ this.inputReader.close();
}
}
diff --git a/src/main/java/be/ugent/idlab/knows/dataio/iterators/csvw/CSVWConfiguration.java b/src/main/java/be/ugent/idlab/knows/dataio/iterators/csvw/CSVWConfiguration.java
index 6d265fc..9320765 100644
--- a/src/main/java/be/ugent/idlab/knows/dataio/iterators/csvw/CSVWConfiguration.java
+++ b/src/main/java/be/ugent/idlab/knows/dataio/iterators/csvw/CSVWConfiguration.java
@@ -1,11 +1,12 @@
package be.ugent.idlab.knows.dataio.iterators.csvw;
-import com.opencsv.CSVParser;
-import com.opencsv.CSVParserBuilder;
-import com.opencsv.enums.CSVReaderNullFieldIndicator;
+
+import be.ugent.idlab.knows.dataio.utils.CSVNullInjector;
+import org.simpleflatmapper.lightningcsv.CsvParser;
import java.io.Serial;
import java.io.Serializable;
+import java.util.ArrayList;
import java.util.List;
/**
@@ -49,7 +50,11 @@ public final class CSVWConfiguration implements Serializable {
this.skipHeader = skipHeader;
this.commentPrefix = commentPrefix;
this.header = header;
- this.nulls = nulls;
+
+ List nullValues = new ArrayList<>(nulls);
+ nullValues.add(CSVNullInjector.NULL_VALUE); // add our special null value
+
+ this.nulls = nullValues;
this.encoding = encoding;
}
@@ -93,12 +98,11 @@ public String getEncoding() {
return encoding;
}
- public CSVParser getParser() {
- return new CSVParserBuilder()
- .withSeparator(this.delimiter)
- .withEscapeChar(this.escapeCharacter)
- .withQuoteChar(this.quoteCharacter)
- .withFieldAsNull(CSVReaderNullFieldIndicator.EMPTY_SEPARATORS)
- .build();
+ public CsvParser.DSL getSFMParser(int bufferSize) {
+ return CsvParser
+ .separator(this.delimiter)
+ .escape(this.escapeCharacter)
+ .quote(this.quoteCharacter)
+ .bufferSize(bufferSize);
}
}
diff --git a/src/main/java/be/ugent/idlab/knows/dataio/record/CSVRecord.java b/src/main/java/be/ugent/idlab/knows/dataio/record/CSVRecord.java
index ff455aa..ac9b843 100644
--- a/src/main/java/be/ugent/idlab/knows/dataio/record/CSVRecord.java
+++ b/src/main/java/be/ugent/idlab/knows/dataio/record/CSVRecord.java
@@ -15,7 +15,7 @@ public class CSVRecord extends Record {
private final Map datatypes;
public CSVRecord(String[] header, String[] data, Map datatypes) {
- this.data = new HashMap<>();
+ this.data = new HashMap<>(header.length);
if (header.length > data.length) {
logger.warn("Header has more columns than this row");
}
@@ -26,7 +26,7 @@ public CSVRecord(String[] header, String[] data, Map datatypes)
if (i < data.length) {
this.data.put(header[i], data[i]);
} else {
- this.data.put(header[i], "");
+ this.data.put(header[i], null);
}
}
this.datatypes = datatypes;
diff --git a/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java b/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java
new file mode 100644
index 0000000..b44a428
--- /dev/null
+++ b/src/main/java/be/ugent/idlab/knows/dataio/utils/CSVNullInjector.java
@@ -0,0 +1,241 @@
+package be.ugent.idlab.knows.dataio.utils;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.StandardCharsets;
+import java.util.Objects;
+
+/**
+ * Injects a known NULL value between two commas in CSV.
+ * CSVNullInjector will inject a specific null value (defined below in NULL_VALUE) between two delimiters,
+ * between a delimiter and a line terminator, before a delimiter that starts a line and after a delimiter
+ * that ends the input. Delimiters inside a quoted cell are left untouched.
+ * The processed text is exposed as UTF-8 encoded bytes; use {@link #reader()} to read it as characters.
+ * Inspired by this answer on SO, written with Java's native buffers.
+ */
+public class CSVNullInjector extends InputStream {
+    public static final String NULL_VALUE = "DATAIO_NULL";
+    // one encoding step yields at most 4 bytes: a surrogate pair, or '?' followed by a 3-byte character
+    private static final int MAX_PENDING_BYTES = 4;
+    private final CharBuffer inputBuffer;
+    private final ByteBuffer pendingBytes;
+    private final InputStreamReader reader;
+    private final char delimiter;
+    private final char quoteCharacter;
+    // index of the next NULL_VALUE character to emit; NULL_VALUE.length() means no injection is pending
+    private int nullPosition = NULL_VALUE.length();
+    private boolean quoteMode = false;
+    private boolean newLine = true;
+
+    /**
+     * Constructor for CSVNullInjector
+     * Will initialise buffers and read the first amount of chars from the reader.
+     *
+     * @param reader         InputStreamReader containing the stream to consume
+     * @param bufferSize     number of chars fetched from the reader at once, must be positive
+     * @param delimiter      used delimiter
+     * @param quoteCharacter used quote character
+     * @throws IOException              when an I/O error occurs
+     * @throws IllegalArgumentException when bufferSize is not positive
+     */
+    public CSVNullInjector(InputStreamReader reader, int bufferSize, char delimiter, char quoteCharacter) throws IOException {
+        if (bufferSize < 1) {
+            throw new IllegalArgumentException("bufferSize must be positive, got " + bufferSize);
+        }
+        this.inputBuffer = CharBuffer.allocate(bufferSize);
+        this.inputBuffer.limit(0); // nothing buffered yet
+        this.pendingBytes = ByteBuffer.allocate(MAX_PENDING_BYTES);
+        this.pendingBytes.limit(0); // nothing encoded yet
+        this.reader = reader;
+        this.delimiter = delimiter;
+        this.quoteCharacter = quoteCharacter;
+
+        // fetch the first chunk right away so I/O problems surface at construction time
+        this.noMoreInput();
+    }
+
+    /**
+     * Constructor with default values for CSV
+     *
+     * @param reader     reader to consume
+     * @param bufferSize size of the buffer to keep
+     * @throws IOException when an I/O error occurs
+     */
+    public CSVNullInjector(InputStreamReader reader, int bufferSize) throws IOException {
+        this(reader, bufferSize, ',', '"');
+    }
+
+    /**
+     * Reads the next byte of the processed, UTF-8 encoded text.
+     *
+     * @return the byte as a value between 0 and 255, or -1 when the input is exhausted
+     * @throws IOException when an I/O error occurs
+     */
+    @Override
+    public int read() throws IOException {
+        if (!this.fillPendingBytes()) {
+            return -1;
+        }
+        return this.pendingBytes.get() & 0xFF;
+    }
+
+    /**
+     * Reads up to len bytes of the processed text into b, starting at index off.
+     *
+     * @param b   destination array
+     * @param off index in b of the first byte written
+     * @param len maximum number of bytes to write
+     * @return the number of bytes written, 0 when len is 0, or -1 when no more data is available
+     * @throws IOException               when an I/O error occurs
+     * @throws IndexOutOfBoundsException when off and len do not describe a range inside b
+     */
+    @Override
+    public int read(byte[] b, int off, int len) throws IOException {
+        Objects.checkFromIndexSize(off, len, b.length);
+        if (len == 0) {
+            return 0;
+        }
+        int written = 0;
+        // bytes of a character that does not fit completely stay in pendingBytes for the next call
+        while (written < len && this.fillPendingBytes()) {
+            int chunk = Math.min(len - written, this.pendingBytes.remaining());
+            this.pendingBytes.get(b, off + written, chunk);
+            written += chunk;
+        }
+        return written == 0 ? -1 : written; // -1 lets the caller know no more data is available
+    }
+
+    /**
+     * Makes sure pendingBytes holds at least one byte, encoding the next character(s) when it is empty.
+     *
+     * @return true if a byte is available, false if the input is exhausted
+     * @throws IOException when an I/O error occurs
+     */
+    private boolean fillPendingBytes() throws IOException {
+        if (this.pendingBytes.hasRemaining()) {
+            return true;
+        }
+        int current = this.nextChar();
+        if (current < 0) {
+            return false;
+        }
+        this.pendingBytes.clear();
+        if (current < 0x80) {
+            this.pendingBytes.put((byte) current); // ASCII encodes to itself
+        } else {
+            StringBuilder text = new StringBuilder(2).append((char) current);
+            if (Character.isHighSurrogate((char) current)) {
+                // keep surrogate pairs together, encoding the halves separately would corrupt the character
+                int low = this.nextChar();
+                if (low >= 0) {
+                    text.append((char) low);
+                }
+            }
+            this.pendingBytes.put(text.toString().getBytes(StandardCharsets.UTF_8));
+        }
+        this.pendingBytes.flip();
+        return true;
+    }
+
+    /**
+     * Fetches the next character to be returned by the injector.
+     * This character comes either from a pending NULL_VALUE injection or from the inputBuffer, depending on
+     * the state of the injector.
+     *
+     * @return the next character, or -1 when the input is exhausted
+     * @throws IOException when an I/O error occurs
+     */
+    private int nextChar() throws IOException {
+        if (this.nullPosition < NULL_VALUE.length()) {
+            return NULL_VALUE.charAt(this.nullPosition++);
+        }
+
+        if (this.noMoreInput()) {
+            // nothing more in the inputBuffer
+            return -1;
+        }
+
+        char currentChar = this.inputBuffer.get();
+
+        // specific case when we're on a new line and first character is the delimiter
+        // -> there's a missing null value that must be injected
+        if (this.newLine && currentChar == this.delimiter) {
+            // move the inputBuffer back so the delimiter is emitted after the null value
+            this.inputBuffer.position(this.inputBuffer.position() - 1);
+            this.newLine = false;
+            this.nullPosition = 0;
+            return NULL_VALUE.charAt(this.nullPosition++);
+        }
+
+        this.newLine = false;
+
+        if (currentChar == this.quoteCharacter) {
+            // toggle quote mode
+            this.quoteMode = !this.quoteMode;
+        }
+
+        if (this.quoteMode) { // if in quote mode, immediately return
+            return currentChar;
+        }
+
+        if (isLineTerminator(currentChar)) { // encountered end of line, return
+            this.newLine = true;
+            return currentChar;
+        }
+
+        if (currentChar == this.delimiter) {
+            // a delimiter followed by the end of input, another delimiter or a line terminator
+            // closes an empty cell => queue a null value to be emitted right after the delimiter
+            if (this.noMoreInput() || this.endsCell(this.inputBuffer.get(this.inputBuffer.position()))) {
+                this.nullPosition = 0;
+            }
+        }
+
+        return currentChar;
+    }
+
+    /** Tells whether the character ends a line ('\n' for Unix, '\r' for the first half of a Windows "\r\n"). */
+    private static boolean isLineTerminator(char c) {
+        return c == '\n' || c == '\r';
+    }
+
+    /** Tells whether the character following a delimiter means the cell after that delimiter is empty. */
+    private boolean endsCell(char next) {
+        return next == this.delimiter || isLineTerminator(next);
+    }
+
+    /**
+     * Method for checking if there's input to be consumed.
+     * Input will be read into inputBuffer should the buffer be empty.
+     *
+     * @return true if the input is exhausted, false otherwise
+     * @throws IOException when an I/O error occurs
+     */
+    private boolean noMoreInput() throws IOException {
+        if (this.inputBuffer.hasRemaining()) {
+            return false;
+        }
+        // refill the whole backing array; reading into the exhausted buffer itself would request 0 chars
+        int count = this.reader.read(this.inputBuffer.array(), 0, this.inputBuffer.capacity());
+        this.inputBuffer.clear();
+        this.inputBuffer.limit(Math.max(count, 0));
+        return count < 1; // no chars available
+    }
+
+    @Override
+    public void close() throws IOException {
+        this.reader.close();
+    }
+
+    /**
+     * A convenience method for getting an InputStreamReader
+     *
+     * @return an InputStreamReader that decodes the UTF-8 bytes produced by this null injector
+     */
+    public InputStreamReader reader() {
+        return new InputStreamReader(this, StandardCharsets.UTF_8);
+    }
+}
diff --git a/src/test/java/be/ugent/idlab/knows/dataio/access/LocalAccessTest.java b/src/test/java/be/ugent/idlab/knows/dataio/access/LocalAccessTest.java
index 61b794e..0698249 100644
--- a/src/test/java/be/ugent/idlab/knows/dataio/access/LocalAccessTest.java
+++ b/src/test/java/be/ugent/idlab/knows/dataio/access/LocalAccessTest.java
@@ -9,6 +9,7 @@
import java.io.File;
import java.io.FileNotFoundException;
+import java.nio.file.NoSuchFileException;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
@@ -65,7 +66,7 @@ public void relativeToBase() throws IOException, SQLException {
@Test
public void nonExistentFile() {
Access access = new LocalFileAccess("", "not_existing_file.csv", "csv", "utf-8");
- assertThrows(FileNotFoundException.class, () -> access.getInputStream());
+ assertThrows(NoSuchFileException.class, access::getInputStream);
}
@EnabledOnOs(OS.WINDOWS)
diff --git a/src/test/java/be/ugent/idlab/knows/dataio/cores/TestCore.java b/src/test/java/be/ugent/idlab/knows/dataio/cores/TestCore.java
index 02f867b..f86e0e6 100644
--- a/src/test/java/be/ugent/idlab/knows/dataio/cores/TestCore.java
+++ b/src/test/java/be/ugent/idlab/knows/dataio/cores/TestCore.java
@@ -110,6 +110,10 @@ public boolean evaluate_1001_header_short(Iterator iterator) {
return compareIterator(iterator, List.of(expected1, expected2, expected3));
}
+ public boolean evaluate_empty(Iterator iterator) {
+ return !iterator.hasNext();
+ }
+
public boolean compareIterator(Iterator iterator, Set expectedRecords) {
int counter = 0;
while (iterator.hasNext()) {
diff --git a/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVIteratorTest.java b/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVIteratorTest.java
index 0f76fe2..134f18c 100644
--- a/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVIteratorTest.java
+++ b/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVIteratorTest.java
@@ -18,7 +18,7 @@
public class CSVIteratorTest extends TestCore {
@Test
public void evaluate_0000_CSV() throws Exception {
- Access access = makeLocalAccess("/csv/0000.csv", "", "csv", "utf-8");
+ Access access = makeLocalAccess("/csv/0000.csv", "", "csv", "UTF-8");
try (CSVSourceIterator iterator = new CSVSourceIterator(access)) {
assertTrue(evaluate_0000(iterator));
}
@@ -26,7 +26,7 @@ public void evaluate_0000_CSV() throws Exception {
@Test
public void evaluate_0001_CSV() throws Exception {
- Access access = makeLocalAccess("/csv/0001.csv", "", "csv", "utf-8");
+ Access access = makeLocalAccess("/csv/0001.csv", "", "csv", "UTF-8");
try (CSVSourceIterator iterator = new CSVSourceIterator(access)) {
assertTrue(evaluate_0001(iterator));
}
@@ -35,7 +35,7 @@ public void evaluate_0001_CSV() throws Exception {
@Test
@Disabled
public void evaluate_1001_header_col_missing_CSV() throws Exception {
- Access access = makeLocalAccess("/csv/1001_header_col_missing.csv", "", "csv", "utf-8");
+ Access access = makeLocalAccess("/csv/1001_header_col_missing.csv", "", "csv", "UTF-8");
try (CSVSourceIterator csvSourceIterator = new CSVSourceIterator(access)) {
//TODO should fail, check if it does
}
@@ -43,7 +43,7 @@ public void evaluate_1001_header_col_missing_CSV() throws Exception {
@Test
public void evaluate_1001_header_long_CSV() throws Exception {
- Access access = makeLocalAccess("/csv/1001_header_long.csv", "", "csv", "utf-8");
+ Access access = makeLocalAccess("/csv/1001_header_long.csv", "", "csv", "UTF-8");
try (CSVSourceIterator iterator = new CSVSourceIterator(access)) {
assertTrue(evaluate_1001_header_long(iterator));
}
@@ -51,7 +51,7 @@ public void evaluate_1001_header_long_CSV() throws Exception {
@Test
public void evaluate_1001_header_short_CSV() throws Exception {
- Access access = makeLocalAccess("/csv/1001_header_short.csv", "", "csv", "utf-8");
+ Access access = makeLocalAccess("/csv/1001_header_short.csv", "", "csv", "UTF-8");
try (CSVSourceIterator iterator = new CSVSourceIterator(access)) {
assertTrue(evaluate_1001_header_short(iterator));
}
@@ -59,15 +59,23 @@ public void evaluate_1001_header_short_CSV() throws Exception {
@Test
public void evaluate_0002_BOM_CSV() throws Exception {
- Access access = makeLocalAccess("/csv/0002_BOM.csv", "", "csv", "utf-8");
+ Access access = makeLocalAccess("/csv/0002_BOM.csv", "", "csv", "UTF-8");
try (CSVSourceIterator iterator = new CSVSourceIterator(access)) {
assertTrue(evaluate_0002_BOM(iterator));
}
}
+ @Test
+ public void evaluate_empty_CSV() throws Exception {
+ Access access = makeLocalAccess("/csv/empty.csv", "", "csv", "UTF-8");
+ try (CSVSourceIterator iterator = new CSVSourceIterator(access)) {
+ assertTrue(evaluate_empty(iterator));
+ }
+ }
+
@Test
public void evaluateSparseInput() throws Exception {
- Access access = makeLocalAccess("/csv/sparseInput.csv", "", "csv", "utf-8");
+ Access access = makeLocalAccess("/csv/sparseInput.csv", "", "csv", "UTF-8");
try (CSVSourceIterator iterator = new CSVSourceIterator(access)) {
assertTrue(iterator.hasNext());
@@ -106,4 +114,14 @@ public void test_missing_values() throws Exception {
}
}
}
+
+ @Test
+ public void test_triple_quotes() throws Exception {
+ Access access = new LocalFileAccess("csv/tripleQuotes.csv", "src/test/resources", "csv");
+ try (CSVSourceIterator iterator = new CSVSourceIterator(access)) {
+ CSVRecord record = (CSVRecord) iterator.next();
+
+ assertEquals("BO", record.get("\"ISO 3166\"").get(0));
+ }
+ }
}
diff --git a/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVWIteratorTest.java b/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVWIteratorTest.java
index 17d87df..fb07e7f 100644
--- a/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVWIteratorTest.java
+++ b/src/test/java/be/ugent/idlab/knows/dataio/iterator/CSVWIteratorTest.java
@@ -4,14 +4,18 @@
import be.ugent.idlab.knows.dataio.access.LocalFileAccess;
import be.ugent.idlab.knows.dataio.cores.TestCore;
import be.ugent.idlab.knows.dataio.iterators.CSVWSourceIterator;
+import be.ugent.idlab.knows.dataio.iterators.SourceIterator;
import be.ugent.idlab.knows.dataio.iterators.csvw.CSVWConfiguration;
import be.ugent.idlab.knows.dataio.record.CSVRecord;
+import be.ugent.idlab.knows.dataio.record.Record;
+import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Nested;
import org.junit.jupiter.api.Test;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Set;
+import java.util.function.Consumer;
import static org.junit.jupiter.api.Assertions.*;
diff --git a/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java b/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java
new file mode 100644
index 0000000..2f40fc6
--- /dev/null
+++ b/src/test/java/be/ugent/idlab/knows/dataio/utility/CSVNullInjectorTest.java
@@ -0,0 +1,129 @@
+package be.ugent.idlab.knows.dataio.utility;
+
+import be.ugent.idlab.knows.dataio.utils.CSVNullInjector;
+import org.junit.jupiter.api.Test;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+public class CSVNullInjectorTest {
+    private static final int BUFFER_SIZE = 1024 * 128;
+
+    /**
+     * Runs the input through a CSVNullInjector using the default delimiter and quote character.
+     */
+    private String getProcessedString(String inputString) throws IOException {
+        try (CSVNullInjector injector = new CSVNullInjector(readerOf(inputString), BUFFER_SIZE)) {
+            return new String(injector.readAllBytes(), StandardCharsets.UTF_8);
+        }
+    }
+
+    /**
+     * Wraps the string in a reader; the charset is explicit so the tests do not depend on the platform default.
+     */
+    private static InputStreamReader readerOf(String inputString) {
+        InputStream input = new ByteArrayInputStream(inputString.getBytes(StandardCharsets.UTF_8));
+        return new InputStreamReader(input, StandardCharsets.UTF_8);
+    }
+
+    /**
+     * Fills every %s placeholder of the template with the injected null value.
+     */
+    private static String withNulls(String template) {
+        return template.replace("%s", CSVNullInjector.NULL_VALUE);
+    }
+
+    /**
+     * Tests a simple insertion in between two delimiters
+     */
+    @Test
+    public void testInsertion() throws IOException {
+        assertEquals(withNulls("ID,%s,Foo"), getProcessedString("ID,,Foo"));
+    }
+
+    /**
+     * Tests an insertion between two custom delimiters
+     */
+    @Test
+    public void customDelimiter() throws IOException {
+        try (CSVNullInjector injector = new CSVNullInjector(readerOf("ID;;Foo"), BUFFER_SIZE, ';', '"')) {
+            String output = new String(injector.readAllBytes(), StandardCharsets.UTF_8);
+            assertEquals(withNulls("ID;%s;Foo"), output);
+        }
+    }
+
+    /**
+     * Tests injection of null value at the start of the string
+     */
+    @Test
+    public void emptyStart() throws IOException {
+        assertEquals(withNulls("%s,Foo,Bar"), getProcessedString(",Foo,Bar"));
+    }
+
+    /**
+     * Tests insertion of null value at the end of the string
+     */
+    @Test
+    public void emptyEnd() throws IOException {
+        assertEquals(withNulls("Foo,Bar,%s"), getProcessedString("Foo,Bar,"));
+    }
+
+    /**
+     * Tests the insertion in between a delimiter and a newline
+     */
+    @Test
+    public void danglingSeparator() throws IOException {
+        String testString = """
+                "ID","Name","DateOfBirth"
+                "1","Alice",
+                "2","Bob","September, 2010"
+                """;
+        String expected = withNulls("""
+                "ID","Name","DateOfBirth"
+                "1","Alice",%s
+                "2","Bob","September, 2010"
+                """);
+        assertEquals(expected, getProcessedString(testString));
+    }
+
+    /**
+     * Tests ignoring of quoted separators
+     */
+    @Test
+    public void ignoreQuotedSeparator() throws IOException {
+        String testString = "ID,\",, ,\",Foo";
+        assertEquals(testString, getProcessedString(testString));
+    }
+
+    /**
+     * Tests correct injection for escaped quotes
+     * Input: "aaa","b"",,bb",,"ccc"
+     * Output: "aaa","b"",,bb",${nullValue},"ccc"
+     */
+    @Test
+    public void escapedQuote() throws IOException {
+        String output = getProcessedString("\"aaa\",\"b\"\",,bb\",,\"ccc\"");
+        assertEquals(withNulls("\"aaa\",\"b\"\",,bb\",%s,\"ccc\""), output);
+    }
+
+    /**
+     * Tests the injector's correct recognition of Linux newlines.
+     */
+    @Test
+    public void unixNewLine() throws IOException {
+        assertEquals(withNulls("Foo,%s,Bar\n%s,B"), getProcessedString("Foo,,Bar\n,B"));
+    }
+
+    /**
+     * Tests the injector's correct recognition of Windows newlines.
+     */
+    @Test
+    public void windowsNewLine() throws IOException {
+        assertEquals(withNulls("Foo,%s,Bar\r\n%s,B"), getProcessedString("Foo,,Bar\r\n,B"));
+    }
+}
diff --git a/src/test/resources/csv/danglingSeparator.csv b/src/test/resources/csv/danglingSeparator.csv
new file mode 100644
index 0000000..88c7904
--- /dev/null
+++ b/src/test/resources/csv/danglingSeparator.csv
@@ -0,0 +1,3 @@
+"ID","Name","DateOfBirth"
+"1","Alice",
+"2","Bob","September, 2010"
\ No newline at end of file
diff --git a/src/test/resources/csv/empty.csv b/src/test/resources/csv/empty.csv
new file mode 100644
index 0000000..e69de29
diff --git a/src/test/resources/csv/tripleQuotes.csv b/src/test/resources/csv/tripleQuotes.csv
new file mode 100644
index 0000000..559d014
--- /dev/null
+++ b/src/test/resources/csv/tripleQuotes.csv
@@ -0,0 +1,4 @@
+Country Code,Name,"""ISO 3166"""
+1,"Bolivia, Plurinational State of",BO
+2,Ireland,IE
+3,Saint Martin (French part),MF