Skip to content

Commit

Permalink
Merge branch 'fix/21-csvwsourceiterator-loads-only-records-until-buffer-full' into 'development'
Browse files Browse the repository at this point in the history

Resolve "CSVWSourceIterator loads only records until buffer full."

Closes #21

See merge request rml/proc/dataio!22
  • Loading branch information
ghsnd committed Jan 17, 2024
2 parents 9834bf7 + a7ded97 commit dee2cfe
Show file tree
Hide file tree
Showing 25 changed files with 58,799 additions and 331 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## Unreleased

### Changed
- Use `Charset` instead of `String` for character encoding wherever possible.

### Fixed
- CSVWSourceIterator stopped after a certain number of bytes were parsed (GitLab [issue 21](https://gitlab.ilabt.imec.be/rml/proc/dataio/-/issues/21)).
The bug originated in `CSVNullInjector`, which has been completely rewritten as `NewCSVNullInjector`.

## [1.0.4] - 2023-10-31

### Fixed
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import javax.activation.MimetypesFileTypeMap;
import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
Expand Down Expand Up @@ -41,18 +42,15 @@ public class LocalFileAccess implements Access {
* @param type type of the file
* @param encoding encoding of the file
*/
public LocalFileAccess(String path, String base, String type, String encoding) {
public LocalFileAccess(String path, String base, String type, Charset encoding) {
if (base != null && !base.isEmpty()) {
Path basePath = Path.of(base);
this.path = basePath.resolve(path).toString();
} else {
this.path = Path.of(path).toString();
}

if (!Charset.isSupported(encoding)) {
throw new IllegalArgumentException("Passed encoding not supported.");
}
this.encoding = encoding;
this.encoding = encoding.name();
this.type = type;

fileTypeMap = new MimetypesFileTypeMap();
Expand All @@ -61,7 +59,7 @@ public LocalFileAccess(String path, String base, String type, String encoding) {
}

public LocalFileAccess(String path, String basePath, String type) {
this(path, basePath, type, "utf-8");
this(path, basePath, type, StandardCharsets.UTF_8);
}

/**
Expand Down Expand Up @@ -144,7 +142,7 @@ public String getAccessPath() {
return Path.of(path).toAbsolutePath().toString();
}

public String getEncoding() {
return encoding;
public Charset getEncoding() {
return Charset.forName(encoding);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,13 @@
import be.ugent.idlab.knows.dataio.iterators.csvw.CSVWConfiguration;
import be.ugent.idlab.knows.dataio.record.CSVRecord;
import be.ugent.idlab.knows.dataio.record.Record;
import be.ugent.idlab.knows.dataio.utils.CSVNullInjector;
import be.ugent.idlab.knows.dataio.utils.NewCSVNullInjector;
import org.simpleflatmapper.lightningcsv.CsvParser;

import java.io.*;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.Serial;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Map;
Expand All @@ -21,7 +24,6 @@ public class CSVWSourceIterator extends SourceIterator {
private final CSVWConfiguration config;
private transient String[] header;
private transient String[] next;
private transient InputStreamReader inputReader;
private transient Iterator<String[]> iterator;

public CSVWSourceIterator(Access access, CSVWConfiguration config) throws Exception {
Expand All @@ -37,11 +39,15 @@ private void readObject(ObjectInputStream inputStream) throws Exception {
}

private void bootstrap() throws Exception {
this.inputReader = new InputStreamReader(access.getInputStream(), this.config.getEncoding());
CSVNullInjector injector = new CSVNullInjector(inputReader, BUFFER_SIZE, this.config.getDelimiter(), this.config.getQuoteCharacter());
NewCSVNullInjector injector = new NewCSVNullInjector(
access.getInputStream(),
config.getDelimiter(),
config.getQuoteCharacter(),
config.getEncoding()
);

CsvParser.DSL parser = config.getSFMParser(BUFFER_SIZE);
this.iterator = parser.iterator(injector.reader());
this.iterator = parser.iterator(new InputStreamReader(injector, StandardCharsets.UTF_8));

if (this.config.isSkipHeader()) {
this.header = config.getHeader().toArray(new String[0]);
Expand Down Expand Up @@ -133,9 +139,4 @@ public Record next() {
public boolean hasNext() {
return this.next != null;
}

@Override
public void close() throws IOException {
this.inputReader.close();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import be.ugent.idlab.knows.dataio.record.Record;

import java.io.IOException;
import java.io.Serial;
import java.io.Serializable;
import java.util.Iterator;
Expand All @@ -24,4 +25,6 @@ public void forEachRemaining(Consumer<? super Record> action) {
while (hasNext())
action.accept(next());
}

public void close() throws IOException {}
}
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
package be.ugent.idlab.knows.dataio.iterators.csvw;


import be.ugent.idlab.knows.dataio.utils.CSVNullInjector;
import be.ugent.idlab.knows.dataio.utils.NewCSVNullInjector;
import org.simpleflatmapper.lightningcsv.CsvParser;

import java.io.Serial;
import java.io.Serializable;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;

Expand Down Expand Up @@ -39,7 +40,7 @@ public final class CSVWConfiguration implements Serializable {
private final List<String> nulls;
private final String encoding;

CSVWConfiguration(char delimiter, char escapeCharacter, String trim, char quoteCharacter, boolean skipHeader, String commentPrefix, List<String> header, List<String> nulls, String encoding) {
CSVWConfiguration(char delimiter, char escapeCharacter, String trim, char quoteCharacter, boolean skipHeader, String commentPrefix, List<String> header, List<String> nulls, Charset encoding) {
// opencsv parser options
this.delimiter = delimiter;
this.escapeCharacter = escapeCharacter;
Expand All @@ -52,10 +53,10 @@ public final class CSVWConfiguration implements Serializable {
this.header = header;

List<String> nullValues = new ArrayList<>(nulls);
nullValues.add(CSVNullInjector.NULL_VALUE); // add our special null value
nullValues.add(NewCSVNullInjector.NULL_VALUE); // add our special null value

this.nulls = nullValues;
this.encoding = encoding;
this.encoding = encoding.name();
}

public static CSVWConfigurationBuilder builder() {
Expand Down Expand Up @@ -94,8 +95,8 @@ public List<String> getNulls() {
return this.nulls;
}

public String getEncoding() {
return encoding;
public Charset getEncoding() {
return Charset.forName(encoding);
}

public CsvParser.DSL getSFMParser(int bufferSize) {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package be.ugent.idlab.knows.dataio.iterators.csvw;

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.List;

Expand All @@ -13,14 +14,14 @@ public class CSVWConfigurationBuilder {
private List<String> header = List.of();
private List<String> nulls = List.of();

private String encoding = StandardCharsets.UTF_8.toString();
private Charset encoding = StandardCharsets.UTF_8;

public CSVWConfigurationBuilder withDelimiter(char delimiter) {
this.delimiter = delimiter;
return this;
}

public CSVWConfigurationBuilder withEncoding(String encoding) {
public CSVWConfigurationBuilder withEncoding(Charset encoding) {
this.encoding = encoding;
return this;
}
Expand Down
Loading

0 comments on commit dee2cfe

Please sign in to comment.