diff --git a/CHANGELOG.md b/CHANGELOG.md
index d00730b7..3f2fe374 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,6 +18,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
### Added
- FunctionLoader: throw error on missing function parameters (see [issue 125](https://gitlab.ilabt.imec.be/rml/proc/rmlmapper-java/-/issues/125))
- HTMLRecordFactory: add CSS3 selector support (see [issue 52](https://gitlab.ilabt.imec.be/rml/proc/rmlmapper-java/-/issues/52))
+- CSVRecordFactory: add spreadsheet support (see [issue 42](https://gitlab.ilabt.imec.be/rml/proc/rmlmapper-java/-/issues/42))
## [4.11.0] - 2021-07-05
diff --git a/README.md b/README.md
index f31c3d6c..a7ae0733 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,8 @@ The RMLMapper loads all data in memory, so be aware when working with big datase
### Supported
- local data sources:
+ - Excel (.xlsx)
+ - LibreOffice (.ods)
- CSV files (including CSVW)
- JSON files (JSONPath)
- XML files (XPath)
@@ -252,6 +254,9 @@ and up to which level metadata should be stored (dataset, triple, or term level
Run the tests via `test.sh`.
+#### Derived tests
+Some tests (Excel, ODS) are derived from other tests (CSV) using a script (`./generate-spreadsheet-test-cases.sh`).
+
### RDBs
Make sure you have [Docker](https://www.docker.com) running.
@@ -317,6 +322,9 @@ We also offer consulting for all-things-RML.
## Remarks
+### Typed spreadsheet files
+All spreadsheet files are, as of yet, treated as plain CSV files: no type information (such as Currency or Date) is used.
+
### XML file parsing performance
The RMLMapper's XML parsing implementation (`javax.xml.parsers`) has been chosen to support full XPath.
diff --git a/docs/apidocs/be/ugent/rml/records/CSVRecordFactory.html b/docs/apidocs/be/ugent/rml/records/CSVRecordFactory.html
new file mode 100644
index 00000000..e69de29b
diff --git a/docs/apidocs/be/ugent/rml/records/class-use/CSVRecordFactory.html b/docs/apidocs/be/ugent/rml/records/class-use/CSVRecordFactory.html
new file mode 100644
index 00000000..e69de29b
diff --git a/generate-spreadsheet-test-cases.sh b/generate-spreadsheet-test-cases.sh
new file mode 100644
index 00000000..9b5f0744
--- /dev/null
+++ b/generate-spreadsheet-test-cases.sh
@@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+
+# REQUIRES libreoffice!!
+
+TEST_LOCATION="src/test"
+TEST_FILE_LOCATION="java/be/ugent/rml"
+TEST_RESOURCES_LOCATION="resources/test-cases"
+NAME_CSV_TEST="Mapper_CSV_Test.java"
+
+# Check for libreoffice
+if [[ ! `libreoffice --help` ]]
+then
+ echo "Install libreoffice to convert CSV."
+ return 1
+fi
+
+cd ${TEST_LOCATION}
+TEST_DIR=$(pwd)
+
+for i in "EXCEL xlsx" "ODS ods"
+do
+ set -- ${i}
+ echo "Generating ${1} tests from CSV tests"
+
+ ## Test files
+ cd "${TEST_DIR}/${TEST_FILE_LOCATION}"
+ NAME_NEW_TEST="Mapper_${1}_Test.java"
+ cp ${NAME_CSV_TEST} ${NAME_NEW_TEST}
+ sed -i "s/CSV/${1}/g" ${NAME_NEW_TEST}
+
+ ## Test resources
+ cd "${TEST_DIR}/${TEST_RESOURCES_LOCATION}"
+ for csv_dir in *CSV*
+ do
+ # Copy CSV test directory
+ NEW_DIR_NAME=$(echo ${csv_dir} | sed "s/CSV/${1}/")
+ if [[ -d ${NEW_DIR_NAME} ]]
+ then
+ rm -Rf ${NEW_DIR_NAME}
+ fi
+ cp -r ${csv_dir} ${NEW_DIR_NAME}
+ cd ${NEW_DIR_NAME}
+
+ # Change files within directory
+
+ echo "Test case: ${NEW_DIR_NAME}"
+ # csv source file
+ for csv_source in *.csv
+ do
+ if [[ ! -f ${csv_source} ]]; then break; fi
+ # UTF-8 encoding issue
+ # https://bugs.documentfoundation.org/show_bug.cgi?id=36313
+ libreoffice --headless --convert-to ${2} --infilter=CSV:44,34,UTF8 ${csv_source}
+ rm ${csv_source}
+ done
+ # mapping file
+ sed -i "s/.csv/.${2}/g" "mapping.ttl"
+
+ cd ..
+ done
+done
+
+echo "Success!"
+
diff --git a/pom.xml b/pom.xml
index 74eba35a..1aa937cf 100644
--- a/pom.xml
+++ b/pom.xml
@@ -132,11 +132,6 @@
commons-lang
2.6
-
- org.apache.commons
- commons-csv
- 1.8
-
commons-cli
commons-cli
@@ -219,6 +214,11 @@
pom
3.8.0
+
+ com.hp.hpl.jena
+ arq
+ 2.8.8
+
org.apache.jena
@@ -250,6 +250,31 @@
jsoup
1.10.2
+
+
+ org.apache.commons
+ commons-csv
+ 1.8
+
+
+ org.apache.poi
+ poi-ooxml
+ 4.1.0
+
+
+
+
+ org.apache.odftoolkit
+ simple-odf
+ 0.8.2-incubating
+
+
diff --git a/src/main/java/be/ugent/rml/Executor.java b/src/main/java/be/ugent/rml/Executor.java
index 7bb02088..603fe0d1 100644
--- a/src/main/java/be/ugent/rml/Executor.java
+++ b/src/main/java/be/ugent/rml/Executor.java
@@ -423,7 +423,7 @@ private List getAllIRIs(Term triplesMap) throws Exception {
return iris;
}
- private List getRecords(Term triplesMap) throws IOException, SQLException, ClassNotFoundException {
+ private List getRecords(Term triplesMap) throws Exception {
if (!this.recordsHolders.containsKey(triplesMap)) {
this.recordsHolders.put(triplesMap, this.recordsFactory.createRecords(triplesMap, this.rmlStore));
}
diff --git a/src/main/java/be/ugent/rml/records/CSVRecordFactory.java b/src/main/java/be/ugent/rml/records/CSVRecordFactory.java
index aa1aeeb1..0ef30845 100644
--- a/src/main/java/be/ugent/rml/records/CSVRecordFactory.java
+++ b/src/main/java/be/ugent/rml/records/CSVRecordFactory.java
@@ -9,8 +9,15 @@
import be.ugent.rml.term.Term;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
+import org.apache.poi.ss.usermodel.Row;
+import org.apache.poi.ss.usermodel.Sheet;
+import org.apache.poi.ss.usermodel.Workbook;
+import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import org.apache.commons.io.FilenameUtils;
+import org.odftoolkit.simple.Document;
+import org.odftoolkit.simple.SpreadsheetDocument;
import java.io.IOException;
import java.io.InputStream;
@@ -36,39 +43,84 @@ public class CSVRecordFactory implements ReferenceFormulationRecordFactory {
* @throws IOException
*/
@Override
- public List getRecords(Access access, Term logicalSource, QuadStore rmlStore) throws IOException, SQLException, ClassNotFoundException {
+ // Reads the rml:source of the logical source and returns the records of the matching reader
+ // (Excel, ODS, CSVW or plain CSV).
+ public List getRecords(Access access, Term logicalSource, QuadStore rmlStore) throws Exception {
List sources = Utils.getObjectsFromQuads(rmlStore.getQuads(logicalSource, new NamedNode(NAMESPACES.RML + "source"), null));
Term source = sources.get(0);
- CSVParser parser;
if (source instanceof Literal) {
// We are not dealing with something like CSVW.
- parser = getParserForNormalCSV(access);
+ // Pick the reader from the file extension of the source path.
+ // The extension is lower-cased so that e.g. "data.XLSX" is still read as a spreadsheet
+ // instead of silently falling through to the CSV parser.
+ String filePath = source.getValue();
+ String extension = FilenameUtils.getExtension(filePath).toLowerCase(java.util.Locale.ROOT);
+ switch (extension) {
+ case "xlsx":
+ return getRecordsForExcel(access);
+ case "ods":
+ return getRecordsForODT(access);
+ default:
+ return getRecordsForCSV(access, null);
+ }
+
} else {
List sourceType = Utils.getObjectsFromQuads(rmlStore.getQuads(source, new NamedNode(NAMESPACES.RDF + "type"), null));
// Check if we are dealing with CSVW.
if (sourceType.get(0).getValue().equals(NAMESPACES.CSVW + "Table")) {
CSVW csvw = new CSVW(access.getInputStream(), rmlStore, logicalSource);
- parser = csvw.getCSVParser();
+ return getRecordsForCSV(access, csvw);
} else {
// RDBs fall under this.
- parser = getParserForNormalCSV(access);
+ return getRecordsForCSV(access, null);
}
}
+ }
- if (parser != null) {
- List myEntries = parser.getRecords();
+    /**
+     * Get Records for Excel file format.
+     * For every sheet of the workbook, the first row that exists is used as the header and
+     * each later row becomes one ExcelRecord.
+     * @param access the Access whose input stream holds the workbook.
+     * @return a list with one ExcelRecord per non-header row, over all sheets.
+     * @throws IOException if the workbook cannot be read from the input stream.
+     * @throws SQLException presumably propagated from access.getInputStream().
+     * @throws ClassNotFoundException presumably propagated from access.getInputStream().
+     */
+    private List getRecordsForExcel(Access access) throws IOException, SQLException, ClassNotFoundException {
+        List output = new ArrayList<>();
+        Workbook workbook = new XSSFWorkbook(access.getInputStream());
+        for (Sheet datatypeSheet : workbook) {
+            // Take the header from the iteration itself: getRow(0) returns null when row 0 is
+            // not defined, whereas the iterator only yields rows that exist. This keeps the
+            // header and the skipped row the same row, and never hands a null header to ExcelRecord.
+            Row header = null;
+            for (Row currentRow : datatypeSheet) {
+                if (header == null) {
+                    header = currentRow;
+                } else {
+                    output.add(new ExcelRecord(header, currentRow));
+                }
+            }
+        }
+        return output;
+    }
- return myEntries.stream()
- .map(record -> new CSVRecord(record, access.getDataTypes()))
- .collect(Collectors.toList());
- } else {
- // We still return an empty list of records when a parser is not found.
- // This is to support certain use cases with RDBs where queries might not be valid,
- // but you don't want the RMLMapper to crash.
- return new ArrayList<>();
+ /**
+ * Get Records for the ODS (OpenDocument spreadsheet) file format.
+ * Despite the "ODT" in the method name, the document is loaded as a spreadsheet document.
+ * Every table of the document is read: row 0 of a table is passed as header to each ODSRecord,
+ * and the first row of the table's row list is skipped.
+ * @param access the Access from which the input stream of the document is obtained.
+ * @return a list with one ODSRecord per row after the first one, over all tables.
+ * @throws Exception declared by the signature; which callee throws what is not visible here.
+ */
+ private List getRecordsForODT(Access access) throws Exception {
+ List output = new ArrayList<>();
+ InputStream is = access.getInputStream();
+ Document document = SpreadsheetDocument.loadDocument(is);
+ for (org.odftoolkit.simple.table.Table table : document.getTableList()) {
+ org.odftoolkit.simple.table.Row header = table.getRowByIndex(0);
+ boolean first = true;
+ for (org.odftoolkit.simple.table.Row currentRow : table.getRowList()) {
+ // skip the first row of the list (the header row)
+ if (first) {
+ first = false;
+ } else {
+ output.add(new ODSRecord(header, currentRow));
+ }
+ }
}
+ return output;
}
/**
@@ -78,19 +130,35 @@ public List getRecords(Access access, Term logicalSource, QuadStore rmlS
* @return a CSVParser.
* @throws IOException
*/
- private CSVParser getParserForNormalCSV(Access access) throws IOException, SQLException, ClassNotFoundException {
- CSVFormat csvFormat = CSVFormat.DEFAULT.withHeader().withSkipHeaderRecord(false).withNullString("@@@@NULL@@@@");
- InputStream inputStream = access.getInputStream();
+ // Returns one CSVRecord per parsed CSV record, or an empty list when no parser could be created.
+ // csvw may be null; in that case a default CSV parser is built on the input stream of the Access.
+ private List getRecordsForCSV(Access access, CSVW csvw) throws IOException, SQLException, ClassNotFoundException {
+ CSVParser parser;
+ // Check if we are dealing with CSVW: if so, the parser is provided by the CSVW object.
+ if (csvw != null) {
+ parser = csvw.getCSVParser();
+ } else {
+ // RDBs fall under this.
+ // Default format with a header; the string "@@@@NULL@@@@" is configured as the null string.
+ CSVFormat csvFormat = CSVFormat.DEFAULT.withHeader().withSkipHeaderRecord(false).withNullString("@@@@NULL@@@@");
+ InputStream inputStream = access.getInputStream();
- if (inputStream != null) {
try {
- return CSVParser.parse(inputStream, StandardCharsets.UTF_8, csvFormat);
+ // NOTE(review): the former explicit null check on inputStream is gone; presumably a null
+ // stream now surfaces as the IllegalArgumentException caught below - confirm.
+ parser = CSVParser.parse(inputStream, StandardCharsets.UTF_8, csvFormat);
} catch (IllegalArgumentException e) {
logger.debug("Could not parse CSV inputstream", e);
- return null;
+ parser = null;
}
+ }
+
+ if (parser != null) {
+ List myEntries = parser.getRecords();
+
+ // Wrap every parsed record together with the data types known to the Access.
+ return myEntries.stream()
+ .map(record -> new CSVRecord(record, access.getDataTypes()))
+ .collect(Collectors.toList());
} else {
- return null;
+ // We still return an empty list of records when a parser is not found.
+ // This is to support certain use cases with RDBs where queries might not be valid,
+ // but you don't want the RMLMapper to crash.
+ return new ArrayList<>();
}
}
}
diff --git a/src/main/java/be/ugent/rml/records/ExcelRecord.java b/src/main/java/be/ugent/rml/records/ExcelRecord.java
new file mode 100644
index 00000000..95404129
--- /dev/null
+++ b/src/main/java/be/ugent/rml/records/ExcelRecord.java
@@ -0,0 +1,102 @@
+package be.ugent.rml.records;
+
+import org.apache.jena.datatypes.xsd.XSDDatatype;
+import org.apache.poi.ss.usermodel.Cell;
+import org.apache.poi.ss.usermodel.CellType;
+import org.apache.poi.ss.usermodel.Row;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * This class is a specific implementation of a Record for Excel.
+ * Every record corresponds with a row of the Excel file.
+ */
+public class ExcelRecord extends Record {
+
+ private Row row;
+ private Map header = new HashMap<>();
+
+    /**
+     * Creates the record for one row of a sheet.
+     * Every cell of the header row is stored under its string value, so that a column
+     * can later be looked up by name.
+     * @param header the row holding the column names.
+     * @param row the row this record represents.
+     */
+    ExcelRecord(Row header, Row row) {
+        this.row = row;
+        header.forEach(headerCell -> this.header.put(headerCell.getStringCellValue(), headerCell));
+    }
+
+    /**
+     * Returns the datatype of a reference in this record.
+     * The reference is looked up among the header cells; the cell of this record's row in
+     * the same column is then handed to getIRI. When the reference is not a known header,
+     * getIRI receives null.
+     * @param value the reference (column name) for which the datatype is requested.
+     * @return the IRI of the datatype, as determined by getIRI.
+     */
+    public String getDataType(String value) {
+        boolean knownColumn = header != null && header.get(value) != null;
+        Cell cell = knownColumn
+                ? row.getCell(header.get(value).getColumnIndex())
+                : null;
+        return getIRI(cell);
+    }
+
+
+ /**
+ * This method returns the objects for a column in the Excel record (= Excel row).
+ * @param value the column for which objects need to be returned.
+ * @return a list of objects for the column.
+ */
+ @Override
+ public List