From 2c9a96534c2785e689c7b26f90e7d994bedfd2b0 Mon Sep 17 00:00:00 2001 From: dr0i Date: Wed, 9 Nov 2016 13:14:57 +0100 Subject: [PATCH] Store dc(t):subject as a list See #4. Subjects without an ID will be given an explicit BNode ID. They can then be referenced in the morph and thus be treated like every other resource. - store ntriples as files if log level is set to debug The ntriples are an intermediate step for producing json. For easier debugging it's also nice to have this step temporarily in the filesystem. - enable explicitly given BNodes when encoding triples --- .settings/org.eclipse.core.resources.prefs | 1 + pom.xml | 2 +- .../lobid/resources/PipeEncodeTriples.java | 4 +- .../lobid/resources/RdfModelFileWriter.java | 147 ++++++++++++++++++ src/main/resources/morph-hbz01-to-lobid.xml | 52 +++++-- .../Hbz01MabXml2ElasticsearchLobidTest.java | 32 ++++ 6 files changed, 222 insertions(+), 16 deletions(-) create mode 100644 src/main/java/org/lobid/resources/RdfModelFileWriter.java diff --git a/.settings/org.eclipse.core.resources.prefs b/.settings/org.eclipse.core.resources.prefs index cdfe4f1b66..29abf99956 100644 --- a/.settings/org.eclipse.core.resources.prefs +++ b/.settings/org.eclipse.core.resources.prefs @@ -1,5 +1,6 @@ eclipse.preferences.version=1 encoding//src/main/java=UTF-8 +encoding//src/main/resources=UTF-8 encoding//src/test/java=UTF-8 encoding//src/test/resources=UTF-8 encoding/=UTF-8 diff --git a/pom.xml b/pom.xml index 5d35e07d90..2e50e7ce71 100644 --- a/pom.xml +++ b/pom.xml @@ -20,7 +20,7 @@ com.github.hbz lobid-rdf-to-json - 059d852d97e7866137b4f3001651b65dcc55642e + be2152b644913b0744686fc8c77e7cf584876f10 org.culturegraph diff --git a/src/main/java/org/lobid/resources/PipeEncodeTriples.java b/src/main/java/org/lobid/resources/PipeEncodeTriples.java index 70c05cfe0e..ad07c63c1d 100644 --- a/src/main/java/org/lobid/resources/PipeEncodeTriples.java +++ b/src/main/java/org/lobid/resources/PipeEncodeTriples.java @@ -44,6 +44,7 @@ public class 
PipeEncodeTriples extends AbstractGraphPipeEncoder { // dummy subject to store data even if the subject is unknown at first final static String DUMMY_SUBJECT = "dummy_subject"; final static String HTTP = "^[hH][tT][Tt][Pp].*"; + final static String BNODE = "^_:.*"; final static String FTP = "^[Ff][Tt][Pp].*"; final static String URN = "urn"; @@ -115,7 +116,8 @@ public void literal(final String name, final String value) { try { final Property prop = model.createProperty(name); if (!name.contains(PROPERTY_AS_LITERALS) && (value.matches(HTTP) - || value.matches(FTP) || (value.startsWith(URN) && storeUrnAsUri) + || value.matches(BNODE) || value.matches(FTP) + || (value.startsWith(URN) && storeUrnAsUri) || value.startsWith("mailto"))) { boolean uri = true; // either add uri ... diff --git a/src/main/java/org/lobid/resources/RdfModelFileWriter.java b/src/main/java/org/lobid/resources/RdfModelFileWriter.java new file mode 100644 index 0000000000..15109c1565 --- /dev/null +++ b/src/main/java/org/lobid/resources/RdfModelFileWriter.java @@ -0,0 +1,147 @@ +/* Copyright 2013,2016 Pascal Christoph, hbz. 
+ * Licensed under the Eclipse Public License 1.0 */ + +package org.lobid.resources; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.StringWriter; +import java.io.Writer; +import java.util.NoSuchElementException; + +import org.apache.commons.io.FilenameUtils; +import org.apache.commons.io.IOUtils; +import org.apache.jena.riot.Lang; +import org.apache.jena.riot.RDFDataMgr; +import org.apache.jena.riot.RDFLanguages; +import org.culturegraph.mf.exceptions.MetafactureException; +import org.culturegraph.mf.framework.DefaultObjectReceiver; +import org.culturegraph.mf.framework.annotations.Description; +import org.culturegraph.mf.framework.annotations.In; +import org.culturegraph.mf.framework.annotations.Out; +import org.culturegraph.mf.util.xml.FilenameExtractor; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.hp.hpl.jena.rdf.model.Model; + +/** + * A sink, writing triples into files. The filenames are constructed from the + * literal of an given property. + * + * @author Pascal Christoph (dr0i) + */ +@Description("Writes the object value of an RDF model into a file. Default serialization is 'NTRIPLES'. The filename is " + + "constructed from the literal of an given property (recommended properties are identifier)." + + " Variable are " + "- 'target' (determining the output directory)" + + "- 'property' (the property in the RDF model. The object of this property" + + " will be the main part of the file's name.) " + + "- 'startIndex' ( a subfolder will be extracted out of the filename. This marks the index' beginning )" + + "- 'stopIndex' ( a subfolder will be extracted out of the filename. This marks the index' end )" + + "- 'serialization (e.g. 
one of 'NTRIPLES', 'TURTLE', 'RDFXML','RDFJSON'") +@In(Model.class) +@Out(Void.class) +public final class RdfModelFileWriter extends DefaultObjectReceiver + implements FilenameExtractor { + private static final Logger LOG = + LoggerFactory.getLogger(RdfModelFileWriter.class); + + private FilenameUtil filenameUtil = new FilenameUtil(); + private Lang serialization; + + /** + * Default constructor + */ + public RdfModelFileWriter() { + setProperty("http://purl.org/dc/terms/identifier"); + setFileSuffix("nt"); + setSerialization("NTRIPLES"); + } + + @Override + public String getEncoding() { + return filenameUtil.encoding; + } + + @Override + public void setEncoding(final String encoding) { + filenameUtil.encoding = encoding; + } + + @Override + public void setTarget(final String target) { + filenameUtil.target = target; + } + + @Override + public void setProperty(final String property) { + filenameUtil.property = property; + } + + @Override + public void setFileSuffix(final String fileSuffix) { + filenameUtil.fileSuffix = fileSuffix; + } + + @Override + public void setStartIndex(final int startIndex) { + filenameUtil.startIndex = startIndex; + } + + @Override + public void setEndIndex(final int endIndex) { + filenameUtil.endIndex = endIndex; + } + + /** + * Sets the rdf serialization language. 
+ * + * @param serialization the language to be serialized + */ + public void setSerialization(final String serialization) { + this.serialization = RDFLanguages.nameToLang(serialization); + } + + @Override + public void process(final Model model) { + String identifier = null; + try { + identifier = + model + .listObjectsOfProperty( + model.createProperty(filenameUtil.property)) + .next().toString(); + LOG.debug("Going to store identifier=" + identifier); + } catch (NoSuchElementException e) { + LOG.warn( + "No identifier => cannot derive a filename for " + model.toString()); + return; + } + + String directory = identifier; + if (directory.length() >= filenameUtil.endIndex) { + directory = + directory.substring(filenameUtil.startIndex, filenameUtil.endIndex); + } + final String file = FilenameUtils.concat(filenameUtil.target, + FilenameUtils.concat(directory + File.separator, + identifier + "." + filenameUtil.fileSuffix)); + LOG.debug("Write to " + file); + filenameUtil.ensurePathExists(file); + + try ( + final Writer writer = new OutputStreamWriter(new FileOutputStream(file), + filenameUtil.encoding)) { + final StringWriter tripleWriter = new StringWriter(); + RDFDataMgr.write(tripleWriter, model, this.serialization); + IOUtils.write(tripleWriter.toString(), writer); + writer.close(); + } catch (IOException e) { + e.printStackTrace(); + throw new MetafactureException(e); + } + } + +} diff --git a/src/main/resources/morph-hbz01-to-lobid.xml b/src/main/resources/morph-hbz01-to-lobid.xml index 886dd30f95..3a2d04e595 100644 --- a/src/main/resources/morph-hbz01-to-lobid.xml +++ b/src/main/resources/morph-hbz01-to-lobid.xml @@ -123,23 +123,37 @@ - + + + + + + + + + + + + + - + - - - - - - - - + + + + + + + + + + + + - - - @@ -343,8 +357,11 @@ + + + + - @@ -1428,11 +1445,17 @@ + + + + + + @@ -1514,6 +1537,7 @@ + diff --git a/src/test/java/org/lobid/resources/Hbz01MabXml2ElasticsearchLobidTest.java b/src/test/java/org/lobid/resources/Hbz01MabXml2ElasticsearchLobidTest.java 
index e58f99b38c..332666cb5a 100644 --- a/src/test/java/org/lobid/resources/Hbz01MabXml2ElasticsearchLobidTest.java +++ b/src/test/java/org/lobid/resources/Hbz01MabXml2ElasticsearchLobidTest.java @@ -67,9 +67,13 @@ public final class Hbz01MabXml2ElasticsearchLobidTest { static final String DIRECTORY_TO_TEST_JSON_FILES = PATH_TO_TEST + "jsonld/"; static boolean testFailed = false; + static final String NTRIPLES_DEBUG_FILES = "src/test/resources/nt"; @BeforeClass public static void setup() { + if (LOG.isDebugEnabled()) { + etlDebug(); + } node = nodeBuilder().local(true) .settings(Settings.builder().put("index.number_of_replicas", "0") .put("index.number_of_shards", "1").put("path.home", "tmp/") @@ -92,6 +96,11 @@ public static void etl(final Client cl, client = cl; final FileOpener opener = new FileOpener(); final Triples2RdfModel triple2model = new Triples2RdfModel(); + RdfModelFileWriter rdfModelFileWriter = new RdfModelFileWriter(); + rdfModelFileWriter.setProperty("http://purl.org/lobid/lv#hbzID"); + rdfModelFileWriter.setStartIndex(2); + rdfModelFileWriter.setEndIndex(7); + rdfModelFileWriter.setTarget("src/test/resources/nt"); triple2model.setInput(N_TRIPLE); opener.setReceiver(new TarReader()).setReceiver(new XmlDecoder()) .setReceiver(new AlephMabXmlHandler()) @@ -104,6 +113,29 @@ public static void etl(final Client cl, opener.closeStream(); } + /** + * Writes ntriples to the filesystem. Helper for debugging purposes. 
+ */ + public static String etlDebug() { + final FileOpener opener = new FileOpener(); + final Triples2RdfModel triple2model = new Triples2RdfModel(); + RdfModelFileWriter rdfModelFileWriter = new RdfModelFileWriter(); + rdfModelFileWriter.setProperty("http://purl.org/lobid/lv#hbzID"); + rdfModelFileWriter.setStartIndex(2); + rdfModelFileWriter.setEndIndex(7); + rdfModelFileWriter.setTarget(NTRIPLES_DEBUG_FILES); + triple2model.setInput(N_TRIPLE); + opener.setReceiver(new TarReader()).setReceiver(new XmlDecoder()) + .setReceiver(new AlephMabXmlHandler()) + .setReceiver( + new Metamorph("src/main/resources/morph-hbz01-to-lobid.xml")) + .setReceiver(new PipeEncodeTriples()).setReceiver(triple2model) + .setReceiver(rdfModelFileWriter); + opener.process(new File(TEST_FILENAME_ALEPHXMLCLOBS).getAbsolutePath()); + opener.closeStream(); + return "Created files, see " + NTRIPLES_DEBUG_FILES; + } + @SuppressWarnings("static-method") @Test public void testJson() {