diff --git a/.travis.yml b/.travis.yml
index fbe7b2968c..3049845625 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,6 +4,7 @@ dist: trusty
env:
- ACTIVATOR_VERSION=1.3.10
install:
+ - bash install-dependencies.sh
- mvn clean install -e -q --settings settings.xml
before_script:
- cd web
diff --git a/install-dependencies.sh b/install-dependencies.sh
new file mode 100644
index 0000000000..ac6d851d29
--- /dev/null
+++ b/install-dependencies.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Clones metafacture-core (unless a checkout already exists) and runs its
+# Gradle `install` task; presumably this provides the master-SNAPSHOT
+# artifact that pom.xml depends on -- TODO confirm.
+set -euo pipefail # stop at the first failing step instead of carrying on
+[ -d metafacture-core/.git ] || git clone https://github.com/metafacture/metafacture-core.git
+cd metafacture-core
+./gradlew install
+cd ..
diff --git a/pom.xml b/pom.xml
index d459b2e121..458d35e8de 100644
--- a/pom.xml
+++ b/pom.xml
@@ -13,12 +13,17 @@
1.7.25
+
+ org.metafacture
+ metafacture-strings
+ 5.1.0
+
org.metafacture
metafacture-io
5.1.0
-
+
org.metafacture
metafacture-json
5.1.0
@@ -26,7 +31,7 @@
org.metafacture
metafacture-biblio
- 5.1.0
+ master-SNAPSHOT
org.metafacture
diff --git a/src/test/java/org/lobid/resources/AlmaMarc21XmlToLobidJsonTest.java b/src/test/java/org/lobid/resources/AlmaMarc21XmlToLobidJsonTest.java
index d1a084b12a..bab8e49b36 100644
--- a/src/test/java/org/lobid/resources/AlmaMarc21XmlToLobidJsonTest.java
+++ b/src/test/java/org/lobid/resources/AlmaMarc21XmlToLobidJsonTest.java
@@ -23,9 +23,17 @@
import org.metafacture.io.FileOpener;
import org.metafacture.io.ObjectStdoutWriter;
import org.metafacture.io.ObjectWriter;
+import org.metafacture.io.TarReader;
import org.metafacture.json.JsonEncoder;
+import org.metafacture.mangling.LiteralToObject;
import org.metafacture.metamorph.Metamorph;
+import org.metafacture.monitoring.StreamBatchLogger;
+import org.metafacture.strings.StringFilter;
+import org.metafacture.strings.StringReader;
+import org.metafacture.xml.SimpleXmlEncoder;
import org.metafacture.xml.XmlDecoder;
+import org.metafacture.xml.XmlElementSplitter;
+import org.metafacture.xml.XmlFilenameWriter;
/**
* Test transformations of Alma MARC21 XML catalog data into lobid JSON-LD.
@@ -37,13 +45,17 @@ public final class AlmaMarc21XmlToLobidJsonTest {
private static final String MORPH = "src/main/resources/alma/alma.xml";
private static final File DIRECTORY = new File("src/test/resources/alma/");
+ private static final String BIG_ALMA_XML_FILE =
+ DIRECTORY + "/HT012734833_etAl.xml.tar.bz2";
private static final String XML = "xml";
final HashMap morphVariables = new HashMap<>();
- private static boolean GENERATE_TESTDATA =
+ private static boolean GENERATE_TESTDATA =
System.getProperty("generateTestData", "false").equals("true");
private static final PrintStream ORIG_OUT = System.out;
private static final Logger LOG =
LogManager.getLogger(AlmaMarc21XmlToLobidJsonTest.class);
+ private static final String PATTERN_TO_IDENTIFY_XML_RECORDS =
+ "HT005207972|HT012734833|KUR00770801";
/**
* Sets necessary morph variables.
@@ -53,11 +65,60 @@ public void setup() {
morphVariables.put("isil", "DE-632");
morphVariables.put("member", "DE-605");
morphVariables.put("catalogid", "DE-605");
+		// Regenerate the xml test records only on request (-DgenerateTestData=true).
+		if (GENERATE_TESTDATA) {
+			extractXmlTestRecords(PATTERN_TO_IDENTIFY_XML_RECORDS);
+		}
+ }
+
+	/**
+	 * Extracts the records hit by a pattern from the Alma Marc-Xml archive to
+	 * update the Marc-Xml test files: the xml is split into records, the records
+	 * matching the pattern are identified, and each one is written into the test
+	 * directory, using an xpath on `035 .a` to determine the name of the file.
+	 * The files are not pretty printed but untouched, though. Needs 50 secs for
+	 * 100.000 resources in a 44_MB_XML.tar.gz, which is 100 times faster than
+	 * Filter(morph). The stream is closed even if processing fails.
+	 *
+	 * @param pattern the pattern which is searched for to identify xml records
+	 */
+	public static void extractXmlTestRecords(final String pattern) {
+		final long startTime = System.currentTimeMillis();
+		// Two separate instances: one per position in the pipeline below.
+		final XmlElementSplitter recordSplitter = new XmlElementSplitter();
+		recordSplitter.setElementName("record");
+		final XmlElementSplitter matchSplitter = new XmlElementSplitter();
+		matchSplitter.setElementName("record");
+		final XmlFilenameWriter xmlFilenameWriter = new XmlFilenameWriter();
+		xmlFilenameWriter
+				.setProperty("/record/datafield[@tag='035']/subfield[@code='a']");
+		xmlFilenameWriter.setTarget("src/test/resources/alma/");
+		final StreamBatchLogger logger = new StreamBatchLogger();
+		logger.setBatchSize(10);
+		final FileOpener opener = new FileOpener();
+		opener.setReceiver(new TarReader()) //
+				.setReceiver(new XmlDecoder()) //
+				.setReceiver(recordSplitter) //
+				.setReceiver(logger) //
+				.setReceiver(new LiteralToObject()) //
+				.setReceiver(new StringFilter(pattern)) //
+				.setReceiver(new StringReader()) //
+				.setReceiver(new XmlDecoder()) //
+				.setReceiver(matchSplitter) //
+				.setReceiver(xmlFilenameWriter);
+		try {
+			opener.process(BIG_ALMA_XML_FILE);
+		} finally {
+			// Always close the pipeline so a failing run does not leak the stream.
+			opener.closeStream();
+		}
+		LOG.info("Time needed:" + (System.currentTimeMillis() - startTime) / 1000);
 	}
/**
* Cleans a bit up. Sets the System.out to the original PrintStream.
*/
+ @SuppressWarnings("static-method")
@After
public void cleanup() {
System.setOut(ORIG_OUT);
@@ -74,6 +135,8 @@ public void cleanup() {
public void transformFiles() {
Arrays.asList(DIRECTORY.listFiles(f -> f.getAbsolutePath().endsWith(XML)))
.forEach(file -> {
+ MarcXmlHandler marcXmlHandler = new MarcXmlHandler();
+ marcXmlHandler.setNamespace(null);
JsonEncoder jsonEncoder = new JsonEncoder();
jsonEncoder.setPrettyPrinting(true);
ObjectMapper mapper = new ObjectMapper();
@@ -82,7 +145,7 @@ public void transformFiles() {
try {
FileOpener opener = new FileOpener();
opener.setReceiver(new XmlDecoder())
- .setReceiver(new MarcXmlHandler())
+ .setReceiver(marcXmlHandler)
.setReceiver(new Metamorph(MORPH, morphVariables))
.setReceiver(jsonEncoder);
@@ -97,7 +160,8 @@ public void transformFiles() {
opener.process(file.getAbsolutePath());
opener.closeStream();
if (!GENERATE_TESTDATA) {
- JsonNode expectedJsonNode = mapper.readTree(new File(filenameJson));
+ JsonNode expectedJsonNode =
+ mapper.readTree(new File(filenameJson));
Object expectedJsonObject =
mapper.readValue(expectedJsonNode.toString(), Object.class);
String expectedJson = mapper.writerWithDefaultPrettyPrinter()
diff --git a/src/test/resources/alma/HT005207972.json b/src/test/resources/alma/HT005207972.json
index 7a4494f6f4..47956a009f 100644
--- a/src/test/resources/alma/HT005207972.json
+++ b/src/test/resources/alma/HT005207972.json
@@ -107,7 +107,7 @@
},
"resultOf" : {
"type" : [ "CreateAction" ],
- "endTime" : "2020-12-18T14:44:37",
+ "endTime" : "2021-01-05T14:18:03",
"instrument" : {
"id" : "https://github.com/hbz/lobid-resources",
"type" : [ "SoftwareApplication" ],
diff --git a/src/test/resources/alma/HT005207972.xml b/src/test/resources/alma/HT005207972.xml
index 7b23e07547..81a34dec77 100644
--- a/src/test/resources/alma/HT005207972.xml
+++ b/src/test/resources/alma/HT005207972.xml
@@ -1,7 +1,5 @@
-
-
-
- 00000nam#a2200000#c#4500
+
+ 00000nam#a2200000#c#4500
DE-605
20130124165800.0
tu
@@ -63,7 +61,7 @@
XIV, 633 S.
- Arbeitsökonomie
+ Arbeitsökonomie
s
(DE-588)4322126-9
@@ -117,7 +115,7 @@
49HBZ_BIE
991020238039706442
- Universität Bielefeld
+ Universität Bielefeld
LOCAL
@@ -198,6 +196,5 @@
SL800 M367(3)
138_1155937+01
49HBZ_BIE
-
-
-
+
+
\ No newline at end of file
diff --git a/src/test/resources/alma/HT012734833.json b/src/test/resources/alma/HT012734833.json
index 0930eadff5..f598267658 100644
--- a/src/test/resources/alma/HT012734833.json
+++ b/src/test/resources/alma/HT012734833.json
@@ -79,7 +79,7 @@
},
"resultOf" : {
"type" : [ "CreateAction" ],
- "endTime" : "2020-12-18T14:44:37",
+ "endTime" : "2021-01-05T14:18:03",
"instrument" : {
"id" : "https://github.com/hbz/lobid-resources",
"type" : [ "SoftwareApplication" ],
diff --git a/src/test/resources/alma/HT012734833.xml b/src/test/resources/alma/HT012734833.xml
index 572d2bfff6..fa67727745 100644
--- a/src/test/resources/alma/HT012734833.xml
+++ b/src/test/resources/alma/HT012734833.xml
@@ -1,7 +1,5 @@
-
-
-
- 00000nms#a2200000#c#4500
+
+ 00000nms#a2200000#c#4500
DE-605
20191010224400.0
cr#|||||||||||
@@ -102,7 +100,7 @@
1989
- C!URL-Ä(19-11-14)
+ C!URL-Ä(19-11-14)
Druckausg.
@@ -122,7 +120,7 @@
http://gateway.ovid.com/ovidweb.cgi?T=JS&NEWS=N&PAGE=toc&SEARCH=00008877-000000000-00000.kc&LINKTYPE=asBody&LINKPOS=1&D=ovft
Verlag; 1.1989 - 15.2004
- Deutschlandweit zugänglich
+ Deutschlandweit zugänglich
ZDB-1-LWW
@@ -229,7 +227,7 @@
5333897520006444
49HBZ_FHA
static
- Deutschlandweit zugänglich
+ Deutschlandweit zugänglich
2020-07-23 14:31:47 Europe/Berlin
2020-07-23 14:31:42 Europe/Berlin
https://eu04.alma.exlibrisgroup.com/view/uresolver/49HBZ_NETWORK/openurl?u.ignore_date_coverage=true&portfolio_pid=5333897520006444&Force_direct=true
@@ -247,7 +245,7 @@
5333897560006444
49HBZ_FHA
static
- Deutschlandweit zugänglich
+ Deutschlandweit zugänglich
2020-07-23 14:31:47 Europe/Berlin
2020-07-23 14:31:42 Europe/Berlin
https://eu04.alma.exlibrisgroup.com/view/uresolver/49HBZ_NETWORK/openurl?u.ignore_date_coverage=true&portfolio_pid=5333897560006444&Force_direct=true
@@ -265,7 +263,7 @@
5333897540006444
49HBZ_FHA
static
- Deutschlandweit zugänglich
+ Deutschlandweit zugänglich
2020-07-23 14:31:47 Europe/Berlin
2020-07-23 14:31:42 Europe/Berlin
https://eu04.alma.exlibrisgroup.com/view/uresolver/49HBZ_NETWORK/openurl?u.ignore_date_coverage=true&portfolio_pid=5333897540006444&Force_direct=true
@@ -283,7 +281,7 @@
5333897500006444
49HBZ_FHA
static
- Deutschlandweit zugänglich
+ Deutschlandweit zugänglich
2020-07-23 14:31:47 Europe/Berlin
2020-07-23 14:31:42 Europe/Berlin
https://eu04.alma.exlibrisgroup.com/view/uresolver/49HBZ_NETWORK/openurl?u.ignore_date_coverage=true&portfolio_pid=5333897500006444&Force_direct=true
@@ -301,7 +299,7 @@
5375175020006445
49HBZ_UBD
static
- Deutschlandweit zugänglich
+ Deutschlandweit zugänglich
2020-07-23 13:09:58 Europe/Berlin
2020-07-23 13:09:49 Europe/Berlin
https://eu04.alma.exlibrisgroup.com/view/uresolver/49HBZ_NETWORK/openurl?u.ignore_date_coverage=true&portfolio_pid=5375175020006445&Force_direct=true
@@ -319,7 +317,7 @@
5375174980006445
49HBZ_UBD
static
- Deutschlandweit zugänglich
+ Deutschlandweit zugänglich
2020-07-23 13:09:58 Europe/Berlin
2020-07-23 13:09:49 Europe/Berlin
https://eu04.alma.exlibrisgroup.com/view/uresolver/49HBZ_NETWORK/openurl?u.ignore_date_coverage=true&portfolio_pid=5375174980006445&Force_direct=true
@@ -337,7 +335,7 @@
5375175000006445
49HBZ_UBD
static
- Deutschlandweit zugänglich
+ Deutschlandweit zugänglich
2020-07-23 13:09:58 Europe/Berlin
2020-07-23 13:09:49 Europe/Berlin
https://eu04.alma.exlibrisgroup.com/view/uresolver/49HBZ_NETWORK/openurl?u.ignore_date_coverage=true&portfolio_pid=5375175000006445&Force_direct=true
@@ -355,7 +353,7 @@
5375174960006445
49HBZ_UBD
static
- Deutschlandweit zugänglich
+ Deutschlandweit zugänglich
2020-07-23 13:09:58 Europe/Berlin
2020-07-23 13:09:49 Europe/Berlin
https://eu04.alma.exlibrisgroup.com/view/uresolver/49HBZ_NETWORK/openurl?u.ignore_date_coverage=true&portfolio_pid=5375174960006445&Force_direct=true
@@ -368,5 +366,4 @@
BOOK
5375174960006445
-
-
+
\ No newline at end of file
diff --git a/src/test/resources/alma/HT012734833_etAl.xml.tar.bz2 b/src/test/resources/alma/HT012734833_etAl.xml.tar.bz2
new file mode 100644
index 0000000000..b68321c632
Binary files /dev/null and b/src/test/resources/alma/HT012734833_etAl.xml.tar.bz2 differ
diff --git a/src/test/resources/alma/KUR00770801.json b/src/test/resources/alma/KUR00770801.json
index de75baf8ed..ddee464679 100644
--- a/src/test/resources/alma/KUR00770801.json
+++ b/src/test/resources/alma/KUR00770801.json
@@ -70,7 +70,7 @@
},
"resultOf" : {
"type" : [ "CreateAction" ],
- "endTime" : "2020-12-18T14:44:37",
+ "endTime" : "2021-01-05T14:18:02",
"instrument" : {
"id" : "https://github.com/hbz/lobid-resources",
"type" : [ "SoftwareApplication" ],
diff --git a/src/test/resources/alma/KUR00770801.xml b/src/test/resources/alma/KUR00770801.xml
index 9702e070bb..96283c7054 100644
--- a/src/test/resources/alma/KUR00770801.xml
+++ b/src/test/resources/alma/KUR00770801.xml
@@ -1,6 +1,4 @@
-
-
-
+
00000nam#a2200000#c#4500
20200518164500.0
cu#|||||||||||
@@ -34,7 +32,7 @@
Sebastopol, Calif.
- O'Reilly Media
+ O'Reilly Media
c2010
@@ -72,7 +70,7 @@
Connect to this resource online
http://proquest.tech.safaribooksonline.de/?uiCode=Duesseldorf&xmlId=9780735656260
- Zugriff nur im Hochschulnetz der Universität Düsseldorf
+ Zugriff nur im Hochschulnetz der Universität Düsseldorf
safari-2019
@@ -81,7 +79,7 @@
49HBZ_DUE
990042506810206443
- Universität Düsseldorf
+ Universität Düsseldorf
ILS
@@ -93,5 +91,4 @@
import
2020-07-01 12:09:43 Europe/Berlin
-
-
+
\ No newline at end of file