diff --git a/.travis.yml b/.travis.yml index fbe7b2968c..3049845625 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,6 +4,7 @@ dist: trusty env: - ACTIVATOR_VERSION=1.3.10 install: + - bash install-dependencies.sh - mvn clean install -e -q --settings settings.xml before_script: - cd web diff --git a/install-dependencies.sh b/install-dependencies.sh new file mode 100644 index 0000000000..ac6d851d29 --- /dev/null +++ b/install-dependencies.sh @@ -0,0 +1,6 @@ +#!/bin/bash +mkdir metafacture-core +git clone https://github.com/metafacture/metafacture-core.git +cd metafacture-core +./gradlew install +cd .. diff --git a/pom.xml b/pom.xml index d459b2e121..458d35e8de 100644 --- a/pom.xml +++ b/pom.xml @@ -13,12 +13,17 @@ 1.7.25 + + org.metafacture + metafacture-strings + 5.1.0 + org.metafacture metafacture-io 5.1.0 - + org.metafacture metafacture-json 5.1.0 @@ -26,7 +31,7 @@ org.metafacture metafacture-biblio - 5.1.0 + master-SNAPSHOT org.metafacture diff --git a/src/test/java/org/lobid/resources/AlmaMarc21XmlToLobidJsonTest.java b/src/test/java/org/lobid/resources/AlmaMarc21XmlToLobidJsonTest.java index d1a084b12a..bab8e49b36 100644 --- a/src/test/java/org/lobid/resources/AlmaMarc21XmlToLobidJsonTest.java +++ b/src/test/java/org/lobid/resources/AlmaMarc21XmlToLobidJsonTest.java @@ -23,9 +23,17 @@ import org.metafacture.io.FileOpener; import org.metafacture.io.ObjectStdoutWriter; import org.metafacture.io.ObjectWriter; +import org.metafacture.io.TarReader; import org.metafacture.json.JsonEncoder; +import org.metafacture.mangling.LiteralToObject; import org.metafacture.metamorph.Metamorph; +import org.metafacture.monitoring.StreamBatchLogger; +import org.metafacture.strings.StringFilter; +import org.metafacture.strings.StringReader; +import org.metafacture.xml.SimpleXmlEncoder; import org.metafacture.xml.XmlDecoder; +import org.metafacture.xml.XmlElementSplitter; +import org.metafacture.xml.XmlFilenameWriter; /** * Test transformations of Alma MARC21 XML catalog data into lobid JSON-LD. @@ -37,13 +45,17 @@ public final class AlmaMarc21XmlToLobidJsonTest { private static final String MORPH = "src/main/resources/alma/alma.xml"; private static final File DIRECTORY = new File("src/test/resources/alma/"); + private static final String BIG_ALMA_XML_FILE = + DIRECTORY + "/HT012734833_etAl.xml.tar.bz2"; private static final String XML = "xml"; final HashMap morphVariables = new HashMap<>(); - private static boolean GENERATE_TESTDATA = + private static boolean GENERATE_TESTDATA = System.getProperty("generateTestData", "false").equals("true"); private static final PrintStream ORIG_OUT = System.out; private static final Logger LOG = LogManager.getLogger(AlmaMarc21XmlToLobidJsonTest.class); + private static final String PATTERN_TO_IDENTIFY_XML_RECORDS = + "HT005207972|HT012734833|KUR00770801"; /** * Sets necessary morph variables. @@ -53,11 +65,60 @@ public void setup() { morphVariables.put("isil", "DE-632"); morphVariables.put("member", "DE-605"); morphVariables.put("catalogid", "DE-605"); + GENERATE_TESTDATA = true; + if (GENERATE_TESTDATA) { + extractXmlTestRecords(PATTERN_TO_IDENTIFY_XML_RECORDS); + } + } + + /** + * Splits xml and extracts records hit by a pattern. Needs 50 secs for 100.000 + * resources in a 44_MB_XML.tar.gz. It's 100 times faster than Filter(morph). + * This method helps to update the Marc-Xml test files by identifying the + * records, determining the name of the file using an xpath to get the value + * from `035 .a` and writes this into the test directory. + * + * The files are not pretty printed but untouched, though. + * + * @param pattern the pattern which is searched for to identify xml records + */ + public static void extractXmlTestRecords(final String pattern) { + long startTime = System.currentTimeMillis(); + XmlElementSplitter xmlElementSplitter = new XmlElementSplitter(); + xmlElementSplitter.setElementName("record"); + XmlElementSplitter xmlElementSplitter_1 = new XmlElementSplitter(); + xmlElementSplitter_1.setElementName("record"); + final StringFilter stringFilter = new StringFilter(pattern); + XmlFilenameWriter xmlFilenameWriter = new XmlFilenameWriter(); + xmlFilenameWriter + .setProperty("/record/datafield[@tag='035']/subfield[@code='a']"); + xmlFilenameWriter.setTarget("src/test/resources/alma/"); + StreamBatchLogger logger = new StreamBatchLogger(); + logger.setBatchSize(10); + FileOpener opener = new FileOpener(); + SimpleXmlEncoder simpleXmlEncoder = new SimpleXmlEncoder(); + simpleXmlEncoder.setSeparateRoots(true); + opener.setReceiver(new TarReader()) // + .setReceiver(new XmlDecoder()) // + .setReceiver(xmlElementSplitter) // + .setReceiver(logger) // + .setReceiver(new LiteralToObject()) // + .setReceiver(stringFilter) + .setReceiver(new StringReader()) // + .setReceiver(new XmlDecoder()) // + .setReceiver(xmlElementSplitter_1) // + .setReceiver(xmlFilenameWriter); + + opener.process(BIG_ALMA_XML_FILE); + opener.closeStream(); + long endTime = System.currentTimeMillis(); + LOG.info("Time needed:" + (endTime - startTime) / 1000); } /** * Cleans a bit up. Sets the System.out to the original PrintStream. */ + @SuppressWarnings("static-method") @After public void cleanup() { System.setOut(ORIG_OUT); @@ -74,6 +135,8 @@ public void cleanup() { public void transformFiles() { Arrays.asList(DIRECTORY.listFiles(f -> f.getAbsolutePath().endsWith(XML))) .forEach(file -> { + MarcXmlHandler marcXmlHandler = new MarcXmlHandler(); + marcXmlHandler.setNamespace(null); JsonEncoder jsonEncoder = new JsonEncoder(); jsonEncoder.setPrettyPrinting(true); ObjectMapper mapper = new ObjectMapper(); @@ -82,7 +145,7 @@ public void transformFiles() { try { FileOpener opener = new FileOpener(); opener.setReceiver(new XmlDecoder()) - .setReceiver(new MarcXmlHandler()) + .setReceiver(marcXmlHandler) .setReceiver(new Metamorph(MORPH, morphVariables)) .setReceiver(jsonEncoder); @@ -97,7 +160,8 @@ public void transformFiles() { opener.process(file.getAbsolutePath()); opener.closeStream(); if (!GENERATE_TESTDATA) { - JsonNode expectedJsonNode = mapper.readTree(new File(filenameJson)); + JsonNode expectedJsonNode = + mapper.readTree(new File(filenameJson)); Object expectedJsonObject = mapper.readValue(expectedJsonNode.toString(), Object.class); String expectedJson = mapper.writerWithDefaultPrettyPrinter() diff --git a/src/test/resources/alma/HT005207972.json b/src/test/resources/alma/HT005207972.json index 7a4494f6f4..47956a009f 100644 --- a/src/test/resources/alma/HT005207972.json +++ b/src/test/resources/alma/HT005207972.json @@ -107,7 +107,7 @@ }, "resultOf" : { "type" : [ "CreateAction" ], - "endTime" : "2020-12-18T14:44:37", + "endTime" : "2021-01-05T14:18:03", "instrument" : { "id" : "https://github.com/hbz/lobid-resources", "type" : [ "SoftwareApplication" ], diff --git a/src/test/resources/alma/HT005207972.xml b/src/test/resources/alma/HT005207972.xml index 7b23e07547..81a34dec77 100644 --- a/src/test/resources/alma/HT005207972.xml +++ b/src/test/resources/alma/HT005207972.xml @@ -1,7 +1,5 @@ - - - - 00000nam#a2200000#c#4500 + + 00000nam#a2200000#c#4500 DE-605 20130124165800.0 tu @@ -63,7 +61,7 @@ XIV, 633 S. - Arbeitsökonomie + Arbeitsökonomie s (DE-588)4322126-9 @@ -117,7 +115,7 @@ 49HBZ_BIE 991020238039706442 - Universität Bielefeld + Universität Bielefeld LOCAL @@ -198,6 +196,5 @@ SL800 M367(3) 138_1155937+01 49HBZ_BIE - - - + + \ No newline at end of file diff --git a/src/test/resources/alma/HT012734833.json b/src/test/resources/alma/HT012734833.json index 0930eadff5..f598267658 100644 --- a/src/test/resources/alma/HT012734833.json +++ b/src/test/resources/alma/HT012734833.json @@ -79,7 +79,7 @@ }, "resultOf" : { "type" : [ "CreateAction" ], - "endTime" : "2020-12-18T14:44:37", + "endTime" : "2021-01-05T14:18:03", "instrument" : { "id" : "https://github.com/hbz/lobid-resources", "type" : [ "SoftwareApplication" ], diff --git a/src/test/resources/alma/HT012734833.xml b/src/test/resources/alma/HT012734833.xml index 572d2bfff6..fa67727745 100644 --- a/src/test/resources/alma/HT012734833.xml +++ b/src/test/resources/alma/HT012734833.xml @@ -1,7 +1,5 @@ - - - - 00000nms#a2200000#c#4500 + + 00000nms#a2200000#c#4500 DE-605 20191010224400.0 cr#||||||||||| @@ -102,7 +100,7 @@ 1989 - C!URL-Ä(19-11-14) + C!URL-Ä(19-11-14) Druckausg. @@ -122,7 +120,7 @@ http://gateway.ovid.com/ovidweb.cgi?T=JS&NEWS=N&PAGE=toc&SEARCH=00008877-000000000-00000.kc&LINKTYPE=asBody&LINKPOS=1&D=ovft Verlag; 1.1989 - 15.2004 - Deutschlandweit zugänglich + Deutschlandweit zugänglich ZDB-1-LWW @@ -229,7 +227,7 @@ 5333897520006444 49HBZ_FHA static - Deutschlandweit zugänglich + Deutschlandweit zugänglich 2020-07-23 14:31:47 Europe/Berlin 2020-07-23 14:31:42 Europe/Berlin https://eu04.alma.exlibrisgroup.com/view/uresolver/49HBZ_NETWORK/openurl?u.ignore_date_coverage=true&portfolio_pid=5333897520006444&Force_direct=true @@ -247,7 +245,7 @@ 5333897560006444 49HBZ_FHA static - Deutschlandweit zugänglich + Deutschlandweit zugänglich 2020-07-23 14:31:47 Europe/Berlin 2020-07-23 14:31:42 Europe/Berlin https://eu04.alma.exlibrisgroup.com/view/uresolver/49HBZ_NETWORK/openurl?u.ignore_date_coverage=true&portfolio_pid=5333897560006444&Force_direct=true @@ -265,7 +263,7 @@ 5333897540006444 49HBZ_FHA static - Deutschlandweit zugänglich + Deutschlandweit zugänglich 2020-07-23 14:31:47 Europe/Berlin 2020-07-23 14:31:42 Europe/Berlin https://eu04.alma.exlibrisgroup.com/view/uresolver/49HBZ_NETWORK/openurl?u.ignore_date_coverage=true&portfolio_pid=5333897540006444&Force_direct=true @@ -283,7 +281,7 @@ 5333897500006444 49HBZ_FHA static - Deutschlandweit zugänglich + Deutschlandweit zugänglich 2020-07-23 14:31:47 Europe/Berlin 2020-07-23 14:31:42 Europe/Berlin https://eu04.alma.exlibrisgroup.com/view/uresolver/49HBZ_NETWORK/openurl?u.ignore_date_coverage=true&portfolio_pid=5333897500006444&Force_direct=true @@ -301,7 +299,7 @@ 5375175020006445 49HBZ_UBD static - Deutschlandweit zugänglich + Deutschlandweit zugänglich 2020-07-23 13:09:58 Europe/Berlin 2020-07-23 13:09:49 Europe/Berlin https://eu04.alma.exlibrisgroup.com/view/uresolver/49HBZ_NETWORK/openurl?u.ignore_date_coverage=true&portfolio_pid=5375175020006445&Force_direct=true @@ -319,7 +317,7 @@ 5375174980006445 49HBZ_UBD static - Deutschlandweit zugänglich + Deutschlandweit zugänglich 2020-07-23 13:09:58 Europe/Berlin 2020-07-23 13:09:49 Europe/Berlin https://eu04.alma.exlibrisgroup.com/view/uresolver/49HBZ_NETWORK/openurl?u.ignore_date_coverage=true&portfolio_pid=5375174980006445&Force_direct=true @@ -337,7 +335,7 @@ 5375175000006445 49HBZ_UBD static - Deutschlandweit zugänglich + Deutschlandweit zugänglich 2020-07-23 13:09:58 Europe/Berlin 2020-07-23 13:09:49 Europe/Berlin https://eu04.alma.exlibrisgroup.com/view/uresolver/49HBZ_NETWORK/openurl?u.ignore_date_coverage=true&portfolio_pid=5375175000006445&Force_direct=true @@ -355,7 +353,7 @@ 5375174960006445 49HBZ_UBD static - Deutschlandweit zugänglich + Deutschlandweit zugänglich 2020-07-23 13:09:58 Europe/Berlin 2020-07-23 13:09:49 Europe/Berlin https://eu04.alma.exlibrisgroup.com/view/uresolver/49HBZ_NETWORK/openurl?u.ignore_date_coverage=true&portfolio_pid=5375174960006445&Force_direct=true @@ -368,5 +366,4 @@ BOOK 5375174960006445 - - + \ No newline at end of file diff --git a/src/test/resources/alma/HT012734833_etAl.xml.tar.bz2 b/src/test/resources/alma/HT012734833_etAl.xml.tar.bz2 new file mode 100644 index 0000000000..b68321c632 Binary files /dev/null and b/src/test/resources/alma/HT012734833_etAl.xml.tar.bz2 differ diff --git a/src/test/resources/alma/KUR00770801.json b/src/test/resources/alma/KUR00770801.json index de75baf8ed..ddee464679 100644 --- a/src/test/resources/alma/KUR00770801.json +++ b/src/test/resources/alma/KUR00770801.json @@ -70,7 +70,7 @@ }, "resultOf" : { "type" : [ "CreateAction" ], - "endTime" : "2020-12-18T14:44:37", + "endTime" : "2021-01-05T14:18:02", "instrument" : { "id" : "https://github.com/hbz/lobid-resources", "type" : [ "SoftwareApplication" ], diff --git a/src/test/resources/alma/KUR00770801.xml b/src/test/resources/alma/KUR00770801.xml index 9702e070bb..96283c7054 100644 --- a/src/test/resources/alma/KUR00770801.xml +++ b/src/test/resources/alma/KUR00770801.xml @@ -1,6 +1,4 @@ - - - + 00000nam#a2200000#c#4500 20200518164500.0 cu#||||||||||| @@ -34,7 +32,7 @@ Sebastopol, Calif. - O'Reilly Media + O'Reilly Media c2010 @@ -72,7 +70,7 @@ Connect to this resource online http://proquest.tech.safaribooksonline.de/?uiCode=Duesseldorf&xmlId=9780735656260 - Zugriff nur im Hochschulnetz der Universität Düsseldorf + Zugriff nur im Hochschulnetz der Universität Düsseldorf safari-2019 @@ -81,7 +79,7 @@ 49HBZ_DUE 990042506810206443 - Universität Düsseldorf + Universität Düsseldorf ILS @@ -93,5 +91,4 @@ import 2020-07-01 12:09:43 Europe/Berlin - - + \ No newline at end of file