diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 358280f..0f44e53 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -26,8 +26,8 @@ before_script: tests: stage: test script: - - pip install -U pytest - - pip install . + # install project including optional test dependencies + - pip install -U .[tests] - pytest # https://docs.gitlab.com/ee/ci/testing/test_coverage_visualization.html#python-example diff --git a/pyproject.toml b/pyproject.toml index 0853de6..05e7ffe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,11 +34,11 @@ dependencies = [ # We install vocexcel directly from a specific branch form github (or gitea in LIKAT) "vocexcel @ git+https://github.com/nfdi4cat/VocExcel.git@nfdi4cat-dev", #vocexcel @ git+https://gogs.catalysis.de/LIKAT_RDM/VocExcel.git@nfdi4cat-dev - "django>=4.0.0", - "networkx>=2.8", - "openpyxl>=3.0.9", - "pillow>=9.1.0", - "ontospy[Full] >= 1.9.9.4", + "django >= 4.1.3", + "networkx >= 2.8", + "openpyxl >= 3.0.9", + "pillow >= 9.1.0", + "ontospy >= 2.1.0", ] #dynamic = ["version", "readme"] @@ -58,7 +58,12 @@ tests = [ lint = [ "black", "isort", - "wemake-python-styleguide", +] +dev = [ + # Recursively including the project's own optional dependencies requires pip>=21.2 + "voc4cat[tests,lint]", + "ruff", + "wemake-python-styleguide", ] [project.scripts] @@ -155,6 +160,7 @@ exclude_lines = [ "raise AssertionError", "raise NotImplementedError", "return NotImplemented", + "if __name__ == .__main__.:", ] [tool.coverage.html] diff --git a/src/voc4cat/__init__.py b/src/voc4cat/__init__.py index e69de29..a1edfa9 100644 --- a/src/voc4cat/__init__.py +++ b/src/voc4cat/__init__.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- +from importlib.metadata import PackageNotFoundError, version + +try: + __version__ = version("voc4cat") +except PackageNotFoundError: + # package is not installed + try: + from ._version import version as __version__ + except ImportError: + __version__ = "0.0.0" diff --git a/src/voc4cat/merge_vocab.py b/src/voc4cat/merge_vocab.py index 18333b9..216b4d1 100644 --- a/src/voc4cat/merge_vocab.py +++ b/src/voc4cat/merge_vocab.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # This script is mainly useful for CI. import os import shutil @@ -5,48 +6,50 @@ import sys from pathlib import Path -VOCAB_DIR = Path("vocabularies") -OUTBOX = Path("outbox") - -def main(outbox, vocab): +def main(ttl_inbox, vocab): """ - Sync ttl-files in outbox and vocab folder + Sync ttl-files from ttl_inbox into vocab folder New files are copied, existing files are synced by git merge-file.
""" retcode = 0 - for p in os.listdir(outbox): - new = outbox / Path(p) + for p in os.listdir(ttl_inbox): + new = ttl_inbox / Path(p) if not new.suffix == ".ttl" or new.is_dir(): - print(f'skipping "{new}"') + print(f'Skipping "{new}"') continue if os.path.exists(vocab / Path(new).name): exists = vocab / Path(new).name cmd = ["git", "merge-file", "--theirs", str(exists), str(exists), str(new)] - print(" ".join(cmd)) + print("Running cmd: {0}".format(" ".join(cmd))) outp = subprocess.run(cmd, capture_output=True) print(outp.stdout) if retcode := outp.returncode != 0: break else: - print(f'copying "{new}" to "{vocab}"') + print(f'Copying "{new}" to "{vocab}"') shutil.copy(new, vocab) return retcode -def run(): - if len(sys.argv) != 3: - print('Usage: "python merge_vocab.py outbox_dir vocab_dir') - sys.exit(1) - outbox, vocab = sys.argv[1:3] +def main_cli(args=None): + + if args is None: # script run via entrypoint + args = sys.argv[1:] + + if len(args) != 2: + print("Usage: python merge_vocab.py <outbox_dir> <vocab_dir>") + return 1 + outbox, vocab = args if os.path.exists(outbox) and os.path.exists(vocab): retcode = main(Path(outbox), Path(vocab)) - sys.exit(retcode) - else: - print(f'This script requires both folders to exist: "{outbox}" and "{vocab}"') - sys.exit(1) + return retcode + + print(f'This script requires both folders to exist: "{outbox}" and "{vocab}"') + return 1 if __name__ == "__main__": - run() + err = main_cli(sys.argv[1:]) + sys.exit(err) diff --git a/src/voc4cat/util.py b/src/voc4cat/util.py index 18343de..2e963e0 100644 --- a/src/voc4cat/util.py +++ b/src/voc4cat/util.py @@ -115,7 +115,7 @@ def dag_to_node_levels(termgraph, baselevel=0): for sgc in nx.connected_components(termgraph.to_undirected()) ] tg_copy = termgraph.copy() - # We must find & eliminate cycles no matter which direction for the tree representation. + # We must find & eliminate cycles no matter if directed or undirected. cycles = nx.cycle_basis(tg_copy.to_undirected()) broken_edges = [] while cycles: @@ -125,8 +125,8 @@ tg_copy.remove_edge(*edge_to_break) # nx.cycle_basis does only report cycles >= 3 members. cycles = nx.cycle_basis(tg_copy.to_undirected()) - # It is not clearly documented in networkx if the direction of edges is preserved - # when converting to an undirected graph. So we better check... + # It is not clearly documented in networkx if the direction of edges is + # preserved when converting to an undirected graph. So we better check...
if termgraph.has_edge(*edge_to_break): broken_edges.append(edge_to_break) else: # pragma: no cover diff --git a/src/voc4cat/wrapper.py b/src/voc4cat/wrapper.py index baa97f3..cdd6705 100644 --- a/src/voc4cat/wrapper.py +++ b/src/voc4cat/wrapper.py @@ -11,17 +11,18 @@ import glob import os import sys -from importlib.metadata import PackageNotFoundError, version from pathlib import Path +from warnings import warn import openpyxl from django.utils.text import slugify from openpyxl.styles import Alignment, PatternFill from rdflib import URIRef -from vocexcel.convert import main +from vocexcel.convert import main as vocexcel_main from vocexcel.models import ORGANISATIONS, ORGANISATIONS_INVERSE from vocexcel.utils import EXCEL_FILE_ENDINGS, KNOWN_FILE_ENDINGS, RDF_FILE_ENDINGS +from voc4cat import __version__ from voc4cat.util import ( dag_from_indented_text, dag_from_narrower, @@ -35,15 +36,6 @@ ORGANISATIONS_INVERSE.update({v: k for k, v in ORGANISATIONS.items()}) NOW = datetime.datetime.now().strftime("%Y%m%dT%H%M%S") -try: - __version__ = version("voc4cat") -except PackageNotFoundError: - # package is not installed - try: - from ._version import version as __version__ - except ImportError: - __version__ = "0.0.0" - def is_file_available(fname, ftype): if not isinstance(ftype, list): @@ -80,7 +72,7 @@ def is_supported_template(wb): def may_overwrite(no_warn, xlf, outfile, func): if not no_warn and os.path.exists(outfile) and Path(xlf) == Path(outfile): - print( + warn( f'Option "{func.__name__}" will overwrite the existing file {outfile}\n' "Run again with --no-warn option to overwrite the file." ) @@ -100,8 +92,9 @@ def add_IRI(fpath, outfile): is_supported_template(wb) VOC_BASE_IRI = wb["Concept Scheme"].cell(row=2, column=2).value if VOC_BASE_IRI is None: - VOC_BASE_IRI = "" - elif not VOC_BASE_IRI.endswith("/"): + VOC_BASE_IRI = "https://example.org/" + wb["Concept Scheme"].cell(row=2, column=2).value = VOC_BASE_IRI + if not VOC_BASE_IRI.endswith("/"): VOC_BASE_IRI += "/" # process both worksheets @@ -109,13 +102,13 @@ def add_IRI(fpath, outfile): for sheet in ["Concepts", "Collections"]: ws = wb[sheet] # iterate over first two columns; skip header and start from row 3 - for row in ws.iter_rows(min_row=3, max_col=2): + for row in ws.iter_rows(min_row=3, max_col=2): # pragma: no branch if not row[0].value and row[1].value: concept_iri = VOC_BASE_IRI + slugify(row[1].value) concept_iri += "-coll" if sheet == "Collections" else "" ws.cell(row[0].row, column=1).value = concept_iri subsequent_empty_rows = 0 - elif row[0].value is None and row[1].value is None: + else: # stop processing a sheet after 3 empty rows if subsequent_empty_rows < 2: subsequent_empty_rows += 1 @@ -143,8 +136,9 @@ def hierarchy_from_indent(fpath, outfile, sep): concepts_indented = [] row_by_iri = {} # read concepts, determine their indentation level, then clear indentation - col_last = 10 - for row in ws.iter_rows(min_row=3, max_col=col_last): + col_last = 9 + max_row = 0 + for row in ws.iter_rows(min_row=3, max_col=col_last): # pragma: no branch iri = row[0].value row_no = row[0].row if iri and row[1].value: @@ -158,6 +152,7 @@ def hierarchy_from_indent(fpath, outfile, sep): ws.cell(row_no, column=2).value = descr ws.cell(row_no, column=2).alignment = Alignment(indent=0) concepts_indented.append(level * " " + iri) + # TODO think about how to handle language. 
if iri in row_by_iri: # merge needed # compare fields, merge if one is empty, error if different values @@ -168,7 +163,8 @@ def hierarchy_from_indent(fpath, outfile, sep): for old, new in zip(row_by_iri[iri], new_data): if (old and new) and (old != new): raise ValueError( - "Cannot merge rows for {iri}. Resolve differences manually." + f"Cannot merge rows for {iri}. " + "Resolve differences manually." ) merged.append(old if old else new) row_by_iri[iri] = merged @@ -177,7 +173,7 @@ def hierarchy_from_indent(fpath, outfile, sep): ws.cell(row_no, col_no).value for col_no in range(2, col_last) ] max_row = row_no - elif iri is None and row[1].value is None: + else: # stop processing a sheet after 3 empty rows if subsequent_empty_rows < 2: subsequent_empty_rows += 1 @@ -200,7 +196,9 @@ def hierarchy_from_indent(fpath, outfile, sep): # Clear remaining rows. first_row_to_clear = 3 + len(children_by_iri) - for row in ws.iter_rows(min_row=first_row_to_clear, max_col=col_last): + for row in ws.iter_rows( # pragma: no branch + min_row=first_row_to_clear, max_col=col_last + ): for col in range(1, col_last): ws.cell(row[0].row, column=col).value = None if row[0].row == max_row: @@ -226,10 +224,10 @@ def hierarchy_to_indent(fpath, outfile, sep): concept_children_dict = {} subsequent_empty_rows = 0 row_by_iri = {} - col_last = 10 + col_last = 9 # read all IRI, preferred labels, childrenURIs from the sheet - for rows_total, row in enumerate( - ws.iter_rows(min_row=3, max_col=10, values_only=True) + for rows_total, row in enumerate( # pragma: no branch + ws.iter_rows(min_row=3, max_col=col_last, values_only=True) ): if row[0] and row[1]: iri = row[0] @@ -253,7 +251,9 @@ def hierarchy_to_indent(fpath, outfile, sep): # Set cell values by breaking them down into individual cells iri_written = [] - for row, (iri, level) in zip(ws.iter_rows(min_row=3, max_col=10), concept_levels): + for row, (iri, level) in zip( + ws.iter_rows(min_row=3, max_col=col_last), concept_levels + ): row[0].value = iri concept_text = row_by_iri[iri][0] if sep is None: @@ -263,11 +263,13 @@ def hierarchy_to_indent(fpath, outfile, sep): row[1].value = sep * level + concept_text row[1].alignment = Alignment(indent=0) row[2].value = row_by_iri[iri][1] - for col, stored_value in zip(range(4, col_last), row_by_iri[iri][2:]): + for col, stored_value in zip(range(4, col_last + 1), row_by_iri[iri][2:]): if iri in iri_written: ws.cell(row[0].row, column=col).value = None else: ws.cell(row[0].row, column=col).value = stored_value + # clear children IRI column G + ws.cell(row[0].row, column=7).value = None iri_written.append(iri) wb.save(outfile) @@ -280,24 +282,34 @@ def run_ontospy(file_path, output_path): Generate Ontospy documentation for a file or directory of files. 
""" import ontospy - from ontospy.ontodocs.viz.viz_d3dendogram import Dataviz - from ontospy.ontodocs.viz.viz_html_single import HTMLVisualizer - - if not glob.glob("outbox/*.ttl"): - print(f'No turtle file found to document with Ontospy in "{file_path}"') + from ontospy.gendocs.viz.viz_d3dendogram import Dataviz + from ontospy.gendocs.viz.viz_html_single import HTMLVisualizer + + if Path(file_path).is_dir(): + turtle_files = glob.glob(f"{file_path}/*.ttl") + if not turtle_files: + print(f"No turtle file(s) found to document with Ontospy in {file_path}") + return 1 + elif Path(file_path).exists(): + turtle_files = [file_path] + else: + print(f"File/dir not found: {file_path}") return 1 - print(f"\nBuilding ontospy documentation for {file_path}") + for turtle_file in turtle_files: + print(f"\nBuilding ontospy documentation for {turtle_file}") + specific_output_path = Path(output_path) / Path(turtle_file).stem + + g = ontospy.Ontospy(Path(turtle_file).resolve().as_uri()) - g = ontospy.Ontospy(file_path) + docs = HTMLVisualizer(g) + docs_path = os.path.join(specific_output_path, "docs") + docs.build(docs_path) # => build and save docs/visualization. - docs = HTMLVisualizer(g) - docs_path = os.path.join(output_path, "docs") - docs.build(docs_path) # => build and save docs/visualization. + viz = Dataviz(g) + viz_path = os.path.join(specific_output_path, "dendro") + viz.build(viz_path) # => build and save docs/visualization. - viz = Dataviz(g) - viz_path = os.path.join(output_path, "dendro") - viz.build(viz_path) # => build and save docs/visualization. return 0 @@ -318,9 +330,9 @@ def check(fpath, outfile): subsequent_empty_rows = 0 seen_conceptIRIs = [] failed_check = False - for row_no, row in enumerate(ws.iter_rows(min_row=3, max_col=3), 3): + for row in ws.iter_rows(min_row=3, max_col=3): # pragma: no branch if row[0].value and row[1].value: - conceptIRI, preflabel, lang = [ + conceptIRI, _, lang = [ c.value.strip() if c.value is not None else "" for c in row ] @@ -331,7 +343,7 @@ def check(fpath, outfile): f'ERROR: Same Concept IRI "{conceptIRI}" used more than once for ' f'language "{lang}"' ) - # colorise problematic cells + # colorize problematic cells row[0].fill = color row[2].fill = color seen_in_row = 3 + seen_conceptIRIs.index(new_conceptIRI) @@ -341,7 +353,7 @@ def check(fpath, outfile): seen_conceptIRIs.append(new_conceptIRI) subsequent_empty_rows = 0 - elif row[0].value is None and row[1].value is None: + else: # stop processing a sheet after 3 empty rows if subsequent_empty_rows < 2: subsequent_empty_rows += 1 @@ -354,12 +366,14 @@ def check(fpath, outfile): print(f"Saved file with highlighted errors as {outfile}") return 1 - print("All checks passed succesfully.") + print("All checks passed successfully.") return 0 def run_vocexcel(args=None): - retval = main(args) + if args is None: # pragma: no cover + args = [] # Important! Prevents vocexcel to use args from sys.argv. 
+ retval = vocexcel_main(args) if retval is not None: return 1 return 0 @@ -432,7 +446,7 @@ def main_cli(args=None): parser.add_argument( "-f", "--forward", - help=("Forward file resulting from other running other options to vocexcel."), + help=("Forward file resulting from running other options to vocexcel."), action="store_true", ) @@ -483,12 +497,15 @@ def main_cli(args=None): ), ) - args_wrapper, vocexcel_args = parser.parse_known_args(args) + args_wrapper, _ = parser.parse_known_args(args) + vocexcel_args = args_wrapper.vocexcel_options or [] + + err = 0 # return error code if not has_args: # show help if no args are given parser.print_help() - parser.exit() + return err outdir = args_wrapper.output_directory if outdir is not None and not os.path.isdir(outdir): @@ -503,7 +520,7 @@ def main_cli(args=None): vocexcel_args.append("--logfile") vocexcel_args.append(str(logfile)) - if args_wrapper.indent_separator: + if args_wrapper.indent_separator is not None: sep = args_wrapper.indent_separator if not len(sep): raise ValueError( @@ -511,11 +528,11 @@ def main_cli(args=None): ) else: # Excel's default indent / openpyxl.styles.Alignment(indent=0) sep = None - err = 0 + if args_wrapper.version: - print(f"{__version__}") + print(f"voc4cat {__version__}") - elif args_wrapper.hierarchy_from_indent: + elif args_wrapper.hierarchy_from_indent or args_wrapper.hierarchy_to_indent: if is_file_available(args_wrapper.file_to_preprocess, ftype="excel"): fprefix, fsuffix = str(args_wrapper.file_to_preprocess).rsplit(".", 1) fname = os.path.split(fprefix)[1] # split off leading dirs @@ -524,22 +541,12 @@ def main_cli(args=None): else: outfile = Path(outdir) / Path(f"{fname}.{fsuffix}") else: - # processing all file in directory is not supported for now. + # processing all files in directory is not supported for now. raise NotImplementedError() - hierarchy_from_indent(args_wrapper.file_to_preprocess, outfile, sep) - - elif args_wrapper.hierarchy_to_indent: - if is_file_available(args_wrapper.file_to_preprocess, ftype="excel"): - fprefix, fsuffix = str(args_wrapper.file_to_preprocess).rsplit(".", 1) - fname = os.path.split(fprefix)[1] # split off leading dirs - if outdir is None: - outfile = args_wrapper.file_to_preprocess - else: - outfile = Path(outdir) / Path(f"{fname}.{fsuffix}") + if args_wrapper.hierarchy_from_indent: + hierarchy_from_indent(args_wrapper.file_to_preprocess, outfile, sep) else: - # processin all file in directory is not supported for now. 
- raise NotImplementedError() - hierarchy_to_indent(args_wrapper.file_to_preprocess, outfile, sep) + hierarchy_to_indent(args_wrapper.file_to_preprocess, outfile, sep) elif args_wrapper.add_IRI or args_wrapper.check: funcs = [ @@ -558,13 +565,13 @@ def main_cli(args=None): fprefix, fsuffix = xlf.rsplit(".", 1) fname = os.path.split(fprefix)[1] # split off leading dirs if outdir is None: - outfile = xlf + outfile = Path(xlf) else: outfile = Path(outdir) / Path(f"{fname}.{fsuffix}") - infile = xlf + infile = Path(xlf) for func in funcs: if not may_overwrite(args_wrapper.no_warn, xlf, outfile, func): - parser.exit() + return 1 err += func(infile, outfile) infile = outfile if args_wrapper.forward: @@ -576,20 +583,20 @@ def main_cli(args=None): if args_wrapper.docs and args_wrapper.forward and to_build_docs: indir = args_wrapper.file_to_preprocess if outdir is None else outdir - doc_path = infile.parent[0] if outdir is None else outdir + doc_path = infile.parent if outdir is None else outdir err += run_ontospy(indir, doc_path) elif is_file_available(args_wrapper.file_to_preprocess, ftype="excel"): fprefix, fsuffix = str(args_wrapper.file_to_preprocess).rsplit(".", 1) fname = os.path.split(fprefix)[1] # split off leading dirs if outdir is None: - outfile = args_wrapper.file_to_preprocess + outfile = Path(args_wrapper.file_to_preprocess) else: outfile = Path(outdir) / Path(f"{fname}.{fsuffix}") infile = args_wrapper.file_to_preprocess for func in funcs: if not may_overwrite(args_wrapper.no_warn, infile, outfile, func): - parser.exit() + return 1 err += func(infile, outfile) infile = outfile if args_wrapper.forward: @@ -599,20 +606,29 @@ def main_cli(args=None): err += run_vocexcel(locargs) if args_wrapper.docs: infile = Path(infile).with_suffix(".ttl") if outdir is None else outfile - doc_path = infile.parent[0] if outdir is None else outdir + doc_path = infile.parent if outdir is None else outdir err += run_ontospy(infile, doc_path) else: - parser.exit() + print( + "Expected xlsx-file or directory but got: {0}".format( + args_wrapper.file_to_preprocess + ) + ) + return 1 elif args_wrapper and args_wrapper.file_to_preprocess: if os.path.isdir(args_wrapper.file_to_preprocess): - to_build_docs = False dir_ = args_wrapper.file_to_preprocess - if duplicates := has_file_in_more_than_one_format(dir_): + if duplicates := has_file_in_more_than_one_format(dir_): # noqa: WPS332 print( "Files may only be present in one format. 
Found more than one " "format for:\n " + "\n ".join(duplicates) ) - parser.exit() + return 1 + + turtle_files = glob.glob(os.path.join(dir_, "*.ttl")) + glob.glob( + os.path.join(dir_, "*.turtle") + ) + print("\nCalling VocExcel for Excel files") for xlf in glob.glob(os.path.join(dir_, "*.xlsx")): print(f" {xlf}") @@ -628,9 +644,7 @@ err += run_vocexcel(locargs) print("Calling VocExcel for turtle files") - for ttlf in glob.glob(os.path.join(dir_, "*.ttl")) + glob.glob( - os.path.join(dir_, "*.turtle") - ): + for ttlf in turtle_files: print(f" {ttlf}") locargs = list(vocexcel_args) locargs.append(ttlf) @@ -642,19 +656,19 @@ outfile = Path(outdir) / Path(f"{fname}.xlsx") locargs = ["--outputfile", str(outfile)] + locargs err += run_vocexcel(locargs) - to_build_docs = True - if args_wrapper.docs and args_wrapper.forward and to_build_docs: + if args_wrapper.docs and (args_wrapper.forward or turtle_files): infile = args_wrapper.file_to_preprocess - doc_path = outdir if outdir is not None else infile.parent[0] + doc_path = infile if outdir is None else outdir err += run_ontospy(infile, doc_path) elif is_file_available(args_wrapper.file_to_preprocess, ftype=["excel", "rdf"]): print(f"Calling VocExcel for file {args_wrapper.file_to_preprocess}") - err += run_vocexcel(args) + file_to_process = str(args_wrapper.file_to_preprocess) + err += run_vocexcel(vocexcel_args[:] + [file_to_process]) if args_wrapper.docs: infile = Path(args_wrapper.file_to_preprocess).with_suffix(".ttl") - doc_path = outdir if outdir is not None else infile.parent[0] + doc_path = outdir if outdir is not None else infile.parent err += run_ontospy(infile, doc_path) else: if os.path.exists(args_wrapper.file_to_preprocess): endings = ", ".join( + list(RDF_FILE_ENDINGS.keys()) ) print(f"Files for processing must end with one of {endings}.") - parser.exit() + else: + print(f"File not found: {args_wrapper.file_to_preprocess}.") + err = 1 else: - print("\nThis part should not be reached!") + raise AssertionError("This part should never be reached!") return err if __name__ == "__main__": err = main_cli(sys.argv[1:]) - # CI needs to know if an error occurred (failed check or validation error) sys.exit(err) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..f234a14 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,10 @@ +# Common pytest fixtures for all test modules +import pytest + + +@pytest.fixture +def datadir(): + """DATADIR as a pathlib.Path""" + from pathlib import Path + + return Path(__file__).parent / "data" diff --git a/tests/data/README.md b/tests/data/README.md new file mode 100644 index 0000000..5146b50 --- /dev/null +++ b/tests/data/README.md @@ -0,0 +1 @@ +This folder holds xlsx and turtle files used in the tests. diff --git a/tests/data/concept-scheme-with-cycles.ttl b/tests/data/concept-scheme-with-cycles.ttl new file mode 100644 index 0000000..7f35419 --- /dev/null +++ b/tests/data/concept-scheme-with-cycles.ttl @@ -0,0 +1,78 @@ +@prefix cs: . +@prefix dcat: . +@prefix dcterms: . +@prefix owl: . +@prefix rdfs: . +@prefix skos: . +@prefix xsd: . + + a skos:Collection ; + dcterms:identifier "undirected-cycle"^^xsd:token ; + dcterms:isPartOf cs: ; + dcterms:provenance "Prov for unidirected cycle"@en ; + skos:definition "Collection of Term to test unidirected branch-join-cycle"@en ; + skos:member , + , + , + ; + skos:prefLabel "Undirected cycle"@en .
+ + a skos:Concept ; + dcterms:identifier "term2"^^xsd:token ; + dcterms:provenance "Prov for term2"@en ; + rdfs:isDefinedBy ; + skos:altLabel "AltLbl for term2"@en ; + skos:broader ; + skos:definition "def for term2"@en ; + skos:inScheme cs: ; + skos:narrower ; + skos:prefLabel "term2"@en . + + a skos:Concept ; + dcterms:identifier "term3"^^xsd:token ; + dcterms:provenance "Prov for term3"@en ; + rdfs:isDefinedBy ; + skos:altLabel "AltLbl for term3"@en ; + skos:broader ; + skos:definition "def for term3"@en ; + skos:inScheme cs: ; + skos:narrower ; + skos:prefLabel "term3"@en . + + a skos:Concept ; + dcterms:identifier "term4"^^xsd:token ; + dcterms:provenance "Prov for term4"@en ; + rdfs:isDefinedBy ; + skos:altLabel "AltLbl for term4"@en ; + skos:broader , + ; + skos:definition "def for term4"@en ; + skos:inScheme cs: ; + skos:prefLabel "term4"@en . + + a skos:Concept ; + dcterms:identifier "term1"^^xsd:token ; + dcterms:provenance "Prov for term1"@en ; + rdfs:isDefinedBy ; + skos:altLabel "AltLbl for term1"@en ; + skos:definition "def for term1"@en ; + skos:inScheme cs: ; + skos:narrower , + ; + skos:prefLabel "term1"@en ; + skos:topConceptOf cs: . + +cs: a skos:ConceptScheme ; + dcterms:created "2022-12-01"^^xsd:date ; + dcterms:creator ; + dcterms:hasPart ; + dcterms:identifier "test"^^xsd:token ; + dcterms:modified "2022-12-01"^^xsd:date ; + dcterms:provenance "Leibniz-Institut für Katalyse e.V. (LIKAT)"@en ; + dcterms:publisher ; + owl:versionInfo "0.1" ; + skos:definition "A concept scheme for unit testing voc4cat."@en ; + skos:hasTopConcept ; + skos:prefLabel "voc4cat-test-data"@en ; + dcat:contactPoint "David Linke" . + diff --git a/tests/data/concept-scheme-with-cycles.xlsx b/tests/data/concept-scheme-with-cycles.xlsx new file mode 100644 index 0000000..0f31c7f Binary files /dev/null and b/tests/data/concept-scheme-with-cycles.xlsx differ diff --git a/tests/data/concept-scheme-with-cycles_indent-by-dot.xlsx b/tests/data/concept-scheme-with-cycles_indent-by-dot.xlsx new file mode 100644 index 0000000..9145551 Binary files /dev/null and b/tests/data/concept-scheme-with-cycles_indent-by-dot.xlsx differ diff --git a/tests/data/concept-scheme-with-cycles_indent.xlsx b/tests/data/concept-scheme-with-cycles_indent.xlsx new file mode 100644 index 0000000..82b88aa Binary files /dev/null and b/tests/data/concept-scheme-with-cycles_indent.xlsx differ diff --git a/tests/data/concept-scheme-with-cycles_indent_iri.xlsx b/tests/data/concept-scheme-with-cycles_indent_iri.xlsx new file mode 100644 index 0000000..750f9dc Binary files /dev/null and b/tests/data/concept-scheme-with-cycles_indent_iri.xlsx differ diff --git a/tests/test_merge_vocab.py b/tests/test_merge_vocab.py new file mode 100644 index 0000000..0a822e4 --- /dev/null +++ b/tests/test_merge_vocab.py @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- +import shutil +from unittest import mock + +from test_wrapper import CS_CYCLES_TURTLE +from voc4cat.merge_vocab import main_cli + + +def test_main_no_args_entrypoint(monkeypatch, capsys): + monkeypatch.setattr("sys.argv", ["merge_vocab"]) + exit_code = main_cli() + captured = capsys.readouterr() + assert "Usage: " in captured.out + assert exit_code == 1 + + +def test_main_no_args(capsys): + exit_code = main_cli([]) + captured = capsys.readouterr() + assert "Usage: " in captured.out + assert exit_code == 1 + + +def test_main_no_files(capsys): + exit_code = main_cli(["aa", "bb"]) + captured = capsys.readouterr() + assert 'This script requires both folders to exist: "aa" and "bb"' in captured.out 
+ assert exit_code == 1 + + +def test_main_merge_dirs(datadir, tmp_path, capsys): + """Check merge that only copies files.""" + vocab = tmp_path / "vocab" + vocab.mkdir() + ttl_inbox = tmp_path / "ttl_inbox" + ttl_inbox.mkdir() + extra = ttl_inbox / "extra" + extra.mkdir() + shutil.copy(datadir / CS_CYCLES_TURTLE, ttl_inbox / CS_CYCLES_TURTLE) + exit_code = main_cli([str(ttl_inbox), str(vocab)]) + captured = capsys.readouterr() + assert f'Skipping "{extra}"' in captured.out + assert (vocab / CS_CYCLES_TURTLE).exists() + assert exit_code == 0 + + +def test_main_merge_files(datadir, tmp_path, capsys): + """Check merge that merges the content of files.""" + vocab = tmp_path / "vocab" + vocab.mkdir() + ttl_inbox = tmp_path / "ttl_inbox" + ttl_inbox.mkdir() + new = ttl_inbox / CS_CYCLES_TURTLE + shutil.copy(datadir / CS_CYCLES_TURTLE, new) + exists = vocab / CS_CYCLES_TURTLE + shutil.copy(datadir / CS_CYCLES_TURTLE, exists) + exit_code = main_cli([str(ttl_inbox), str(vocab)]) + captured = capsys.readouterr() + assert f"git merge-file --theirs {exists} {exists} {new}" in captured.out + assert exit_code == 0 + + with mock.patch("voc4cat.merge_vocab.subprocess") as subprocess: + subprocess.Popen.return_value.returncode = 1 + exit_code = main_cli([str(ttl_inbox), str(vocab)]) + assert exit_code == 1 diff --git a/tests/test_wrapper.py b/tests/test_wrapper.py new file mode 100644 index 0000000..f085ada --- /dev/null +++ b/tests/test_wrapper.py @@ -0,0 +1,466 @@ +# -*- coding: utf-8 -*- +import os +import shutil +from pathlib import Path + +import pytest +from openpyxl.reader.excel import load_workbook + +from voc4cat.wrapper import main_cli, run_ontospy + +CS_CYCLES = "concept-scheme-with-cycles.xlsx" +CS_CYCLES_TURTLE = "concept-scheme-with-cycles.ttl" +CS_CYCLES_INDENT = "concept-scheme-with-cycles_indent.xlsx" +CS_CYCLES_INDENT_IRI = "concept-scheme-with-cycles_indent_iri.xlsx" +CS_CYCLES_INDENT_DOT = "concept-scheme-with-cycles_indent-by-dot.xlsx" + + +def test_main_no_args_entrypoint(monkeypatch, capsys): + monkeypatch.setattr("sys.argv", ["voc4at"]) + exit_code = main_cli() + captured = capsys.readouterr() + assert "usage: voc4cat" in captured.out + assert exit_code == 0 + + +def test_main_no_args(capsys): + exit_code = main_cli([]) + captured = capsys.readouterr() + assert "usage: voc4cat" in captured.out + assert exit_code == 0 + + +def test_main_version(capsys): + exit_code = main_cli(["--version"]) + captured = capsys.readouterr() + assert captured.out.startswith("voc4cat") + assert exit_code == 0 + + +def test_add_IRI_missing_file(capsys): + # Try to run a command that would overwrite the input file with the output file + exit_code = main_cli(["--add_IRI", "missing.xyz"]) + captured = capsys.readouterr() + assert "Expected xlsx-file or directory but got:" in captured.out + assert exit_code == 1 + + +def test_add_IRI_no_voc_base_iri(datadir, tmp_path): + shutil.copy(datadir / CS_CYCLES, tmp_path) + os.chdir(tmp_path) + # change excel file: Delete vocabulary base IRI + wb = load_workbook(filename=CS_CYCLES) + ws = wb["Concept Scheme"] + ws.cell(row=2, column=2).value = None + new_filename = "no_voc_base_iri.xlsx" + wb.save(new_filename) + wb.close() + + main_cli(["--add_IRI", "--no-warn", str(tmp_path / new_filename)]) + wb = load_workbook(filename=new_filename, read_only=True, data_only=True) + ws = wb["Concept Scheme"] + assert ws.cell(row=2, column=2).value == "https://example.org/" + + +@pytest.mark.parametrize( + "indir, outdir", + [(True, ""), (True, "out"), (False, ""), (False, 
"out")], + ids=[ + "in:dir, out:default", + "in:dir, out:dir", + "in:file, out:default", + "in:file, out:dir", + ], +) +def test_add_IRI_variants(datadir, tmp_path, indir, outdir): + expected = [ + ("ex:test/term1", "term1"), + ("ex:test/term3", "term3"), + ("ex:test/term4", "term4"), + ("ex:test/term2", "term2"), + ("ex:test/term4", "term4"), + ("ex:test/term1", "term1"), + ("ex:test/term2", "term2"), + ] + shutil.copy(datadir / CS_CYCLES_INDENT, tmp_path) + os.chdir(tmp_path) + main_cli( + ["--add_IRI", "--no-warn"] + + (["--output_directory", outdir] if outdir else []) + + ([str(tmp_path)] if indir else [str(tmp_path / CS_CYCLES_INDENT)]) + ) + if outdir: + xlsxfile = tmp_path / outdir / CS_CYCLES_INDENT + else: + xlsxfile = tmp_path / CS_CYCLES_INDENT + wb = load_workbook(filename=xlsxfile, read_only=True, data_only=True) + ws = wb["Concepts"] + for row, expected_row in zip( + ws.iter_rows(min_row=3, max_col=2, values_only=True), expected + ): + assert row == expected_row + + +def test_add_IRI_overwrite_warning(datadir, tmp_path): + shutil.copy(datadir / CS_CYCLES_INDENT, tmp_path / CS_CYCLES_INDENT) + os.chdir(tmp_path) + # Try to run a command that would overwrite the input file with the output file + # a) dir as input: + with pytest.warns( + UserWarning, match='Option "add_IRI" will overwrite the existing file' + ): + main_cli(["--add_IRI", str(tmp_path)]) + # b) file as input + with pytest.warns( + UserWarning, match='Option "add_IRI" will overwrite the existing file' + ): + main_cli(["--add_IRI", str(tmp_path / CS_CYCLES_INDENT)]) + + +@pytest.mark.parametrize( + "xlsxfile, indent", + [(CS_CYCLES_INDENT_IRI, None), (CS_CYCLES_INDENT_DOT, "..")], + ids=["indent:Excel", "indent:dots"], +) +def test_hierarchy_from_indent(datadir, tmp_path, xlsxfile, indent): + # fmt: off + expected = [ # data in children-IRI-representation + ('ex:test/term1', 'term1', 'en', 'def for term1', 'en', 'AltLbl for term1', 'ex:test/term2, ex:test/term3', 'Prov for term1', 'ex:XYZ/term1'), # noqa:E501 + ('ex:test/term2', 'term2', 'en', 'def for term2', 'en', 'AltLbl for term2', 'ex:test/term4', 'Prov for term2', 'ex:XYZ/term2'), # noqa:E501 + ('ex:test/term3', 'term3', 'en', 'def for term3', 'en', 'AltLbl for term3', 'ex:test/term4', 'Prov for term3', 'ex:XYZ/term3'), # noqa:E501 + ('ex:test/term4', 'term4', 'en', 'def for term4', 'en', 'AltLbl for term4', None, 'Prov for term4', 'ex:XYZ/term4'), # noqa:E501 + (None, None, None, None, None, None, None, None, None) + ] + # fmt: on + expected_len = len(expected[0]) + os.chdir(datadir) + main_cli( + ["--hierarchy-from-indent"] + + ( + [ + "--indent-separator", + indent, + ] + if indent + else [] + ) + + [ + "--output_directory", + str(tmp_path), + xlsxfile, + ] + ) + os.chdir(tmp_path) + wb = load_workbook(filename=xlsxfile, read_only=True, data_only=True) + ws = wb["Concepts"] + for row, expected_row in zip(ws.iter_rows(min_row=3, values_only=True), expected): + assert len(row) == expected_len + assert row in expected # We intentionally don't check the row position here! + + +def test_hierarchy_from_indent_merge(datadir, tmp_path): + shutil.copy(datadir / CS_CYCLES_INDENT_IRI, tmp_path) + os.chdir(tmp_path) + # change excel file: Delete vocabulary base IRI + wb = load_workbook(filename=CS_CYCLES_INDENT_IRI) + ws = wb["Concepts"] + ws.cell(row=8, column=4).value = "Contradicting def." 
+ iri = ws.cell(row=8, column=1).value + new_filename = "indent_merge_problem.xlsx" + wb.save(new_filename) + wb.close() + with pytest.raises(ValueError) as excinfo: + main_cli(["--hierarchy-from-indent", "--no-warn", str(tmp_path / new_filename)]) + assert f"Cannot merge rows for {iri}. Resolve differences manually." in str( + excinfo.value # noqa: WPS441 + ) + + +@pytest.mark.parametrize( + "indent", + ["..", None], + ids=["indent:dots", "indent:Excel"], +) +def test_hierarchy_to_indent(datadir, tmp_path, indent): + # fmt: off + expected_rows = [ # data in children-IRI-representation + ('ex:test/term1', 'term1', 'en', 'def for term1', 'en', 'AltLbl for term1', None, 'Prov for term1', 'ex:XYZ/term1'), # noqa:E501 + ('ex:test/term3', '..term3', 'en', 'def for term3', 'en', 'AltLbl for term3', None, 'Prov for term3', 'ex:XYZ/term3'), # noqa:E501 + ('ex:test/term4', '....term4', 'en', 'def for term4', 'en', 'AltLbl for term4', None, 'Prov for term4', 'ex:XYZ/term4'), # noqa:E501 + ('ex:test/term2', 'term2', 'en', 'def for term2', 'en', 'AltLbl for term2', None, 'Prov for term2', 'ex:XYZ/term2'), # noqa:E501 + ('ex:test/term4', '..term4', 'en', None, None, None, None, None, None), + ('ex:test/term1', 'term1', 'en', None, None, None, None, None, None), + ('ex:test/term2', '..term2', 'en', None, None, None, None, None, None), + (None, None, None, None, None, None, None, None, None), + ] + # fmt: on + expected_levels = [0, 1, 2, 0, 1, 0, 1, 0] + assert len(expected_rows) == len(expected_levels) + expected_len = len(expected_rows[0]) + + os.chdir(datadir) + main_cli( + ["--hierarchy-to-indent"] + + ( + [ + "--indent-separator", + indent, + ] + if indent + else [] + ) + + [ + "--output_directory", + str(tmp_path), + CS_CYCLES, + ] + ) + os.chdir(tmp_path) + wb = load_workbook(filename=CS_CYCLES, read_only=True) + ws = wb["Concepts"] + for row, expected_row, expected_level in zip( + ws.iter_rows(min_row=3), expected_rows, expected_levels + ): + assert len(row) == expected_len + if indent is None: # Excel-indent + assert int(row[1].alignment.indent) == expected_level + + for col in range(len(expected_rows)): + if indent is None and col == 1: # Excel-indent + continue + assert row[col].value == expected_row[col] + + +@pytest.mark.parametrize( + "outdir", + [None, "out"], + ids=["no outdir", "with outdir"], +) +def test_outdir_variants(datadir, tmp_path, outdir): + shutil.copy(datadir / CS_CYCLES_INDENT_IRI, tmp_path) + cmd = ["--hierarchy-from-indent"] + if outdir: + cmd.extend(["--output_directory", str(tmp_path / outdir)]) + cmd.append(str(tmp_path / CS_CYCLES_INDENT_IRI)) + # print(f"\n>>> cmd {cmd}") + os.chdir(tmp_path) + main_cli(cmd) + + expected = [ + ("ex:test/term1", "term1"), + ("ex:test/term3", "term3"), + ("ex:test/term4", "term4"), + ("ex:test/term2", "term2"), + (None, None), + ] + if outdir: + xlsxfile = tmp_path / outdir / CS_CYCLES_INDENT_IRI + else: + xlsxfile = tmp_path / CS_CYCLES_INDENT_IRI + wb = load_workbook(filename=xlsxfile, read_only=True, data_only=True) + ws = wb["Concepts"] + for row, expected_row in zip( + ws.iter_rows(min_row=3, max_col=2, values_only=True), expected + ): + assert row == expected_row + + +@pytest.mark.parametrize( + "test_file", + [CS_CYCLES_TURTLE, ""], + ids=["single file", "dir of files"], +) +def test_run_ontospy(datadir, tmp_path, test_file): + """Check that ontospy generates the expected output.""" + dst = tmp_path / test_file + shutil.copy(datadir / CS_CYCLES_TURTLE, tmp_path) + outdir = tmp_path / "ontospy" + # To test the code-path, outdir is 
created automatically here. + main_cli(["--docs", "--output_directory", str(outdir), str(dst)]) + assert (outdir / Path(CS_CYCLES_TURTLE).stem / "dendro" / "index.html").exists() + assert (outdir / Path(CS_CYCLES_TURTLE).stem / "docs" / "index.html").exists() + + +def test_run_ontospy_checks(tmp_path, capsys): + """Check handling of missing dir/file.""" + exit_code = run_ontospy(tmp_path, tmp_path) + captured = capsys.readouterr() + assert exit_code == 1 + assert ( + f"No turtle file(s) found to document with Ontospy in {tmp_path}" + in captured.out + ) + + # def test_run_ontospy_check_file(tmp_path, capsys): + # """Check handling of missing dir/file.""" + exit_code = run_ontospy(tmp_path / CS_CYCLES_TURTLE, tmp_path) + captured = capsys.readouterr() + assert exit_code == 1 + assert f"File/dir not found: {tmp_path/CS_CYCLES_TURTLE}" in captured.out + + +@pytest.mark.parametrize( + "test_file,err,msg", + [ + (CS_CYCLES, 0, "All checks passed successfully."), + ( + CS_CYCLES_INDENT_IRI, + 1, + 'ERROR: Same Concept IRI "ex:test/term1"' + ' used more than once for language "en"', + ), + ], + ids=["no error", "with error"], +) +def test_check(datadir, tmp_path, capsys, test_file, err, msg): + dst = tmp_path / test_file + shutil.copy(datadir / test_file, dst) + exit_code = main_cli(["--check", "--no-warn", str(dst)]) + captured = capsys.readouterr() + # TODO check that erroneous cells get colored. + assert exit_code == err + assert msg in captured.out + + +def test_unsupported_filetype(datadir, capsys): + os.chdir(datadir) + exit_code = main_cli(["README.md"]) + captured = capsys.readouterr() + assert exit_code == 1 + assert "Files for processing must end with" in captured.out + + +def test_nonexisting_file(datadir, capsys): + os.chdir(datadir) + exit_code = main_cli(["missing.txt"]) + captured = capsys.readouterr() + assert exit_code == 1 + assert "File not found:" in captured.out + + +def test_no_separator(datadir): + os.chdir(datadir) + with pytest.raises(ValueError) as excinfo: + main_cli(["--indent-separator", "", CS_CYCLES]) + assert "Setting the indent separator to zero length is not allowed." in str( + excinfo.value # noqa: WPS441 + ) + + +def test_duplicates(datadir, tmp_path, capsys): + """Check that files do not have the same stem.""" + shutil.copy(datadir / CS_CYCLES, tmp_path) + shutil.copy(datadir / CS_CYCLES_TURTLE, tmp_path) + exit_code = main_cli([str(tmp_path)]) + captured = capsys.readouterr() + assert exit_code == 1 + assert "Files may only be present in one format." in captured.out + + +def test_run_vocexcel_badfile(datadir, tmp_path, caplog): + """Check handling of failing run of vocexcel.""" + shutil.copy(datadir / CS_CYCLES_INDENT, tmp_path) + os.chdir(tmp_path) + exit_code = main_cli([CS_CYCLES_INDENT]) + assert exit_code == 1 + # Note: the next message is logged by vocexcel so it may change. + assert "VIOLATION: Validation Result in MinCountConstraintComponent" in caplog.text + + +@pytest.mark.parametrize( + "test_file", + [CS_CYCLES, ""], + ids=["single file", "dir of files"], +) +def test_run_vocexcel(datadir, tmp_path, test_file): + """Check that an xlsx file is converted to ttl by vocexcel.""" + dst = tmp_path / test_file + shutil.copy(datadir / CS_CYCLES, dst) + # We also test if the logging option is passed on to vocexcel.
+ log = tmp_path / "logs" / "test-run.log" + main_cli(["--logfile", str(log), str(dst)]) + expected = (tmp_path / CS_CYCLES).with_suffix(".ttl") + assert expected.exists() + assert log.exists() + + +@pytest.mark.parametrize( + "outputdir,testfile", + [ + ("out", CS_CYCLES), + ("", CS_CYCLES), + ("out", CS_CYCLES_TURTLE), + ("", CS_CYCLES_TURTLE), + ], + ids=["out:dir & xlsx", "out:default & xlsx", "out:dir & ttl", "out:default & ttl"], +) +def test_run_vocexcel_outputdir(datadir, tmp_path, outputdir, testfile): + """Check that an xlsx file is converted to ttl by vocexcel.""" + shutil.copy(datadir / testfile, tmp_path) + os.chdir(tmp_path) + # Check if log is placed in out folder. + log = "test-run.log" + main_cli( + ["--logfile", str(log)] + + (["--output_directory", str(outputdir)] if outputdir else []) + + [str(tmp_path)] + ) + outdir = tmp_path / outputdir + if testfile.endswith("xlsx"): + assert (outdir / testfile).with_suffix(".ttl").exists() + else: + assert (outdir / testfile).with_suffix(".xlsx").exists() + assert (outdir / log).exists() + + +@pytest.mark.parametrize( + "test_file", + [CS_CYCLES, ""], + ids=["single file", "dir of files"], +) +def test_forwarding_3stages(datadir, tmp_path, test_file): + """Check a file by voc4cat then forward it to vocexcel then to ontospy.""" + dst = tmp_path / test_file + shutil.copy(datadir / CS_CYCLES, tmp_path) + os.chdir(tmp_path) + main_cli( + [ + "--check", + "--forward", + "--logfile", + "test.log", + "--no-warn", + "--docs", + str(dst), + ] + ) + assert (tmp_path / CS_CYCLES).with_suffix(".ttl").exists() + assert (tmp_path / Path(CS_CYCLES).stem / "dendro" / "index.html").exists() + assert (tmp_path / Path(CS_CYCLES).stem / "docs" / "index.html").exists() + assert (tmp_path / "test.log").exists() + + +@pytest.mark.parametrize( + "test_file", + [CS_CYCLES, ""], + ids=["single file", "dir of files"], +) +def test_forwarding_2stages(datadir, tmp_path, test_file): + """Use voc4cat to run vocexcel, then forward the result to ontospy.""" + dst = tmp_path / test_file + shutil.copy(datadir / CS_CYCLES, tmp_path) + os.chdir(tmp_path) + main_cli( + [ + "--forward", + "--logfile", + "test.log", + "--no-warn", + "--docs", + str(dst), + ] + ) + assert (tmp_path / CS_CYCLES).with_suffix(".ttl").exists() + assert (tmp_path / Path(CS_CYCLES).stem / "dendro" / "index.html").exists() + assert (tmp_path / Path(CS_CYCLES).stem / "docs" / "index.html").exists() + assert (tmp_path / "test.log").exists()