From fcfa2422249faf39e346d4d0f0df1692c996212e Mon Sep 17 00:00:00 2001 From: Chris Markiewicz Date: Thu, 8 Feb 2024 15:52:48 -0500 Subject: [PATCH 1/7] ENH: Create a pre-receive hook that accepts ignore rules and file listing --- tools/schemacode/bidsschematools/__main__.py | 62 ++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/tools/schemacode/bidsschematools/__main__.py b/tools/schemacode/bidsschematools/__main__.py index c8d554cf0a..e7796f3441 100644 --- a/tools/schemacode/bidsschematools/__main__.py +++ b/tools/schemacode/bidsschematools/__main__.py @@ -1,9 +1,14 @@ import logging import os +import re +import sys +from itertools import chain import click +from .rules import regexify_filename_rules from .schema import export_schema, load_schema +from .validator import _bidsignore_check @click.group() @@ -32,5 +37,62 @@ def export(ctx, schema, output): fobj.write(text) +@cli.command("pre-receive-hook") +@click.option("--schema", "-s", type=click.Path(), help="Path to the BIDS schema") +@click.option( + "--input", "-i", "input_", default="-", type=click.Path(), help="Input file (default: stdin)" +) +@click.option( + "--output", + "-o", + "output", + default="-", + type=click.Path(), + help="Output file (default: stdout)", +) +def pre_receive_hook(schema, input_, output): + """Validate filenames from a list of files against the BIDS schema + + The input should be a list of ignore patterns followed by a line containing + "0001" and then a list of filenames. The output will be a list of filenames + that do not match the schema. + + This is intended to be used in a git pre-receive hook. 
+ """ + # Slurp inputs for now; we can think about streaming later + if input_ == "-": + lines = sys.stdin.readlines() + else: + with open(input_) as fobj: + lines = fobj.readlines() + + split = lines.index("0001\n") + ignore = [line.rstrip() for line in lines[:split]] + filenames = [line.rstrip() for line in lines[split + 1 :]] + + schema = load_schema(schema) + all_rules = chain.from_iterable( + regexify_filename_rules(group, schema, level=2) + for group in (schema.rules.files.common, schema.rules.files.raw) + ) + regexes = [rule["regex"] for rule in all_rules] + # XXX Hack for phenotype files - this can be removed once we + # have a schema definition for them + regexes.append(r"phenotype/.*\.tsv") + + output = sys.stdout if output == "-" else open(output, "w") + + rc = 0 + with output: + for filename in filenames: + if any(_bidsignore_check(pattern, filename, "") for pattern in ignore): + continue + if not any(re.match(regex, filename) for regex in regexes): + output.write(f"{filename}\n") + rc = 1 + + sys.exit(rc) + + if __name__ == "__main__": cli() From 818a7cdab90c147e182c006e46bf27daa5b88aae Mon Sep 17 00:00:00 2001 From: Chris Markiewicz Date: Thu, 8 Feb 2024 20:00:58 -0500 Subject: [PATCH 2/7] FIX: Use consistent logger, set sensible default level, enable -q --- tools/schemacode/bidsschematools/__main__.py | 6 ++++-- tools/schemacode/bidsschematools/utils.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/schemacode/bidsschematools/__main__.py b/tools/schemacode/bidsschematools/__main__.py index e7796f3441..a7594a5379 100644 --- a/tools/schemacode/bidsschematools/__main__.py +++ b/tools/schemacode/bidsschematools/__main__.py @@ -13,9 +13,11 @@ @click.group() @click.option("-v", "--verbose", count=True) -def cli(verbose): +@click.option("-q", "--quiet", count=True) +def cli(verbose, quiet): """BIDS Schema Tools""" - logging.getLogger("bidsschematools").setLevel(logging.INFO - verbose * 10) + verbose = verbose - quiet + 
logging.getLogger("bidsschematools").setLevel(logging.WARNING - verbose * 10) @cli.command() diff --git a/tools/schemacode/bidsschematools/utils.py b/tools/schemacode/bidsschematools/utils.py index ec2947fb37..6f9850edb0 100644 --- a/tools/schemacode/bidsschematools/utils.py +++ b/tools/schemacode/bidsschematools/utils.py @@ -29,7 +29,7 @@ def get_logger(name=None): logging.Logger logger object. """ - return logging.getLogger("bids-schema" + (".%s" % name if name else "")) + return logging.getLogger("bidsschematools" + (".%s" % name if name else "")) def set_logger_level(lgr, level): From 28e3ae29ffb191679bae56f47ce4ab54a99999d5 Mon Sep 17 00:00:00 2001 From: Chris Markiewicz Date: Wed, 28 Feb 2024 19:43:28 -0500 Subject: [PATCH 3/7] ENH: Determine dataset type from bundled dataset_description.json --- tools/schemacode/bidsschematools/__main__.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/tools/schemacode/bidsschematools/__main__.py b/tools/schemacode/bidsschematools/__main__.py index a7594a5379..be23877afc 100644 --- a/tools/schemacode/bidsschematools/__main__.py +++ b/tools/schemacode/bidsschematools/__main__.py @@ -1,3 +1,4 @@ +import json import logging import os import re @@ -69,14 +70,30 @@ def pre_receive_hook(schema, input_, output): lines = fobj.readlines() split = lines.index("0001\n") - ignore = [line.rstrip() for line in lines[:split]] + preamble = [line.rstrip() for line in lines[:split]] filenames = [line.rstrip() for line in lines[split + 1 :]] + try: + split = preamble.index("0000") + except ValueError: + description = {} + ignore = preamble + else: + description = json.loads("".join(preamble[:split])) + ignore = preamble[split + 1 :] + + dataset_type = description.get("DatasetType", "raw") schema = load_schema(schema) all_rules = chain.from_iterable( regexify_filename_rules(group, schema, level=2) for group in (schema.rules.files.common, schema.rules.files.raw) ) + if dataset_type == "derivative": + 
all_rules = chain( + all_rules, + regexify_filename_rules(schema.rules.files.derivatives, schema, level=2), + ) + regexes = [rule["regex"] for rule in all_rules] # XXX Hack for phenotype files - this can be removed once we # have a schema definition for them From 017754390b6ec7b30732b577b87eafbf5e92440d Mon Sep 17 00:00:00 2001 From: Chris Markiewicz Date: Fri, 1 Mar 2024 12:12:42 -0500 Subject: [PATCH 4/7] RF: Stream filename validation, rewrite protocol The 0001 is a particular git-protocol-ism, and 0000 does not behave as I expected. Instead of adding multiple 0001s and attempting to identify the meaning, the new protocol precedes the old protocol with a header line and a single JSON line. --- tools/schemacode/bidsschematools/__main__.py | 63 ++++++++++++++------ 1 file changed, 46 insertions(+), 17 deletions(-) diff --git a/tools/schemacode/bidsschematools/__main__.py b/tools/schemacode/bidsschematools/__main__.py index be23877afc..1cea65a894 100644 --- a/tools/schemacode/bidsschematools/__main__.py +++ b/tools/schemacode/bidsschematools/__main__.py @@ -62,26 +62,42 @@ def pre_receive_hook(schema, input_, output): This is intended to be used in a git pre-receive hook. 
""" + logger = logging.getLogger("bidsschematools") + schema = load_schema(schema) + # Slurp inputs for now; we can think about streaming later if input_ == "-": - lines = sys.stdin.readlines() + stream = sys.stdin else: - with open(input_) as fobj: - lines = fobj.readlines() - - split = lines.index("0001\n") - preamble = [line.rstrip() for line in lines[:split]] - filenames = [line.rstrip() for line in lines[split + 1 :]] - try: - split = preamble.index("0000") - except ValueError: - description = {} - ignore = preamble + stream = open(input_) + + first_line = next(stream) + if first_line == "bids-hook-v2\n": + # V2 format: header line, description JSON, followed by legacy format + description_str = next(stream) + fail = False + try: + description: dict = json.loads(description_str) + except json.JSONDecodeError: + fail = True + if fail or not isinstance(description, dict): + logger.critical("Protocol error: invalid JSON in description") + logger.critical( + "Dataset description must be one JSON object, written to a single line" + ) + logger.critical("Received: %s", description_str) + stream.close() + sys.exit(2) else: - description = json.loads("".join(preamble[:split])) - ignore = preamble[split + 1 :] + # Legacy: ignore patterns, followed by "0001", followed by filenames + stream = chain([first_line], stream) + description = {} dataset_type = description.get("DatasetType", "raw") + logger.info("Dataset type: %s", dataset_type) + + ignore = [line.strip() for line in stream if line != "0001\n"] + logger.info("Ignore patterns found: %d", len(ignore)) schema = load_schema(schema) all_rules = chain.from_iterable( @@ -97,19 +113,32 @@ def pre_receive_hook(schema, input_, output): regexes = [rule["regex"] for rule in all_rules] # XXX Hack for phenotype files - this can be removed once we # have a schema definition for them - regexes.append(r"phenotype/.*\.tsv") + regexes.append(r"phenotype/.*\.(tsv|json)") output = sys.stdout if output == "-" else open(output, "w") rc 
= 0 + any_files = False + valid_files = 0 with output: - for filename in filenames: + for filename in stream: + if not any_files: + logger.debug("Validating files, first file: %s", filename) + any_files = True + filename = filename.strip() if any(_bidsignore_check(pattern, filename, "") for pattern in ignore): continue if not any(re.match(regex, filename) for regex in regexes): - output.write(f"{filename}\n") + print(filename, file=output) rc = 1 + else: + valid_files += 1 + + if valid_files == 0: + logger.error("No files to validate") + rc = 2 + stream.close() sys.exit(rc) From ec42201656995fe2a546a1e03db7854bb975af11 Mon Sep 17 00:00:00 2001 From: Chris Markiewicz Date: Sat, 23 Mar 2024 14:13:49 -0400 Subject: [PATCH 5/7] DOC: Update docstring --- tools/schemacode/bidsschematools/__main__.py | 27 +++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/tools/schemacode/bidsschematools/__main__.py b/tools/schemacode/bidsschematools/__main__.py index 1cea65a894..9a35a2782a 100644 --- a/tools/schemacode/bidsschematools/__main__.py +++ b/tools/schemacode/bidsschematools/__main__.py @@ -56,9 +56,30 @@ def export(ctx, schema, output): def pre_receive_hook(schema, input_, output): """Validate filenames from a list of files against the BIDS schema - The input should be a list of ignore patterns followed by a line containing - "0001" and then a list of filenames. The output will be a list of filenames - that do not match the schema. + The expected input takes the following form: + + ``` + bids-hook-v2 + {"Name": "My dataset", "BIDSVersion": "1.9.0", "DatasetType": "raw"} + ignore-pattern1 + ... + ignore-patternN + 0001 + .datalad/config + .gitattributes + CHANGES + README + dataset_description.json + participants.tsv + sub-01/anat/sub-01_T1w.nii.gz + ... + ``` + + The header identifies the protocol version. For protocol ``bids-hook-v2``, + the second line MUST be the dataset_description.json file, with any newlines removed. 
+ The following lines, up to the line containing "0001", are ignore patterns + from the .bidsignore file. The lines following "0001" are the filenames to + be validated. This is intended to be used in a git pre-receive hook. """ From 6389bdcfc63ce435a1c1a35fc0aec8cacfcf97bf Mon Sep 17 00:00:00 2001 From: Chris Markiewicz Date: Wed, 13 Nov 2024 15:32:49 -0500 Subject: [PATCH 6/7] Apply suggestions from code review Co-authored-by: Nell Hardcastle --- tools/schemacode/bidsschematools/__main__.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/schemacode/bidsschematools/__main__.py b/tools/schemacode/bidsschematools/__main__.py index 9a35a2782a..f53fa31997 100644 --- a/tools/schemacode/bidsschematools/__main__.py +++ b/tools/schemacode/bidsschematools/__main__.py @@ -117,10 +117,13 @@ def pre_receive_hook(schema, input_, output): dataset_type = description.get("DatasetType", "raw") logger.info("Dataset type: %s", dataset_type) - ignore = [line.strip() for line in stream if line != "0001\n"] + ignore = [] + for line in stream: + if line == "0001\n": + break + ignore.append(line.strip()) logger.info("Ignore patterns found: %d", len(ignore)) - schema = load_schema(schema) all_rules = chain.from_iterable( regexify_filename_rules(group, schema, level=2) for group in (schema.rules.files.common, schema.rules.files.raw) From a31cecaf7b700ccc2097778f764abe2803218b59 Mon Sep 17 00:00:00 2001 From: Chris Markiewicz Date: Wed, 13 Nov 2024 15:36:00 -0500 Subject: [PATCH 7/7] Update tools/schemacode/bidsschematools/__main__.py --- tools/schemacode/bidsschematools/__main__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/schemacode/bidsschematools/__main__.py b/tools/schemacode/bidsschematools/__main__.py index 1cd508b596..777bea2d87 100644 --- a/tools/schemacode/bidsschematools/__main__.py +++ b/tools/schemacode/bidsschematools/__main__.py @@ -1,6 +1,7 @@ import json import logging import os +import re import sys from itertools import chain