From fdbdff86df7d41368829269bbe9975c6f37b7aa9 Mon Sep 17 00:00:00 2001 From: Luke Zappia Date: Thu, 6 Feb 2025 11:21:10 +0100 Subject: [PATCH] Update get_dataset_info to new schema --- .../get_dataset_info/config.vsh.yaml | 50 ++++++----- src/reporting/get_dataset_info/script.R | 84 ++++++++++--------- src/reporting/get_method_info/config.vsh.yaml | 6 +- src/reporting/get_metric_info/config.vsh.yaml | 6 +- .../schemas/dataset_info_schema.json | 78 +++++++++++++++++ src/reporting/schemas/method_info_schema.json | 2 +- src/reporting/schemas/metric_info_schema.json | 2 +- 7 files changed, 163 insertions(+), 65 deletions(-) create mode 100644 src/reporting/schemas/dataset_info_schema.json diff --git a/src/reporting/get_dataset_info/config.vsh.yaml b/src/reporting/get_dataset_info/config.vsh.yaml index 0c32dd555..cfdf99205 100644 --- a/src/reporting/get_dataset_info/config.vsh.yaml +++ b/src/reporting/get_dataset_info/config.vsh.yaml @@ -1,35 +1,47 @@ name: get_dataset_info namespace: reporting -description: Extract dataset info and convert to expected format for website results -arguments: - - name: --input - type: file - description: A yaml file - required: true - example: resources_test/openproblems/task_results_v3/raw/dataset_uns.yaml - - name: --output - type: file - direction: output - default: output.json - description: Output json - info: - format: - type: json - # TODO: add schema +description: Convert dataset uns YAML to schema-compliant JSON + +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + description: A YAML file containing dataset uns + required: true + example: resources_test/openproblems/task_results_v4/raw/dataset_uns.yaml + + - name: Outputs + arguments: + - name: --output + type: file + direction: output + default: dataset_info.json + description: Output JSON file matching dataset info schema + info: + format: + type: json + schema: /src/reporting/schemas/dataset_info_schema.json + example: 
resources_test/openproblems/task_results_v4/processed/dataset_info.json + resources: - type: r_script path: script.R + test_resources: - type: python_script path: /common/component_tests/run_and_check_output.py - - path: /resources_test/openproblems/task_results_v3 - dest: resources_test/openproblems/task_results_v3 + - path: /resources_test/openproblems/task_results_v4 + dest: resources_test/openproblems/task_results_v4 + engines: - type: docker image: openproblems/base_r:1.0.0 setup: - type: r - cran: [ purrr, yaml, rlang, processx ] + cran: + - purrr + runners: - type: executable - type: nextflow diff --git a/src/reporting/get_dataset_info/script.R b/src/reporting/get_dataset_info/script.R index 797fdb1ad..0d1cb3ca9 100644 --- a/src/reporting/get_dataset_info/script.R +++ b/src/reporting/get_dataset_info/script.R @@ -1,53 +1,61 @@ -requireNamespace("jsonlite", quietly = TRUE) -requireNamespace("yaml", quietly = TRUE) -library(purrr, warn.conflicts = FALSE) -library(rlang, warn.conflicts = FALSE) - ## VIASH START par <- list( - input = "resources_test/openproblems/task_results_v3/raw/dataset_uns.yaml", - output = "resources_test/openproblems/task_results_v3/processed/dataset_info.json" + input = "resources_test/openproblems/task_results_v4/raw/dataset_uns.yaml", + output = "resources_test/openproblems/task_results_v4/processed/dataset_info.json" ) ## VIASH END -datasets <- yaml::yaml.load_file(par$input) - -# transform into format expected by website -outputs <- map(datasets, function(dataset) { - # ↑ the 'dataset' object could be used as the new format +cat("====== Get dataset info ======\n") - # TODO: it'd be nice if the s3 path was also included in the dataset info +`%||%` <- rlang::`%||%` - # construct v1 format - out <- list( - "dataset_id" = dataset$dataset_id, - "dataset_name" = dataset$dataset_name, - "dataset_summary" = dataset$dataset_summary, - "dataset_description" = dataset$dataset_description %||% NA_character_, - "data_reference" = 
dataset$dataset_reference %||% NA_character_, - "data_url" = dataset$dataset_url %||% NA_character_, - "date_created" = dataset$date_created %||% NA_character_, - "file_size" = dataset$file_size %||% NA_character_ - ) - - if (!is.null(dataset[["common_dataset_id"]])) { - out[["common_dataset_id"]] <- dataset[["common_dataset_id"]] - } +cat("\n>>> Reading input files...\n") +cat("Reading dataset uns from '", par$input, "'...\n", sep = "") +dataset_uns <- yaml::yaml.load_file( + par$input, + # Read file sizes as floats to avoid issues with big integers + handlers = list(int = \(x) {as.numeric(x)}) +) - # show warning when certain data is missing and return null? - for (n in names(out)) { - if (is.null(out[[n]])) { - out_as_str <- jsonlite::toJSON(out, auto_unbox = TRUE, pretty = TRUE) - stop("missing value for value '", n, "' in ", out_as_str) - } +cat( + "\n>>> Processing ", length(dataset_uns), " datasets...\n", + sep = "" +) +dataset_info_json <- purrr::map(dataset_uns, function(.dataset) { + cat("Processing dataset uns '", .dataset$dataset_id, "'\n", sep = "") + + references <- if (is.list(.dataset$dataset_reference)) { + list( + doi = .dataset$dataset_reference$doi %||% character(0), + bibtex = .dataset$dataset_reference$bibtex %||% character(0) + ) + } else { + .dataset$dataset_reference } - out + list( + name = jsonlite::unbox(.dataset$dataset_id), + label = jsonlite::unbox(.dataset$dataset_name), + commit = jsonlite::unbox(.dataset$dataset_commit %||% "missing-sha"), + summary = jsonlite::unbox(.dataset$dataset_summary), + description = jsonlite::unbox(.dataset$dataset_description), + source_url = jsonlite::unbox(.dataset$dataset_url), + common_dataset_names = .dataset$common_dataset_id, + modalities = jsonlite::unbox(.dataset$dataset_modality), + organisms = .dataset$dataset_organism, + references = references, + date_created = jsonlite::unbox(.dataset$date_created), + file_size_mb = jsonlite::unbox(.dataset$file_size / 1048576) + ) }) +cat("\n>>> Writing 
output files...\n") +cat("Writing dataset info to '", par$output, "'...\n", sep = "") jsonlite::write_json( - outputs, + dataset_info_json, par$output, - auto_unbox = TRUE, - pretty = TRUE + pretty = TRUE, + null = "null" ) + +cat("\n>>> Done!\n") diff --git a/src/reporting/get_method_info/config.vsh.yaml b/src/reporting/get_method_info/config.vsh.yaml index 12c0c928b..980e16402 100644 --- a/src/reporting/get_method_info/config.vsh.yaml +++ b/src/reporting/get_method_info/config.vsh.yaml @@ -11,7 +11,7 @@ argument_groups: required: true example: resources_test/openproblems/task_results_v4/raw/method_configs.yaml - - name: Inputs + - name: Outputs arguments: - name: --output type: file @@ -31,8 +31,8 @@ resources: test_resources: - type: python_script path: /common/component_tests/run_and_check_output.py - - path: /resources_test/openproblems/task_results_v3 - dest: resources_test/openproblems/task_results_v3 + - path: /resources_test/openproblems/task_results_v4 + dest: resources_test/openproblems/task_results_v4 engines: - type: docker diff --git a/src/reporting/get_metric_info/config.vsh.yaml b/src/reporting/get_metric_info/config.vsh.yaml index 107f25679..ecde1607a 100644 --- a/src/reporting/get_metric_info/config.vsh.yaml +++ b/src/reporting/get_metric_info/config.vsh.yaml @@ -11,7 +11,7 @@ argument_groups: required: true example: resources_test/openproblems/task_results_v4/raw/metric_configs.yaml - - name: Inputs + - name: Outputs arguments: - name: --output type: file @@ -31,8 +31,8 @@ resources: test_resources: - type: python_script path: /common/component_tests/run_and_check_output.py - - path: /resources_test/openproblems/task_results_v3 - dest: resources_test/openproblems/task_results_v3 + - path: /resources_test/openproblems/task_results_v4 + dest: resources_test/openproblems/task_results_v4 engines: - type: docker diff --git a/src/reporting/schemas/dataset_info_schema.json b/src/reporting/schemas/dataset_info_schema.json new file mode 100644 index 
000000000..b937eb3b0 --- /dev/null +++ b/src/reporting/schemas/dataset_info_schema.json @@ -0,0 +1,78 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://openproblems.bio/schemas/dataset_info_schema.json", + "title": "Dataset info", + "description": "Information about datasets in a task", + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "description": "Dataset identifier", + "type": "string" + }, + "label": { + "description": "Human-readable label for the dataset", + "type": "string" + }, + "commit": { + "description": "Git commit SHA for this version of the dataset", + "type": "string" + }, + "summary": { + "description": "One line summary of the dataset", + "type": "string" + }, + "description": { + "description": "Detailed description of the dataset", + "type": "string" + }, + "source_url": { + "description": "URL to the original source of the dataset", + "type": "string" + }, + "common_dataset_names": { + "description": "Identifier(s) for the common dataset(s) this dataset was derived from. 
Only if the dataset_name is different.", + "type": ["null", "array"], + "items": { + "type": "string" + } + }, + "modalities": { + "description": "Modalities of the dataset", + "type": ["null", "array"], + "items": { + "type": "string" + } + }, + "organisms": { + "description": "Organisms in the dataset", + "type": "array", + "items": { + "type": "string" + } + }, + "references": { + "$ref": "https://openproblems.bio/schemas/references_schema.json" + }, + "file_size_mb": { + "description": "Size of the dataset in MB", + "type": "number" + } + }, + "required": [ + "name", + "label", + "commit", + "summary", + "description", + "source_url", + "common_dataset_names", + "modalities", + "organisms", + "references", + "date_created", + "file_size_mb" + ] + } +} diff --git a/src/reporting/schemas/method_info_schema.json b/src/reporting/schemas/method_info_schema.json index cf470f47b..64eede7c1 100644 --- a/src/reporting/schemas/method_info_schema.json +++ b/src/reporting/schemas/method_info_schema.json @@ -1,6 +1,6 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", - "$id": "https://openproblems.bio/schemas/task_info_schema.json", + "$id": "https://openproblems.bio/schemas/method_info_schema.json", "title": "Method info", "description": "Information about methods in a task", "type": "array", diff --git a/src/reporting/schemas/metric_info_schema.json b/src/reporting/schemas/metric_info_schema.json index 6171a8abe..7ce6a4ba9 100644 --- a/src/reporting/schemas/metric_info_schema.json +++ b/src/reporting/schemas/metric_info_schema.json @@ -1,6 +1,6 @@ { "$schema": "https://json-schema.org/draft/2020-12/schema", - "$id": "https://openproblems.bio/schemas/task_info_schema.json", + "$id": "https://openproblems.bio/schemas/metric_info_schema.json", "title": "Metric info", "description": "Information about metrics in a task", "type": "array",