Skip to content

Commit

Permalink
Update get_dataset_info to new schema
Browse files Browse the repository at this point in the history
  • Loading branch information
lazappi committed Feb 6, 2025
1 parent e3439c9 commit fdbdff8
Show file tree
Hide file tree
Showing 7 changed files with 163 additions and 65 deletions.
50 changes: 31 additions & 19 deletions src/reporting/get_dataset_info/config.vsh.yaml
Original file line number Diff line number Diff line change
@@ -1,35 +1,47 @@
name: get_dataset_info
namespace: reporting
description: Extract dataset info and convert to expected format for website results
arguments:
- name: --input
type: file
description: A yaml file
required: true
example: resources_test/openproblems/task_results_v3/raw/dataset_uns.yaml
- name: --output
type: file
direction: output
default: output.json
description: Output json
info:
format:
type: json
# TODO: add schema
description: Convert dataset uns YAML to schema-compliant JSON

argument_groups:
- name: Inputs
arguments:
- name: --input
type: file
description: A YAML file containing dataset uns
required: true
example: resources_test/openproblems/task_results_v4/raw/dataset_uns.yaml

- name: Outputs
arguments:
- name: --output
type: file
direction: output
default: dataset_info.json
description: Output JSON file matching dataset info schema
info:
format:
type: json
schema: /src/reporting/schemas/dataset_info_schema.json
example: resources_test/openproblems/task_results_v4/processed/dataset_info.json

resources:
- type: r_script
path: script.R

test_resources:
- type: python_script
path: /common/component_tests/run_and_check_output.py
- path: /resources_test/openproblems/task_results_v3
dest: resources_test/openproblems/task_results_v3
- path: /resources_test/openproblems/task_results_v4
dest: resources_test/openproblems/task_results_v4

engines:
- type: docker
image: openproblems/base_r:1.0.0
setup:
- type: r
cran: [ purrr, yaml, rlang, processx ]
cran:
- purrr

runners:
- type: executable
- type: nextflow
Expand Down
84 changes: 46 additions & 38 deletions src/reporting/get_dataset_info/script.R
Original file line number Diff line number Diff line change
@@ -1,53 +1,61 @@
requireNamespace("jsonlite", quietly = TRUE)
requireNamespace("yaml", quietly = TRUE)
library(purrr, warn.conflicts = FALSE)
library(rlang, warn.conflicts = FALSE)

## VIASH START
par <- list(
input = "resources_test/openproblems/task_results_v3/raw/dataset_uns.yaml",
output = "resources_test/openproblems/task_results_v3/processed/dataset_info.json"
input = "resources_test/openproblems/task_results_v4/raw/dataset_uns.yaml",
output = "resources_test/openproblems/task_results_v4/processed/dataset_info.json"
)
## VIASH END

datasets <- yaml::yaml.load_file(par$input)

# transform into format expected by website
outputs <- map(datasets, function(dataset) {
# ↑ the 'dataset' object could be used as the new format
cat("====== Get dataset info ======\n")

# TODO: it'd be nice if the s3 path was also included in the dataset info
`%||%` <- rlang::`%||%`

# construct v1 format
out <- list(
"dataset_id" = dataset$dataset_id,
"dataset_name" = dataset$dataset_name,
"dataset_summary" = dataset$dataset_summary,
"dataset_description" = dataset$dataset_description %||% NA_character_,
"data_reference" = dataset$dataset_reference %||% NA_character_,
"data_url" = dataset$dataset_url %||% NA_character_,
"date_created" = dataset$date_created %||% NA_character_,
"file_size" = dataset$file_size %||% NA_character_
)

if (!is.null(dataset[["common_dataset_id"]])) {
out[["common_dataset_id"]] <- dataset[["common_dataset_id"]]
}
cat("\n>>> Reading input files...\n")
cat("Reading dataset uns from '", par$input, "'...\n", sep = "")
dataset_uns <- yaml::yaml.load_file(
par$input,
# Read file sizes as floats to avoid issues with big integers
handlers = list(int = \(x) {as.numeric(x)})
)

# show warning when certain data is missing and return null?
for (n in names(out)) {
if (is.null(out[[n]])) {
out_as_str <- jsonlite::toJSON(out, auto_unbox = TRUE, pretty = TRUE)
stop("missing value for value '", n, "' in ", out_as_str)
}
cat(
"\n>>> Processing ", length(dataset_uns), " datasets...\n",
sep = ""
)
dataset_info_json <- purrr::map(dataset_uns, function(.dataset) {
cat("Processing dataset uns '", .dataset$dataset_id, "'\n", sep = "")

references <- if (is.list(.dataset$dataset_reference)) {
list(
doi = .dataset$dataset_reference$doi %||% character(0),
bibtex = .dataset$dataset_reference$bibtex %||% character(0)
)
} else {
.dataset$dataset_reference
}

out
list(
name = jsonlite::unbox(.dataset$dataset_id),
label = jsonlite::unbox(.dataset$dataset_name),
commit = jsonlite::unbox(.dataset$dataset_commit %||% "missing-sha"),
summary = jsonlite::unbox(.dataset$dataset_summary),
description = jsonlite::unbox(.dataset$dataset_description),
source_url = jsonlite::unbox(.dataset$dataset_url),
common_dataset_names = .dataset$common_dataset_id,
modalities = jsonlite::unbox(.dataset$dataset_modality),
organisms = .dataset$dataset_organism,
references = references,
date_created = jsonlite::unbox(.dataset$date_created),
file_size_mb = jsonlite::unbox(.dataset$file_size / 1048576)
)
})

cat("\n>>> Writing output files...\n")
cat("Writing dataset info to '", par$output, "'...\n", sep = "")
jsonlite::write_json(
outputs,
dataset_info_json,
par$output,
auto_unbox = TRUE,
pretty = TRUE
pretty = TRUE,
null = "null"
)

cat("\n>>> Done!\n")
6 changes: 3 additions & 3 deletions src/reporting/get_method_info/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ argument_groups:
required: true
example: resources_test/openproblems/task_results_v4/raw/method_configs.yaml

- name: Inputs
- name: Outputs
arguments:
- name: --output
type: file
Expand All @@ -31,8 +31,8 @@ resources:
test_resources:
- type: python_script
path: /common/component_tests/run_and_check_output.py
- path: /resources_test/openproblems/task_results_v3
dest: resources_test/openproblems/task_results_v3
- path: /resources_test/openproblems/task_results_v4
dest: resources_test/openproblems/task_results_v4

engines:
- type: docker
Expand Down
6 changes: 3 additions & 3 deletions src/reporting/get_metric_info/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ argument_groups:
required: true
example: resources_test/openproblems/task_results_v4/raw/metric_configs.yaml

- name: Inputs
- name: Outputs
arguments:
- name: --output
type: file
Expand All @@ -31,8 +31,8 @@ resources:
test_resources:
- type: python_script
path: /common/component_tests/run_and_check_output.py
- path: /resources_test/openproblems/task_results_v3
dest: resources_test/openproblems/task_results_v3
- path: /resources_test/openproblems/task_results_v4
dest: resources_test/openproblems/task_results_v4

engines:
- type: docker
Expand Down
78 changes: 78 additions & 0 deletions src/reporting/schemas/dataset_info_schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://openproblems.bio/schemas/dataset_info_schema.json",
"title": "Dataset info",
"description": "Information about datasets in a task",
"type": "array",
"items": {
"type": "object",
"properties": {
"name": {
"description": "Dataset identifier",
"type": "string"
},
"label": {
"description": "Human-readable label for the dataset",
"type": "string"
},
"commit": {
"description": "Git commit SHA for this version of the dataset",
"type": "string"
},
"summary": {
"description": "One line summary of the dataset",
"type": "string"
},
"description": {
"description": "Detailed description of the dataset",
"type": "string"
},
"source_url": {
"description": "URL to the original source of the dataset",
"type": "string"
},
"common_dataset_names": {
"description": "Identifier(s) for the common dataset(s) this dataset was derived from. Only set when different from this dataset's name.",
"type": ["null", "array"],
"items": {
"type": "string"
}
},
"modalities": {
"description": "Modalities of the dataset",
"type": ["null", "array"],
"items": {
"type": "string"
}
},
"organisms": {
"description": "Organisms in the dataset",
"type": "array",
"items": {
"type": "string"
}
},
"references": {
"$ref": "https://openproblems.bio/schemas/references_schema.json"
},
"file_size_mb": {
"description": "Size of the dataset in MB",
"type": "number"
}
},
"required": [
"name",
"label",
"commit",
"summary",
"description",
"source_url",
"common_dataset_names",
"modalities",
"organisms",
"references",
"date_created",
"file_size_mb"
]
}
}
2 changes: 1 addition & 1 deletion src/reporting/schemas/method_info_schema.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://openproblems.bio/schemas/task_info_schema.json",
"$id": "https://openproblems.bio/schemas/method_info_schema.json",
"title": "Method info",
"description": "Information about methods in a task",
"type": "array",
Expand Down
2 changes: 1 addition & 1 deletion src/reporting/schemas/metric_info_schema.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"$schema": "https://json-schema.org/draft/2020-12/schema",
"$id": "https://openproblems.bio/schemas/task_info_schema.json",
"$id": "https://openproblems.bio/schemas/metric_info_schema.json",
"title": "Metric info",
"description": "Information about metrics in a task",
"type": "array",
Expand Down

0 comments on commit fdbdff8

Please sign in to comment.