-
Notifications
You must be signed in to change notification settings - Fork 82
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Update get_dataset_info to new schema
- Loading branch information
Showing
7 changed files
with
163 additions
and
65 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,53 +1,61 @@ | ||
requireNamespace("jsonlite", quietly = TRUE) | ||
requireNamespace("yaml", quietly = TRUE) | ||
library(purrr, warn.conflicts = FALSE) | ||
library(rlang, warn.conflicts = FALSE) | ||
|
||
## VIASH START | ||
par <- list( | ||
input = "resources_test/openproblems/task_results_v3/raw/dataset_uns.yaml", | ||
output = "resources_test/openproblems/task_results_v3/processed/dataset_info.json" | ||
input = "resources_test/openproblems/task_results_v4/raw/dataset_uns.yaml", | ||
output = "resources_test/openproblems/task_results_v4/processed/dataset_info.json" | ||
) | ||
## VIASH END | ||
|
||
datasets <- yaml::yaml.load_file(par$input) | ||
|
||
# transform into format expected by website | ||
outputs <- map(datasets, function(dataset) { | ||
# ↑ the 'dataset' object could be used as the new format | ||
cat("====== Get dataset info ======\n") | ||
|
||
# TODO: it'd be nice if the s3 path was also included in the dataset info | ||
`%||%` <- rlang::`%||%` | ||
|
||
# construct v1 format | ||
out <- list( | ||
"dataset_id" = dataset$dataset_id, | ||
"dataset_name" = dataset$dataset_name, | ||
"dataset_summary" = dataset$dataset_summary, | ||
"dataset_description" = dataset$dataset_description %||% NA_character_, | ||
"data_reference" = dataset$dataset_reference %||% NA_character_, | ||
"data_url" = dataset$dataset_url %||% NA_character_, | ||
"date_created" = dataset$date_created %||% NA_character_, | ||
"file_size" = dataset$file_size %||% NA_character_ | ||
) | ||
|
||
if (!is.null(dataset[["common_dataset_id"]])) { | ||
out[["common_dataset_id"]] <- dataset[["common_dataset_id"]] | ||
} | ||
cat("\n>>> Reading input files...\n") | ||
cat("Reading dataset uns from '", par$input, "'...\n", sep = "") | ||
dataset_uns <- yaml::yaml.load_file( | ||
par$input, | ||
# Read file sizes as floats to avoid issues with big integers | ||
handlers = list(int = \(x) {as.numeric(x)}) | ||
) | ||
|
||
# show warning when certain data is missing and return null? | ||
for (n in names(out)) { | ||
if (is.null(out[[n]])) { | ||
out_as_str <- jsonlite::toJSON(out, auto_unbox = TRUE, pretty = TRUE) | ||
stop("missing value for value '", n, "' in ", out_as_str) | ||
} | ||
cat( | ||
"\n>>> Processing ", length(dataset_uns), " datasets...\n", | ||
sep = "" | ||
) | ||
dataset_info_json <- purrr::map(dataset_uns, function(.dataset) { | ||
cat("Processing dataset uns '", .dataset$dataset_id, "'\n", sep = "") | ||
|
||
references <- if (is.list(.dataset$dataset_reference)) { | ||
list( | ||
doi = .dataset$dataset_reference$doi %||% character(0), | ||
bibtex = .dataset$dataset_reference$bibtex %||% character(0) | ||
) | ||
} else { | ||
.dataset$dataset_reference | ||
} | ||
|
||
out | ||
list( | ||
name = jsonlite::unbox(.dataset$dataset_id), | ||
label = jsonlite::unbox(.dataset$dataset_name), | ||
commit = jsonlite::unbox(.dataset$dataset_commit %||% "missing-sha"), | ||
summary = jsonlite::unbox(.dataset$dataset_summary), | ||
description = jsonlite::unbox(.dataset$dataset_description), | ||
source_url = jsonlite::unbox(.dataset$dataset_url), | ||
common_dataset_names = .dataset$common_dataset_id, | ||
modalities = jsonlite::unbox(.dataset$dataset_modality), | ||
organisms = .dataset$dataset_organism, | ||
references = references, | ||
date_created = jsonlite::unbox(.dataset$date_created), | ||
file_size_mb = jsonlite::unbox(.dataset$file_size / 1048576) | ||
) | ||
}) | ||
|
||
cat("\n>>> Writing output files...\n") | ||
cat("Writing dataset info to '", par$output, "'...\n", sep = "") | ||
jsonlite::write_json( | ||
outputs, | ||
dataset_info_json, | ||
par$output, | ||
auto_unbox = TRUE, | ||
pretty = TRUE | ||
pretty = TRUE, | ||
null = "null" | ||
) | ||
|
||
cat("\n>>> Done!\n") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
{ | ||
"$schema": "https://json-schema.org/draft/2020-12/schema", | ||
"$id": "https://openproblems.bio/schemas/dataset_info_schema.json", | ||
"title": "Dataset info", | ||
"description": "Information about datasets in a task", | ||
"type": "array", | ||
"items": { | ||
"type": "object", | ||
"properties": { | ||
"name": { | ||
"description": "Dataset identifier", | ||
"type": "string" | ||
}, | ||
"label": { | ||
"description": "Human-readable label for the dataset", | ||
"type": "string" | ||
}, | ||
"commit": { | ||
"description": "Git commit SHA for this version of the dataset", | ||
"type": "string" | ||
}, | ||
"summary": { | ||
"description": "One line summary of the dataset", | ||
"type": "string" | ||
}, | ||
"description": { | ||
"description": "Detailed description of the dataset", | ||
"type": "string" | ||
}, | ||
"source_url": { | ||
"description": "URL to the original source of the dataset", | ||
"type": "string" | ||
}, | ||
"common_dataset_names": { | ||
"description": "Identifier(s) for the common dataset(s) this dataset was derived from. Only if the dataset_name is different.", | ||
"type": ["null", "array"], | ||
"items": { | ||
"type": "string" | ||
} | ||
}, | ||
"modalities": { | ||
"description": "Modalities of the dataset", | ||
"type": ["null", "array"], | ||
"items": { | ||
"type": "string" | ||
} | ||
}, | ||
"organisms": { | ||
"description": "Organisms in the dataset", | ||
"type": "array", | ||
"items": { | ||
"type": "string" | ||
} | ||
}, | ||
"references": { | ||
"$ref": "https://openproblems.bio/schemas/references_schema.json" | ||
}, | ||
"file_size_mb": { | ||
"description": "Size of the dataset in MB", | ||
"type": "number" | ||
} | ||
}, | ||
"required": [ | ||
"name", | ||
"label", | ||
"commit", | ||
"summary", | ||
"description", | ||
"source_url", | ||
"common_dataset_names", | ||
"modalities", | ||
"organisms", | ||
"references", | ||
"date_created", | ||
"file_size_mb" | ||
] | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters