Skip to content

Commit

Permalink
Optimize output of ParseHCA
Browse files Browse the repository at this point in the history
  • Loading branch information
showteeth committed May 16, 2024
1 parent 86bf0f1 commit bcfc5af
Show file tree
Hide file tree
Showing 5 changed files with 26 additions and 5 deletions.
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ importFrom(SummarizedExperiment,assayNames)
importFrom(curl,curl_fetch_memory)
importFrom(data.table,fread)
importFrom(data.table,rbindlist)
importFrom(dplyr,any_of)
importFrom(dplyr,arrange)
importFrom(dplyr,desc)
importFrom(dplyr,distinct)
Expand All @@ -69,8 +70,10 @@ importFrom(dplyr,filter)
importFrom(dplyr,full_join)
importFrom(dplyr,group_by_all)
importFrom(dplyr,if_all)
importFrom(dplyr,last_col)
importFrom(dplyr,mutate)
importFrom(dplyr,n)
importFrom(dplyr,relocate)
importFrom(dplyr,select)
importFrom(dplyr,starts_with)
importFrom(dplyr,summarise)
Expand Down
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
* Support summarising multiple attributes in `StatDBAttribute`.
* Fix bugs in `mergeExperiments`.
* Fix bugs in `ParseHCA` (no file with extension specified by `file.ext`).
* Optimize `ParseHCA` to return metadata of downloaded files.

-------------------

Expand Down
24 changes: 20 additions & 4 deletions R/hca.R
Original file line number Diff line number Diff line change
Expand Up @@ -235,15 +235,15 @@ ExtractHCAMeta <- function(all.projects.df, organism = NULL, sex = NULL, organ =
#' @param parallel Logical value, whether to download files in parallel. Default: TRUE. When "libcurl" is available for \code{download.file},
#' downloads are performed in parallel by default (\code{parallel} can be FALSE).
#'
#' @return Dataframe contains failed projects or NULL.
#' @return List containing the metadata of successfully downloaded files (down.meta) and the metadata of projects with failed downloads (fail.meta, NULL if all downloads succeed).
#' @importFrom magrittr %>%
#' @importFrom curl curl_fetch_memory
#' @importFrom jsonlite fromJSON
#' @importFrom data.table rbindlist
#' @importFrom parallel detectCores mclapply
#' @importFrom utils download.file
#' @importFrom tidyr spread
#' @importFrom dplyr distinct
#' @importFrom dplyr distinct relocate any_of last_col select everything
#' @importFrom rlang .data
#' @export
#'
Expand Down Expand Up @@ -276,6 +276,7 @@ ParseHCA <- function(meta, file.ext = c("rds", "rdata", "h5", "h5ad", "loom", "t
# check entryId exists
CheckColumns(df = meta, columns = c("entryId", "catalog"))
# filter projects with meta
# projects.valid <- merge(hca.projects.df, meta[c("entryId", "catalog")], by = c("entryId", "catalog"))
projects.valid <- merge(hca.projects.df, meta[c("entryId", "catalog")], by = c("entryId", "catalog"))

# extract data
Expand All @@ -302,6 +303,17 @@ ParseHCA <- function(meta, file.ext = c("rds", "rdata", "h5", "h5ad", "loom", "t
return(x.dataset.df)
})
projects.datasets.df <- data.table::rbindlist(projects.datasets.list, fill = TRUE) %>% as.data.frame()
# remove unused columns
projects.datasets.df$drs_uri <- NULL
projects.datasets.df$uuid <- NULL
projects.datasets.df <- merge(meta[c(
"projectTitle", "projectDescription", "publications",
"sampleEntityType", "organPart", "disease", "preservationMethod", "biologicalSex",
"nucleicAcidSource", "entryId", "catalog"
)], projects.datasets.df, by = c("entryId", "catalog"))
projects.datasets.df <- projects.datasets.df %>%
dplyr::relocate(dplyr::any_of(c("entryId", "catalog")), .after = dplyr::last_col()) %>%
dplyr::select(dplyr::any_of(c("meta", "contentDescription", "name")), dplyr::everything())
projects.datasets.df$lowerformat <- tolower(projects.datasets.df$format)
# filter with file.ext
file.ext <- c(file.ext, paste0(file.ext, ".tar.gz"), paste0(file.ext, ".gz"))
Expand Down Expand Up @@ -358,11 +370,15 @@ ParseHCA <- function(meta, file.ext = c("rds", "rdata", "h5", "h5ad", "loom", "t
fail.status <- which(down.status != 0)
if (length(fail.status) == 0) {
message("All datasets downloaded successfully!")
return(NULL)
res.list <- list(down.meta = projects.datasets.valid.df, fail.meta = NULL)
return(res.list)
} else {
message(length(fail.status), " files downloaded failed, please re-run with fail.meta (meta)")
fail.entry.id <- projects.datasets.valid.df[fail.status, "entryId"] %>% unique()
fail.meta <- meta[meta$entryId %in% fail.entry.id, ]
return(fail.meta)
success.meta <- projects.datasets.valid.df[!projects.datasets.valid.df$entryId %in% fail.entry.id, ]
res.list <- list(down.meta = success.meta, fail.meta = fail.meta)
return(res.list)
}
}
}
1 change: 1 addition & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -637,6 +637,7 @@ HCAExtactData <- function(df) {
df.final <- tidyr::spread(data = df.unlist[c("meta", "type", "value")], key = "type", value = "value")
return(df.final)
}

# used in CELLxGENE, Zenodo
LoadRDS2Seurat <- function(out.folder, merge, obs.value.filter = NULL, obs.keys = NULL, include.genes = NULL) {
rds.files <- list.files(path = out.folder, pattern = "rds$", full.names = TRUE, ignore.case = TRUE)
Expand Down
2 changes: 1 addition & 1 deletion man/ParseHCA.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit bcfc5af

Please sign in to comment.