From 07aebca5bc182bd1424e1902651da7f9728fe3e1 Mon Sep 17 00:00:00 2001 From: LTLA Date: Sat, 14 Dec 2024 21:06:48 -0800 Subject: [PATCH] Cleaned up the per-reference scores from combineRecomputedResults. Now we return a DataFrame of nested Dataframes, where each inner DataFrame corresponds to a reference and contains the identity of the best label and the recomputed score in that reference. This is effectively a sparse version of the previous "expanded with NA" format, which is easier to read and saves memory. We also update the plotting functions to work with this new format, mostly by regenerated the previous expanded format as needed. --- R/combineRecomputedResults.R | 37 ++++++++++---------------------- R/plotDeltaDistribution.R | 10 ++++++++- R/plotScoreDistribution.R | 12 +++++++++-- R/plotScoreHeatmap.R | 31 ++++++++++++++++++++------ man/combineRecomputedResults.Rd | 5 ++--- man/plotDeltaDistribution.Rd | 2 +- man/plotScoreDistribution.Rd | 2 +- tests/testthat/test-heatmap.R | 30 ++++++-------------------- tests/testthat/test-recomputed.R | 18 +++++++++------- 9 files changed, 74 insertions(+), 73 deletions(-) diff --git a/R/combineRecomputedResults.R b/R/combineRecomputedResults.R index dd4e59f..f306c66 100644 --- a/R/combineRecomputedResults.R +++ b/R/combineRecomputedResults.R @@ -19,9 +19,8 @@ #' @return A \linkS4class{DataFrame} is returned containing the annotation statistics for each cell or cluster (row). #' This mimics the output of \code{\link{classifySingleR}} and contains the following fields: #' \itemize{ -#' \item \code{scores}, a numeric matrix of correlations containing the \emph{recomputed} scores. -#' For any given cell, entries of this matrix are only non-\code{NA} for the assigned label in each reference; -#' scores are not recomputed for the other labels. +#' \item \code{scores}, a DataFrame of DataFrames containing the \emph{recomputed} scores for the best label in each reference. +#' Each nested DataFrame corresponds to a reference and contains \code{labels} (the best label for that cell in this reference) and \code{scores} (the recomputed score). #' \item \code{labels}, a character vector containing the per-cell combined label across references. #' \item \code{reference}, an integer vector specifying the reference from which the combined label was derived. #' \item \code{delta.next}, a numeric vector containing the difference between the best and next-best score. @@ -186,21 +185,19 @@ combineRecomputedResults <- function( ) # Organizing the outputs. + if (is.null(names(results))) { + names(results) <- sprintf("ref%i", seq_along(results)) + } + base.scores <- vector("list", length(results)) - for (r in seq_along(base.scores)) { - mat <- results[[r]]$scores - mat[] <- NA_real_ - idx <- cbind(seq_len(nrow(mat)), collated[[r]] + 1L) - mat[idx] <- irun$scores[,r] - base.scores[[r]] <- mat + names(base.scores) <- names(results) + for (i in seq_along(base.scores)) { + base.scores[[i]] <- DataFrame(labels=results[[i]]$labels, trained[[i]]$labels$unique, scores=irun$scores[,i]) } - all.scores <- do.call(cbind, base.scores) + all.scores <- DataFrame(lapply(base.scores, I)) output <- DataFrame(scores = I(all.scores), row.names=rownames(results[[1]])) - metadata(output)$label.origin <- .create_label_origin(base.scores) - - chosen <- irun$best + 1L - cbind(output, .combine_result_frames(chosen, irun$delta, results)) + cbind(output, .combine_result_frames(irun$best + 1L, irun$delta, results)) } #' @importFrom S4Vectors DataFrame @@ -228,19 +225,7 @@ combineRecomputedResults <- function( output$reference <- chosen output$delta.next <- delta - - if (is.null(names(results))) { - names(results) <- sprintf("ref%i", seq_along(results)) - } output$orig.results <- do.call(DataFrame, lapply(results, I)) output } - -#' @importFrom S4Vectors DataFrame -.create_label_origin <- function(collected.scores) { - DataFrame( - label=unlist(lapply(collected.scores, colnames)), - reference=rep(seq_along(collected.scores), vapply(collected.scores, ncol, 0L)) - ) -} diff --git a/R/plotDeltaDistribution.R b/R/plotDeltaDistribution.R index d5ac457..af7f982 100644 --- a/R/plotDeltaDistribution.R +++ b/R/plotDeltaDistribution.R @@ -76,7 +76,7 @@ plotDeltaDistribution <- function( results, show = c("delta.med", "delta.next"), - labels.use = colnames(results$scores), + labels.use = NULL, references = NULL, chosen.only = TRUE, size = 2, @@ -135,6 +135,14 @@ plotDeltaDistribution <- function( pruned <- is.na(current.results$pruned.labels) } + if (is.null(labels.use)) { + if (is(results$scores, "DataFrame")) { + labels.use <- unlist(lapply(results$scores, function(x) levels(x$labels))) + } else { + labels.use <- colnames(results$scores) + } + } + # Actually creating the plot plots[[i]] <- .plot_delta_distribution( values=values, labels=labels, pruned=pruned, labels.use=labels.use, diff --git a/R/plotScoreDistribution.R b/R/plotScoreDistribution.R index 11d5b3f..ac8e863 100644 --- a/R/plotScoreDistribution.R +++ b/R/plotScoreDistribution.R @@ -91,7 +91,7 @@ plotScoreDistribution <- function( results, show = NULL, - labels.use = colnames(results$scores), + labels.use = NULL, references = NULL, scores.use = NULL, calls.use = 0, @@ -137,11 +137,15 @@ plotScoreDistribution <- function( chosen <- references[i] if (chosen==0L) { current.results <- results + scores <- current.results$scores + if (is(scores, "DataFrame")) { # i.e., from combineRecomputedResults. + scores <- .expand_recomputed_scores(scores) + } } else { current.results <- results$orig.results[[chosen]] + scores <- current.results$scores } - scores <- current.results$scores scores.title <- .values_title(is.combined, chosen, ref.names, show) # Pulling out the labels to use in this iteration. @@ -154,6 +158,10 @@ plotScoreDistribution <- function( prune.calls <- current.results$pruned.labels } + if (is.null(labels.use)) { + labels.use <- colnames(scores) + } + # Actually creating the plot plots[[i]] <- .plot_score_distribution( scores=scores, labels=labels, prune.calls=prune.calls, labels.use=labels.use, diff --git a/R/plotScoreHeatmap.R b/R/plotScoreHeatmap.R index 7745725..9f8a6a9 100644 --- a/R/plotScoreHeatmap.R +++ b/R/plotScoreHeatmap.R @@ -223,11 +223,15 @@ plotScoreHeatmap <- function(results, cells.use = NULL, labels.use = NULL, chosen.scores <- scores.use[i] if (chosen.scores==0L) { score.results <- results + scores <- score.results$scores + if (is(scores, "DataFrame")) { # i.e., from combineRecomputedResults. + scores <- .expand_recomputed_scores(scores) + } } else { score.results <- results$orig.results[[chosen.scores]] + scores <- score.results$scores } - scores <- score.results$scores rownames(scores) <- rownames(results) scores.title <- .values_title(is.combined, chosen.scores, ref.names, "Scores") scores.labels <- score.results$labels @@ -245,6 +249,10 @@ plotScoreHeatmap <- function(results, cells.use = NULL, labels.use = NULL, names(labels) <- names(prune.calls) <- rownames(scores) labels.title <- .values_title(is.combined, chosen.calls, ref.names, "Labels") + if (is.null(labels.use)) { + labels.use <- colnames(scores) + } + # Actually creating the heatmap. output <- .plot_score_heatmap( scores=scores, @@ -465,10 +473,6 @@ plotScoreHeatmap <- function(results, cells.use = NULL, labels.use = NULL, .trim_byLabel_and_normalize_scores <- function( scores, labels.use, max.labels, normalize, scores.title, scores.labels) { - # Trim by labels (remove any with no scores) - all.na <- apply(scores, 2, FUN = function(x) all(is.na(x))) - scores <- scores[,!all.na, drop = FALSE] - # Trim by labels (labels.use) if (!is.null(labels.use)) { labels.use <- labels.use[labels.use %in% colnames(scores)] @@ -481,7 +485,7 @@ plotScoreHeatmap <- function(results, cells.use = NULL, labels.use = NULL, # Trim by labels (max.labels), using primarily the most frequent labels. times.best <- table(factor(scores.labels, levels = unique(colnames(scores))))[colnames(scores)] - if (!any(is.na(scores))) { + if (!anyNA(scores)) { # To break ties, we sort by the scaled maximum if there are no NAs. # This is done _before_ within-cell normalization of the scores, # after which it makes little sense to compare scores between cells. @@ -489,7 +493,7 @@ plotScoreHeatmap <- function(results, cells.use = NULL, labels.use = NULL, } else { # If there are NAs - usually from combineRecomputedResults - # we sort by the frequency of non-NA occurrences. - secondary <- apply(scores, 2, FUN = function(x) sum(!is.na(x))) + secondary <- colSums(!is.na(scores)) } to.keep <- order(times.best, secondary, decreasing=TRUE) to.keep <- head(to.keep, max.labels) @@ -650,3 +654,16 @@ plotScoreHeatmap <- function(results, cells.use = NULL, labels.use = NULL, next.color.index.discrete = next.color.index.discrete, next.color.index.numeric = next.color.index.numeric) } + +.expand_recomputed_scores <- function(scores) { + expanded.scores <- vector("list", ncol(scores)) + for (i in seq_along(expanded.scores)) { + curscores <- scores[[i]] + u <- unique(curscores$labels) + expanded <- matrix(NA_real_, nrow(curscores), length(u)) + expanded[cbind(seq_len(nrow(curscores)), match(curscores$labels, u))] <- curscores$scores + colnames(expanded) <- u + expanded.scores[[i]] <- expanded + } + do.call(cbind, expanded.scores) +} diff --git a/man/combineRecomputedResults.Rd b/man/combineRecomputedResults.Rd index 1c36164..24110bb 100644 --- a/man/combineRecomputedResults.Rd +++ b/man/combineRecomputedResults.Rd @@ -53,9 +53,8 @@ see \code{?\link{trainSingleR}} and \code{?\link{classifySingleR}} for more deta A \linkS4class{DataFrame} is returned containing the annotation statistics for each cell or cluster (row). This mimics the output of \code{\link{classifySingleR}} and contains the following fields: \itemize{ -\item \code{scores}, a numeric matrix of correlations containing the \emph{recomputed} scores. -For any given cell, entries of this matrix are only non-\code{NA} for the assigned label in each reference; -scores are not recomputed for the other labels. +\item \code{scores}, a DataFrame of DataFrames containing the \emph{recomputed} scores for the best label in each reference. +Each nested DataFrame corresponds to a reference and contains \code{labels} (the best label for that cell in this reference) and \code{scores} (the recomputed score). \item \code{labels}, a character vector containing the per-cell combined label across references. \item \code{reference}, an integer vector specifying the reference from which the combined label was derived. \item \code{delta.next}, a numeric vector containing the difference between the best and next-best score. diff --git a/man/plotDeltaDistribution.Rd b/man/plotDeltaDistribution.Rd index 33b7556..d8b1a82 100644 --- a/man/plotDeltaDistribution.Rd +++ b/man/plotDeltaDistribution.Rd @@ -7,7 +7,7 @@ plotDeltaDistribution( results, show = c("delta.med", "delta.next"), - labels.use = colnames(results$scores), + labels.use = NULL, references = NULL, chosen.only = TRUE, size = 2, diff --git a/man/plotScoreDistribution.Rd b/man/plotScoreDistribution.Rd index 73d004e..4f0402c 100644 --- a/man/plotScoreDistribution.Rd +++ b/man/plotScoreDistribution.Rd @@ -7,7 +7,7 @@ plotScoreDistribution( results, show = NULL, - labels.use = colnames(results$scores), + labels.use = NULL, references = NULL, scores.use = NULL, calls.use = 0, diff --git a/tests/testthat/test-heatmap.R b/tests/testthat/test-heatmap.R index 481d711..f14af8d 100644 --- a/tests/testthat/test-heatmap.R +++ b/tests/testthat/test-heatmap.R @@ -314,32 +314,14 @@ test_that("heatmap multi-ref - 'na.color'", { "#000000") }) -test_that("heatmap multi-ref - labels with no scores are removed", { - combined$scores <- cbind(combined$scores, "f" = NA) - expect_true("f" %in% colnames(combined$scores)) - expect_false("f" %in% rownames(plotScoreHeatmap(results = combined, silent = TRUE, return.data = TRUE, - scores.use = 0)$mat)) -}) - test_that("heatmap multi-ref - labels with least calls/calcs are removed by 'max.labels'", { - combined$scores <- cbind(combined$scores, "neverCalled" = 1) # actual score is immaterial - combined$scores <- cbind(combined$scores, "rarelyCalc" = NA) - combined$scores[1,"rarelyCalc"] <- 1 # Needs at least one score to not be removed anyway. - expect_true(all(c("neverCalled", "rarelyCalc") %in% colnames(combined$scores))) - - # Both there with no trimming - expect_true(all(c("neverCalled", "rarelyCalc") %in% rownames(plotScoreHeatmap(results = combined, silent = TRUE, return.data = TRUE, scores.use = 0, - max.labels = 40)$mat))) - - # The rarely picked for calculation "rarelyCalc" label should be removed first - expect_true("neverCalled" %in% rownames(plotScoreHeatmap(results = combined, silent = TRUE, return.data = TRUE, scores.use = 0, - max.labels = 11)$mat)) - expect_false("rarelyCalc" %in% rownames(plotScoreHeatmap(results = combined, silent = TRUE, return.data = TRUE, scores.use = 0, - max.labels = 11)$mat)) + combined$scores[[1]][1,"labels"] <- "rarelyCalc" - # The never picked as final label "neverCalled" label should be removed next - expect_false("neverCalled" %in% rownames(plotScoreHeatmap(results = combined, silent = TRUE, return.data = TRUE, scores.use = 0, - max.labels = 10)$mat)) + # Present with no trimming + expect_true("rarelyCalc" %in% rownames(plotScoreHeatmap(results = combined, silent = TRUE, return.data = TRUE, scores.use = 0, + max.labels = 40)$mat)) + + # The rarely picked for calculation "rarelyCalc" label should be removed. expect_false("rarelyCalc" %in% rownames(plotScoreHeatmap(results = combined, silent = TRUE, return.data = TRUE, scores.use = 0, max.labels = 10)$mat)) }) diff --git a/tests/testthat/test-recomputed.R b/tests/testthat/test-recomputed.R index 7c45d05..c281752 100644 --- a/tests/testthat/test-recomputed.R +++ b/tests/testthat/test-recomputed.R @@ -30,12 +30,17 @@ test_that("combineRecomputedResults works as expected (light check)", { combined <- combineRecomputedResults( results=list(pred1, pred2), test=test, - trained=list(train1, train2)) + trained=list(train1, train2), + fine.tune=FALSE + ) - # Checking the sanity of the output. - obs <- apply(combined$scores, 1, FUN=function(x) colnames(combined$scores)[!is.na(x)]) - ref <- rbind(pred1$labels, pred2$labels) - expect_identical(obs, ref) + expect_identical(combined$scores$ref1$labels, pred1$labels) + expect_identical(combined$scores$ref2$labels, pred2$labels) + + aggregated.scores <- do.call(cbind, lapply(combined$scores, function(x) x$scores)) + aggregated.labels <- do.call(cbind, lapply(combined$scores, function(x) as.character(x$labels))) + expect_identical(max.col(aggregated.scores), combined$reference) + expect_identical(aggregated.labels[cbind(seq_len(nrow(aggregated.labels)), max.col(aggregated.scores))], combined$labels) expect_true(all(combined$labels == pred1$labels | combined$labels==pred2$labels)) expect_true(all(combined$first.labels == pred1$first.labels | combined$first.labels==pred2$first.labels)) @@ -46,9 +51,6 @@ test_that("combineRecomputedResults works as expected (light check)", { is.na(combined$pruned.labels)==is.na(pred1$pruned.labels) | is.na(combined$pruned.labels)==is.na(pred2$pruned.labels) )) - - top <- apply(combined$scores, 1, FUN=function(x) colnames(combined$scores)[which.max(x)]) - expect_identical(top, combined$labels) }) test_that("combineRecomputedResults matrix fragmentation works as expected", {