diff --git a/R/combineRecomputedResults.R b/R/combineRecomputedResults.R
index dd4e59f..f306c66 100644
--- a/R/combineRecomputedResults.R
+++ b/R/combineRecomputedResults.R
@@ -19,9 +19,8 @@
 #' @return A \linkS4class{DataFrame} is returned containing the annotation statistics for each cell or cluster (row).
 #' This mimics the output of \code{\link{classifySingleR}} and contains the following fields:
 #' \itemize{
-#' \item \code{scores}, a numeric matrix of correlations containing the \emph{recomputed} scores.
-#' For any given cell, entries of this matrix are only non-\code{NA} for the assigned label in each reference;
-#' scores are not recomputed for the other labels.
+#' \item \code{scores}, a DataFrame of DataFrames containing the \emph{recomputed} scores for the best label in each reference.
+#' Each nested DataFrame corresponds to a reference and contains \code{labels} (the best label for that cell in this reference) and \code{scores} (the recomputed score).
 #' \item \code{labels}, a character vector containing the per-cell combined label across references.
 #' \item \code{reference}, an integer vector specifying the reference from which the combined label was derived.
 #' \item \code{delta.next}, a numeric vector containing the difference between the best and next-best score.
@@ -186,21 +185,19 @@ combineRecomputedResults <- function(
     )
 
     # Organizing the outputs.
+    if (is.null(names(results))) {
+        names(results) <- sprintf("ref%i", seq_along(results))
+    }
+
     base.scores <- vector("list", length(results))
-    for (r in seq_along(base.scores)) {
-        mat <- results[[r]]$scores
-        mat[] <- NA_real_
-        idx <- cbind(seq_len(nrow(mat)), collated[[r]] + 1L)
-        mat[idx] <- irun$scores[,r]
-        base.scores[[r]] <- mat
+    names(base.scores) <- names(results)
+    for (i in seq_along(base.scores)) {
+        base.scores[[i]] <- DataFrame(labels=factor(results[[i]]$labels, trained[[i]]$labels$unique), scores=irun$scores[,i])
     }
 
-    all.scores <- do.call(cbind, base.scores)
+    all.scores <- DataFrame(lapply(base.scores, I))
     output <- DataFrame(scores = I(all.scores), row.names=rownames(results[[1]]))
-    metadata(output)$label.origin <- .create_label_origin(base.scores)
-
-    chosen <- irun$best + 1L
-    cbind(output, .combine_result_frames(chosen, irun$delta, results))
+    cbind(output, .combine_result_frames(irun$best + 1L, irun$delta, results))
 }
 
 #' @importFrom S4Vectors DataFrame
@@ -228,19 +225,7 @@ combineRecomputedResults <- function(
 
     output$reference <- chosen
     output$delta.next <- delta
-
-    if (is.null(names(results))) {
-        names(results) <- sprintf("ref%i", seq_along(results))
-    }
     output$orig.results <- do.call(DataFrame, lapply(results, I))
     output
 }
-
-#' @importFrom S4Vectors DataFrame
-.create_label_origin <- function(collected.scores) {
-    DataFrame(
-        label=unlist(lapply(collected.scores, colnames)),
-        reference=rep(seq_along(collected.scores), vapply(collected.scores, ncol, 0L))
-    )
-}
diff --git a/R/plotDeltaDistribution.R b/R/plotDeltaDistribution.R
index d5ac457..af7f982 100644
--- a/R/plotDeltaDistribution.R
+++ b/R/plotDeltaDistribution.R
@@ -76,7 +76,7 @@
 plotDeltaDistribution <- function(
     results,
     show = c("delta.med", "delta.next"),
-    labels.use = colnames(results$scores),
+    labels.use = NULL,
     references = NULL,
     chosen.only = TRUE,
     size = 2,
@@ -135,6 +135,14 @@ plotDeltaDistribution <- function(
             pruned <- is.na(current.results$pruned.labels)
         }
 
+        if (is.null(labels.use)) {
+            if (is(results$scores, "DataFrame")) {
+                labels.use <- unlist(lapply(results$scores, function(x) levels(x$labels)))
+            } else {
+                labels.use <- colnames(results$scores)
+            }
+        }
+
         # Actually creating the plot
         plots[[i]] <- .plot_delta_distribution(
             values=values, labels=labels, pruned=pruned, labels.use=labels.use,
diff --git a/R/plotScoreDistribution.R b/R/plotScoreDistribution.R
index 11d5b3f..ac8e863 100644
--- a/R/plotScoreDistribution.R
+++ b/R/plotScoreDistribution.R
@@ -91,7 +91,7 @@
 plotScoreDistribution <- function(
     results,
     show = NULL,
-    labels.use = colnames(results$scores),
+    labels.use = NULL,
     references = NULL,
     scores.use = NULL,
     calls.use = 0,
@@ -137,11 +137,15 @@ plotScoreDistribution <- function(
         chosen <- references[i]
         if (chosen==0L) {
             current.results <- results
+            scores <- current.results$scores
+            if (is(scores, "DataFrame")) { # i.e., from combineRecomputedResults.
+                scores <- .expand_recomputed_scores(scores)
+            }
         } else {
             current.results <- results$orig.results[[chosen]]
+            scores <- current.results$scores
         }
 
-        scores <- current.results$scores
         scores.title <- .values_title(is.combined, chosen, ref.names, show)
 
         # Pulling out the labels to use in this iteration.
@@ -154,6 +158,10 @@ plotScoreDistribution <- function(
             prune.calls <- current.results$pruned.labels
         }
 
+        if (is.null(labels.use)) {
+            labels.use <- colnames(scores)
+        }
+
         # Actually creating the plot
         plots[[i]] <- .plot_score_distribution(
             scores=scores, labels=labels, prune.calls=prune.calls, labels.use=labels.use,
diff --git a/R/plotScoreHeatmap.R b/R/plotScoreHeatmap.R
index 7745725..9f8a6a9 100644
--- a/R/plotScoreHeatmap.R
+++ b/R/plotScoreHeatmap.R
@@ -223,11 +223,15 @@ plotScoreHeatmap <- function(results, cells.use = NULL, labels.use = NULL,
         chosen.scores <- scores.use[i]
         if (chosen.scores==0L) {
             score.results <- results
+            scores <- score.results$scores
+            if (is(scores, "DataFrame")) { # i.e., from combineRecomputedResults.
+                scores <- .expand_recomputed_scores(scores)
+            }
         } else {
             score.results <- results$orig.results[[chosen.scores]]
+            scores <- score.results$scores
         }
 
-        scores <- score.results$scores
         rownames(scores) <- rownames(results)
         scores.title <- .values_title(is.combined, chosen.scores, ref.names, "Scores")
         scores.labels <- score.results$labels
@@ -245,6 +249,10 @@ plotScoreHeatmap <- function(results, cells.use = NULL, labels.use = NULL,
         names(labels) <- names(prune.calls) <- rownames(scores)
         labels.title <- .values_title(is.combined, chosen.calls, ref.names, "Labels")
 
+        if (is.null(labels.use)) {
+            labels.use <- colnames(scores)
+        }
+
         # Actually creating the heatmap.
         output <- .plot_score_heatmap(
             scores=scores,
@@ -465,10 +473,6 @@ plotScoreHeatmap <- function(results, cells.use = NULL, labels.use = NULL,
 .trim_byLabel_and_normalize_scores <- function(
     scores, labels.use, max.labels, normalize, scores.title, scores.labels) {
 
-    # Trim by labels (remove any with no scores)
-    all.na <- apply(scores, 2, FUN = function(x) all(is.na(x)))
-    scores <- scores[,!all.na, drop = FALSE]
-
     # Trim by labels (labels.use)
     if (!is.null(labels.use)) {
         labels.use <- labels.use[labels.use %in% colnames(scores)]
@@ -481,7 +485,7 @@ plotScoreHeatmap <- function(results, cells.use = NULL, labels.use = NULL,
 
     # Trim by labels (max.labels), using primarily the most frequent labels.
     times.best <- table(factor(scores.labels, levels = unique(colnames(scores))))[colnames(scores)]
-    if (!any(is.na(scores))) {
+    if (!anyNA(scores)) {
         # To break ties, we sort by the scaled maximum if there are no NAs.
         # This is done _before_ within-cell normalization of the scores,
         # after which it makes little sense to compare scores between cells.
@@ -489,7 +493,7 @@ plotScoreHeatmap <- function(results, cells.use = NULL, labels.use = NULL,
     } else {
         # If there are NAs - usually from combineRecomputedResults -
         # we sort by the frequency of non-NA occurrences.
-        secondary <- apply(scores, 2, FUN = function(x) sum(!is.na(x)))
+        secondary <- colSums(!is.na(scores))
     }
     to.keep <- order(times.best, secondary, decreasing=TRUE)
     to.keep <- head(to.keep, max.labels)
@@ -650,3 +654,16 @@ plotScoreHeatmap <- function(results, cells.use = NULL, labels.use = NULL,
         next.color.index.discrete = next.color.index.discrete,
         next.color.index.numeric = next.color.index.numeric)
 }
+
+.expand_recomputed_scores <- function(scores) {
+    expanded.scores <- vector("list", ncol(scores))
+    for (i in seq_along(expanded.scores)) {
+        curscores <- scores[[i]]
+        u <- unique(curscores$labels)
+        expanded <- matrix(NA_real_, nrow(curscores), length(u))
+        expanded[cbind(seq_len(nrow(curscores)), match(curscores$labels, u))] <- curscores$scores
+        colnames(expanded) <- u
+        expanded.scores[[i]] <- expanded
+    }
+    do.call(cbind, expanded.scores)
+}
diff --git a/man/combineRecomputedResults.Rd b/man/combineRecomputedResults.Rd
index 1c36164..24110bb 100644
--- a/man/combineRecomputedResults.Rd
+++ b/man/combineRecomputedResults.Rd
@@ -53,9 +53,8 @@ see \code{?\link{trainSingleR}} and \code{?\link{classifySingleR}} for more deta
 A \linkS4class{DataFrame} is returned containing the annotation statistics for each cell or cluster (row).
 This mimics the output of \code{\link{classifySingleR}} and contains the following fields:
 \itemize{
-\item \code{scores}, a numeric matrix of correlations containing the \emph{recomputed} scores.
-For any given cell, entries of this matrix are only non-\code{NA} for the assigned label in each reference;
-scores are not recomputed for the other labels.
+\item \code{scores}, a DataFrame of DataFrames containing the \emph{recomputed} scores for the best label in each reference.
+Each nested DataFrame corresponds to a reference and contains \code{labels} (the best label for that cell in this reference) and \code{scores} (the recomputed score).
 \item \code{labels}, a character vector containing the per-cell combined label across references.
 \item \code{reference}, an integer vector specifying the reference from which the combined label was derived.
 \item \code{delta.next}, a numeric vector containing the difference between the best and next-best score.
diff --git a/man/plotDeltaDistribution.Rd b/man/plotDeltaDistribution.Rd
index 33b7556..d8b1a82 100644
--- a/man/plotDeltaDistribution.Rd
+++ b/man/plotDeltaDistribution.Rd
@@ -7,7 +7,7 @@
 plotDeltaDistribution(
   results,
   show = c("delta.med", "delta.next"),
-  labels.use = colnames(results$scores),
+  labels.use = NULL,
   references = NULL,
   chosen.only = TRUE,
   size = 2,
diff --git a/man/plotScoreDistribution.Rd b/man/plotScoreDistribution.Rd
index 73d004e..4f0402c 100644
--- a/man/plotScoreDistribution.Rd
+++ b/man/plotScoreDistribution.Rd
@@ -7,7 +7,7 @@
 plotScoreDistribution(
   results,
   show = NULL,
-  labels.use = colnames(results$scores),
+  labels.use = NULL,
   references = NULL,
   scores.use = NULL,
   calls.use = 0,
diff --git a/tests/testthat/test-heatmap.R b/tests/testthat/test-heatmap.R
index 481d711..f14af8d 100644
--- a/tests/testthat/test-heatmap.R
+++ b/tests/testthat/test-heatmap.R
@@ -314,32 +314,14 @@ test_that("heatmap multi-ref - 'na.color'", {
         "#000000")
 })
 
-test_that("heatmap multi-ref - labels with no scores are removed", {
-    combined$scores <- cbind(combined$scores, "f" = NA)
-    expect_true("f" %in% colnames(combined$scores))
-    expect_false("f" %in% rownames(plotScoreHeatmap(results = combined, silent = TRUE, return.data = TRUE,
-        scores.use = 0)$mat))
-})
-
 test_that("heatmap multi-ref - labels with least calls/calcs are removed by 'max.labels'", {
-    combined$scores <- cbind(combined$scores, "neverCalled" = 1) # actual score is immaterial
-    combined$scores <- cbind(combined$scores, "rarelyCalc" = NA)
-    combined$scores[1,"rarelyCalc"] <- 1 # Needs at least one score to not be removed anyway.
-    expect_true(all(c("neverCalled", "rarelyCalc") %in% colnames(combined$scores)))
-
-    # Both there with no trimming
-    expect_true(all(c("neverCalled", "rarelyCalc") %in% rownames(plotScoreHeatmap(results = combined, silent = TRUE, return.data = TRUE, scores.use = 0,
-        max.labels = 40)$mat)))
-
-    # The rarely picked for calculation "rarelyCalc" label should be removed first
-    expect_true("neverCalled" %in% rownames(plotScoreHeatmap(results = combined, silent = TRUE, return.data = TRUE, scores.use = 0,
-        max.labels = 11)$mat))
-    expect_false("rarelyCalc" %in% rownames(plotScoreHeatmap(results = combined, silent = TRUE, return.data = TRUE, scores.use = 0,
-        max.labels = 11)$mat))
+    combined$scores[[1]][1,"labels"] <- "rarelyCalc"
 
-    # The never picked as final label "neverCalled" label should be removed next
-    expect_false("neverCalled" %in% rownames(plotScoreHeatmap(results = combined, silent = TRUE, return.data = TRUE, scores.use = 0,
-        max.labels = 10)$mat))
+    # Present with no trimming
+    expect_true("rarelyCalc" %in% rownames(plotScoreHeatmap(results = combined, silent = TRUE, return.data = TRUE, scores.use = 0,
+        max.labels = 40)$mat))
+
+    # The rarely picked for calculation "rarelyCalc" label should be removed.
     expect_false("rarelyCalc" %in% rownames(plotScoreHeatmap(results = combined, silent = TRUE, return.data = TRUE, scores.use = 0,
         max.labels = 10)$mat))
 })
diff --git a/tests/testthat/test-recomputed.R b/tests/testthat/test-recomputed.R
index 7c45d05..c281752 100644
--- a/tests/testthat/test-recomputed.R
+++ b/tests/testthat/test-recomputed.R
@@ -30,12 +30,17 @@ test_that("combineRecomputedResults works as expected (light check)", {
     combined <- combineRecomputedResults(
         results=list(pred1, pred2),
         test=test,
-        trained=list(train1, train2))
+        trained=list(train1, train2),
+        fine.tune=FALSE
+    )
 
-    # Checking the sanity of the output.
-    obs <- apply(combined$scores, 1, FUN=function(x) colnames(combined$scores)[!is.na(x)])
-    ref <- rbind(pred1$labels, pred2$labels)
-    expect_identical(obs, ref)
+    expect_identical(as.character(combined$scores$ref1$labels), pred1$labels)
+    expect_identical(as.character(combined$scores$ref2$labels), pred2$labels)
+
+    aggregated.scores <- do.call(cbind, lapply(combined$scores, function(x) x$scores))
+    aggregated.labels <- do.call(cbind, lapply(combined$scores, function(x) as.character(x$labels)))
+    expect_identical(max.col(aggregated.scores), combined$reference)
+    expect_identical(aggregated.labels[cbind(seq_len(nrow(aggregated.labels)), max.col(aggregated.scores))], combined$labels)
 
     expect_true(all(combined$labels == pred1$labels | combined$labels==pred2$labels))
     expect_true(all(combined$first.labels == pred1$first.labels | combined$first.labels==pred2$first.labels))
@@ -46,9 +51,6 @@ test_that("combineRecomputedResults works as expected (light check)", {
         is.na(combined$pruned.labels)==is.na(pred1$pruned.labels) |
         is.na(combined$pruned.labels)==is.na(pred2$pruned.labels)
     ))
-
-    top <- apply(combined$scores, 1, FUN=function(x) colnames(combined$scores)[which.max(x)])
-    expect_identical(top, combined$labels)
 })
 
 test_that("combineRecomputedResults matrix fragmentation works as expected", {
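
A minimal usage sketch for the new scores format, not part of the patch: combined stands for a hypothetical combineRecomputedResults() result with two unnamed references, so the nested DataFrames are called "ref1" and "ref2", the labels column is assumed to be a factor over each reference's full label set, and expand_one is an illustrative helper rather than a package function.

    # Per-reference best label and recomputed score for each cell.
    ref1 <- combined$scores$ref1
    head(ref1$labels)   # best label from reference 1, one per cell
    head(ref1$scores)   # recomputed correlation for that label

    # Re-expand into the old-style matrix layout (NA for non-best labels),
    # similar in spirit to the .expand_recomputed_scores() helper above.
    expand_one <- function(df) {
        u <- levels(df$labels)
        mat <- matrix(NA_real_, nrow(df), length(u), dimnames=list(NULL, u))
        mat[cbind(seq_len(nrow(df)), as.integer(df$labels))] <- df$scores
        mat
    }
    wide <- do.call(cbind, lapply(combined$scores, expand_one))

Downstream code that previously consumed the NA-filled score matrix directly, as plotScoreHeatmap and plotScoreDistribution did before this patch, can run through an expansion like this instead of indexing combined$scores by column name.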