From 5f024d838ce5e060ef80e8963ba720ba115bcd6e Mon Sep 17 00:00:00 2001 From: bbimber Date: Sat, 22 Feb 2020 14:51:28 -0800 Subject: [PATCH] Improve cell hashing reports (#121) * Add more plots to cell hashing output * Bugfix multiseq normalization --- DESCRIPTION | 2 +- R/CellHashing.R | 65 +++++++++++++++++++++------- R/GeneNames.R | 2 +- R/Seurat_III.R | 13 +----- R/Seurat_III_Fixes.R | 7 ++- R/Utils.R | 16 +++++++ man/DoMULTIseqDemux.Rd | 4 +- man/GenerateCellHashCallsMultiSeq.Rd | 2 +- tests/testthat/test-cellhashing.R | 20 +++++---- tests/testthat/test-genealias.R | 6 +-- 10 files changed, 92 insertions(+), 45 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d035573..b349764 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -57,9 +57,9 @@ Remotes: bioc::release/SingleR RoxygenNote: 7.0.2 Collate: + 'Utils.R' 'LabKeySettings.R' 'CellHashing.R' - 'Utils.R' 'Classification_Caret.R' 'Classification_Garnett.R' 'Classification_MCE.R' diff --git a/R/CellHashing.R b/R/CellHashing.R index 442b5ba..b2b42c3 100644 --- a/R/CellHashing.R +++ b/R/CellHashing.R @@ -1,4 +1,5 @@ #' @include LabKeySettings.R +#' @include Utils.R #' @import Seurat #' @import Rlabkey @@ -296,6 +297,7 @@ GenerateQcPlots <- function(barcodeData){ #boxplot per HTO: barcodeMatrix <- as.matrix(barcodeData) melted <- setNames(reshape2::melt(barcodeMatrix), c('HTO', 'CellBarcode', 'Count')) + melted$HTO <- naturalsort::naturalfactor(melted$HTO) print(ggplot(melted, aes(x = HTO, y = Count)) + geom_boxplot() + xlab('HTO') + @@ -304,6 +306,23 @@ GenerateQcPlots <- function(barcodeData){ theme(axis.text.x = element_text(angle = 90, hjust = 1)) ) + print(ggplot(melted, aes(x = Count, color = HTO)) + + geom_density() + + xlab('Count/Cell') + + ylab('Density') + + ggtitle('HTO Counts Per Cell') + + facet_grid(HTO ~ ., scales = 'free') + ) + + melted$LogCount <- log10(melted$Count + 0.5) + print(ggplot(melted, aes(x = LogCount, color = HTO)) + + geom_density() + + xlab('log10(Count)/Cell') + + ylab('Density') + + ggtitle('Log10 HTO Counts Per Cell') + + facet_grid(HTO ~ ., scales = 'free') + ) + melted$Count <- melted$Count + 0.5 print(ggplot(melted, aes(x = HTO, y = Count)) + geom_boxplot() + @@ -488,7 +507,7 @@ DebugDemux <- function(seuratObj, assay = 'HTO', reportKmeans = FALSE) { Idents(object = seuratObj, cells = names(x = init.clusters$clustering), drop = TRUE) <- init.clusters$clustering # Calculate tSNE embeddings with a distance matrix - perplexity <- .InferPerplexity(seuratObj, 100) + perplexity <- .InferPerplexityFromSeuratObj(seuratObj, 100) seuratObj[['hto_tsne']] <- RunTSNE(dist(t(data)), assay = assay, perplexity = perplexity) P <- DimPlot(seuratObj, reduction = 'hto_tsne', label = TRUE) P <- P + ggtitle('Clusters: clara') @@ -549,11 +568,15 @@ DoHtoDemux <- function(seuratObj, positive.quantile = 0.99, label = 'Seurat HTOD #' @description A description #' @return A modified Seurat object. #' @import data.table -GenerateCellHashCallsMultiSeq <- function(barcodeData) { +GenerateCellHashCallsMultiSeq <- function(barcodeData, method = 'seurat') { seuratObj <- CreateSeuratObject(barcodeData, assay = 'HTO') tryCatch({ - seuratObj <- DoMULTIseqDemux(seuratObj) + if (method == 'seurat') { + seuratObj <- DoMULTIseqDemux(seuratObj) + } else { + stop(pate0('Unknown method: ', method)) + } return(data.table(Barcode = as.factor(colnames(seuratObj)), HTO_classification = seuratObj$MULTI_ID, HTO_classification.global = seuratObj$MULTI_classification.global, key = c('Barcode'))) }, error = function(e){ @@ -585,32 +608,33 @@ GenerateCellHashingCalls <- function(barcodeData, positive.quantile = 0.99, atte return(dt) } -#' @title A Title +#' @title Perform HTO classification using Seurat's implementation of MULTI-seq classification #' -#' @description A description +#' @description Perform HTO classification using Seurat's implementation of MULTI-seq classification #' @param seuratObj, A Seurat object. #' @return A modified Seurat object. DoMULTIseqDemux <- function(seuratObj, assay = 'HTO', autoThresh = TRUE, quantile = NULL, maxiter = 20, qrange = seq(from = 0.2, to = 0.95, by = 0.05)) { - ## Normalize Data: Log2 Transform, mean-center counts <- GetAssayData(seuratObj, assay = assay, slot = 'counts') - log2Scaled <- as.data.frame(log2(counts)) - for (i in 1:ncol(counts)) { + log2Scaled <- as.data.frame(log2(Matrix::t(counts))) + for (i in 1:ncol(log2Scaled)) { ind <- which(is.finite(log2Scaled[,i]) == FALSE) log2Scaled[ind,i] <- 0 log2Scaled[,i] <- log2Scaled[,i]-mean(log2Scaled[,i]) } seuratObjMS <- CreateSeuratObject(counts, assay = 'MultiSeq') - seuratObjMS[['MultiSeq']]@data <- as.matrix(log2Scaled) + seuratObjMS[['MultiSeq']]@data <- Matrix::t(as.matrix(log2Scaled)) seuratObjMS <- MULTIseqDemux(seuratObjMS, assay = "MultiSeq", quantile = quantile, verbose = TRUE, autoThresh = autoThresh, maxiter = maxiter, qrange = qrange) - seuratObj$MULTI_ID <- as.character(seuratObjMS$MULTI_ID) - seuratObj$MULTI_classification.global <- as.character(seuratObjMS$MULTI_ID) - seuratObj$MULTI_classification.global[!(seuratObjMS$MULTI_ID %in% c('Negative', 'Doublet'))] <- 'Singlet' - seuratObj$MULTI_classification.global <- as.factor(seuratObj$MULTI_classification.global) + seuratObjMS$MULTI_classification.global <- as.character(seuratObjMS$MULTI_ID) + seuratObjMS$MULTI_classification.global[!(seuratObjMS$MULTI_ID %in% c('Negative', 'Doublet'))] <- 'Singlet' + seuratObjMS$MULTI_classification.global <- as.factor(seuratObjMS$MULTI_classification.global) - HtoSummary(seuratObj, label = 'MULTI-SEQ', htoClassificationField = 'MULTI_ID', globalClassificationField = 'MULTI_classification.global') + HtoSummary(seuratObjMS, label = 'MULTI-SEQ', htoClassificationField = 'MULTI_ID', globalClassificationField = 'MULTI_classification.global', assay = 'MultiSeq') + + seuratObj$MULTI_ID <- as.character(seuratObjMS$MULTI_ID) + seuratObj$MULTI_classification.global <- seuratObjMS$MULTI_classification.global return(seuratObj) } @@ -635,7 +659,7 @@ HtoSummary <- function(seuratObj, htoClassificationField, globalClassificationFi } if (doTSNE) { - perplexity <- .InferPerplexity(seuratObj, 100) + perplexity <- .InferPerplexityFromSeuratObj(seuratObj, 100) seuratObj[['hto_tsne']] <- RunTSNE(dist(Matrix::t(GetAssayData(seuratObj, slot = "data", assay = assay))), assay = assay, perplexity = perplexity) print(DimPlot(seuratObj, reduction = 'hto_tsne', group.by = htoClassificationField, label = TRUE) + ggtitle(label)) print(DimPlot(seuratObj, reduction = 'hto_tsne', group.by = globalClassificationField, label = TRUE) + ggtitle(label)) @@ -763,6 +787,15 @@ ProcessEnsemblHtoCalls <- function(mc, sc, barcodeData, ggtitle('Discordance By HTO Call') + ylab('Seurat') + xlab('MULTI-seq') ) + discord <- merged[!merged$HasSeuratCall | !merged$HasMultiSeqCall,] + discord <- discord[discord$HasSeuratCall | discord$HasMultiSeqCall,] + discord <- discord %>% group_by(HTO_classification.MultiSeq, HTO_classification.Seurat) %>% summarise(Count = dplyr::n()) + print(qplot(x=HTO_classification.MultiSeq, y=HTO_classification.Seurat, data=discord, fill=Count, geom="tile") + + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + + scale_fill_gradient2(low = "blue", mid = "white", high = "red") + + ggtitle('Calls Made By Single Caller') + ylab('Seurat') + xlab('MULTI-seq') + ) + # These calls should be identical, except for possibly negatives from one method that are non-negative in the other # For the time being, accept those as correct. merged$FinalCall <- as.character(merged$HTO_classification.MultiSeq) @@ -1023,7 +1056,7 @@ DownloadAndAppendCellHashing <- function(seuratObject, outPath = '.'){ cellHashingId <- FindMatchedCellHashing(barcodePrefix) if (is.null(cellHashingId)){ print(paste0('Cell hashing not used for prefix: ', barcodePrefix, ', skipping')) - next(seuratObject) + next } else if (is.na(cellHashingId)){ stop(paste0('Unable to find cellHashing calls table file for prefix: ', barcodePrefix)) } diff --git a/R/GeneNames.R b/R/GeneNames.R index d64118c..bf6bbd4 100644 --- a/R/GeneNames.R +++ b/R/GeneNames.R @@ -35,7 +35,7 @@ QueryEnsemblSymbolAndHumanHomologs <- function(ensemblIds, biomart = "ensembl", ret <- ret[order(ret$SortOrder),] ret$Label <- as.character(ret$ensembl_gene_id) - ret$Label[!is.na(ret$hsapiens_homolog_associated_gene_name)] <- ret$hsapiens_homolog_associated_gene_name[!is.na(ret$hsapiens_homolog_associated_gene_name)] + ret$Label[!is.na(ret$hsapiens_homolog_associated_gene_name)] <- paste0(ret$hsapiens_homolog_associated_gene_name[!is.na(ret$hsapiens_homolog_associated_gene_name)], '(hs)') ret$Label[!is.na(ret$external_gene_name)] <- ret$external_gene_name[!is.na(ret$external_gene_name)] return(ret) diff --git a/R/Seurat_III.R b/R/Seurat_III.R index 6ebd674..c90caec 100644 --- a/R/Seurat_III.R +++ b/R/Seurat_III.R @@ -828,7 +828,7 @@ FindClustersAndDimRedux <- function(seuratObj, dimsToUse = NULL, saveFile = NULL } if (forceReCalc | !HasStepRun(seuratObj, 'RunTSNE', forceReCalc = forceReCalc)) { - perplexity <- .InferPerplexity(seuratObj) + perplexity <- .InferPerplexityFromSeuratObj(seuratObj) seuratObj <- RunTSNE(object = seuratObj, dims.use = dimsToUse, check_duplicates = FALSE, perplexity = perplexity) seuratObj <- MarkStepRun(seuratObj, 'RunTSNE', saveFile) } @@ -1558,14 +1558,3 @@ PlotMarkerSet <- function(seuratObj, reduction, title, features) { print(AddTitleToMultiPlot(FeaturePlot(seuratObj, features = featuresToPlot, reduction = reduction), title)) } -.InferPerplexity <- function(seuratObj, perplexity = 30) { - if (ncol(seuratObj) - 1 < 3 * perplexity) { - print(paste0('Perplexity is too large for the number of samples: ', ncol(seuratObj))) - perplexityNew <- floor((ncol(seuratObj) - 1) / 3) - print(paste0('lowering from ', perplexity, ' to: ', perplexityNew)) - perplexity <- perplexityNew - } - - return(perplexity) -} - diff --git a/R/Seurat_III_Fixes.R b/R/Seurat_III_Fixes.R index 82ff397..c7b49c4 100644 --- a/R/Seurat_III_Fixes.R +++ b/R/Seurat_III_Fixes.R @@ -97,6 +97,7 @@ HTODemux2 <- function( idents = levels(x = Idents(object = object))[[minNonZero]] )] + doSkip <- FALSE tryCatch({ fit <- suppressWarnings(fitdist(data = values.use, distr = "nbinom")) if (plotDist) { @@ -108,8 +109,12 @@ HTODemux2 <- function( print(paste0('Error fitting nbinom, skipping: ', hto)) print(e) saveRDS(values.use, file = paste0('./', hto, '.nbinom.data.rds')) - next + doSkip <- TRUE }) + + if (doSkip) { + next + } } discrete[hto, names(x = which(x = values > cutoff))] <- 1 diff --git a/R/Utils.R b/R/Utils.R index b7c53f4..1cb2798 100644 --- a/R/Utils.R +++ b/R/Utils.R @@ -340,3 +340,19 @@ find_peaks <- function (x, m = 4){ pks } + +.InferPerplexityFromSeuratObj <- function(seuratObj, perplexity = 30) { + return(.InferPerplexity(ncol(seuratObj), perplexity)) +} + + +.InferPerplexity <- function(sampleNumber, perplexity = 30) { + if (sampleNumber - 1 < 3 * perplexity) { + print(paste0('Perplexity is too large for the number of samples: ', sampleNumber)) + perplexityNew <- floor((sampleNumber - 1) / 3) + print(paste0('lowering from ', perplexity, ' to: ', perplexityNew)) + perplexity <- perplexityNew + } + + return(perplexity) +} diff --git a/man/DoMULTIseqDemux.Rd b/man/DoMULTIseqDemux.Rd index e3b2936..cb448ea 100644 --- a/man/DoMULTIseqDemux.Rd +++ b/man/DoMULTIseqDemux.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/CellHashing.R \name{DoMULTIseqDemux} \alias{DoMULTIseqDemux} -\title{A Title} +\title{Perform HTO classification using Seurat's implementation of MULTI-seq classification} \usage{ DoMULTIseqDemux( seuratObj, @@ -20,5 +20,5 @@ DoMULTIseqDemux( A modified Seurat object. } \description{ -A description +Perform HTO classification using Seurat's implementation of MULTI-seq classification } diff --git a/man/GenerateCellHashCallsMultiSeq.Rd b/man/GenerateCellHashCallsMultiSeq.Rd index 3702db6..90628c3 100644 --- a/man/GenerateCellHashCallsMultiSeq.Rd +++ b/man/GenerateCellHashCallsMultiSeq.Rd @@ -4,7 +4,7 @@ \alias{GenerateCellHashCallsMultiSeq} \title{A Title} \usage{ -GenerateCellHashCallsMultiSeq(barcodeData) +GenerateCellHashCallsMultiSeq(barcodeData, method = "seurat") } \value{ A modified Seurat object. diff --git a/tests/testthat/test-cellhashing.R b/tests/testthat/test-cellhashing.R index d3a1336..2d3a4c2 100644 --- a/tests/testthat/test-cellhashing.R +++ b/tests/testthat/test-cellhashing.R @@ -1,23 +1,25 @@ context("scRNAseq") tests <- list( - '282-1' = list( + '282-1' = list( input = '../testdata/cellHashing/282-1-HTO_cellHashingRawCounts.txt', htos = c(2:3, 8, 10, 12), gexBarcodeFile = '../testdata/cellHashing/282-1-whitelist.txt', - CalledCells = 6953, - Singlet = 4791, - MultiSeq = 4790, + CalledCells = 6032, + Singlet = 4084, + MultiSeq = 5860, + Discordant = 1968, Seurat = 4289, TotalRows = 8000, DoRowFilter = T ), - '283' = list( + '283' = list( input = '../testdata/cellHashing/283-cellbarcodeToHTO.calls.citeSeqCounts.txt', htos = c(2:6), gexBarcodeFile = '../testdata/cellHashing/283-validCellIndexes.csv', - CalledCells = 4970, - Singlet = 3889, - MultiSeq = 4576, + CalledCells = 4733, + Singlet = 3363, + MultiSeq = 4786, + Discordant = 1294, Seurat = 3581, TotalRows = 6027, DoRowFilter = T @@ -93,6 +95,8 @@ test_that("Cell hashing works", { expect_equal(test[['Singlet']], sum(dt$HTO_Classification == 'Singlet')) expect_equal(test[['Seurat']], sum(dt$Seurat)) expect_equal(test[['MultiSeq']], sum(dt$MultiSeq)) + expect_equal(test[['Discordant']], sum(dt$HTO == 'Discordant')) + expect_equal(test[['Discordant']], sum(dt$HTO_Classification == 'Discordant')) d <- read.table(callsFile, header = T, sep = '\t') expect_equal(test[['TotalRows']], nrow(d)) diff --git a/tests/testthat/test-genealias.R b/tests/testthat/test-genealias.R index a052d1a..b46f194 100644 --- a/tests/testthat/test-genealias.R +++ b/tests/testthat/test-genealias.R @@ -5,16 +5,16 @@ test_that("All aliases preserved", { # ENSMMUG00000008350: resolved by homolog # CD8: Already symbol, not resolved, return original homologs <- QueryEnsemblSymbolAndHumanHomologs(c('ENSMMUG00000040244', 'ENSMMUG00000008350', 'CD8'), dataset = 'mmulatta_gene_ensembl', version = '97') - expect_equal(homologs$Label, c('TRAV1-1', 'MDK', 'CD8')) + expect_equal(homologs$Label, c('TRAV1-1', 'MDK(hs)', 'CD8')) # This is a private method, but test directly anyway cdg <- OOSAP:::RenameGenesUsingCD(c('PTPR', '12345', 'DPP4', 'ENSMMUG00000040244')) expect_equal(cdg, c('PTPR', '12345', 'DPP4 (CD26,ADCP2)', 'ENSMMUG00000040244')) aliased <- AliasGeneNames(c('ENSMMUG00000040244', 'ENSMMUG00000008350', 'CD8', 'DPP4'), ensemblVersion = '97') - expect_equal(aliased, c('TRAV1-1', 'MDK', 'CD8', 'DPP4 (CD26,ADCP2)')) + expect_equal(aliased, c('TRAV1-1', 'MDK(hs)', 'CD8', 'DPP4 (CD26,ADCP2)')) #verify concat works when it returns two hits: aliased <- AliasGeneNames(c('ENSMMUG00000029821'), ensemblVersion = '97') - expect_equal(aliased, c('HSPA1A,HSPA1B')) + expect_equal(aliased, c('HSPA1A,HSPA1B(hs)')) }) \ No newline at end of file