Skip to content

Commit

Permalink
Improve cell hashing reports (#121)
Browse files Browse the repository at this point in the history
* Add more plots to cell hashing output
* Bugfix multiseq normalization
  • Loading branch information
bbimber authored Feb 22, 2020
1 parent 232ba7c commit 5f024d8
Show file tree
Hide file tree
Showing 10 changed files with 92 additions and 45 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,9 @@ Remotes:
bioc::release/SingleR
RoxygenNote: 7.0.2
Collate:
'Utils.R'
'LabKeySettings.R'
'CellHashing.R'
'Utils.R'
'Classification_Caret.R'
'Classification_Garnett.R'
'Classification_MCE.R'
Expand Down
65 changes: 49 additions & 16 deletions R/CellHashing.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#' @include LabKeySettings.R
#' @include Utils.R
#' @import Seurat
#' @import Rlabkey

Expand Down Expand Up @@ -296,6 +297,7 @@ GenerateQcPlots <- function(barcodeData){
#boxplot per HTO:
barcodeMatrix <- as.matrix(barcodeData)
melted <- setNames(reshape2::melt(barcodeMatrix), c('HTO', 'CellBarcode', 'Count'))
melted$HTO <- naturalsort::naturalfactor(melted$HTO)
print(ggplot(melted, aes(x = HTO, y = Count)) +
geom_boxplot() +
xlab('HTO') +
Expand All @@ -304,6 +306,23 @@ GenerateQcPlots <- function(barcodeData){
theme(axis.text.x = element_text(angle = 90, hjust = 1))
)

print(ggplot(melted, aes(x = Count, color = HTO)) +
geom_density() +
xlab('Count/Cell') +
ylab('Density') +
ggtitle('HTO Counts Per Cell') +
facet_grid(HTO ~ ., scales = 'free')
)

melted$LogCount <- log10(melted$Count + 0.5)
print(ggplot(melted, aes(x = LogCount, color = HTO)) +
geom_density() +
xlab('log10(Count)/Cell') +
ylab('Density') +
ggtitle('Log10 HTO Counts Per Cell') +
facet_grid(HTO ~ ., scales = 'free')
)

melted$Count <- melted$Count + 0.5
print(ggplot(melted, aes(x = HTO, y = Count)) +
geom_boxplot() +
Expand Down Expand Up @@ -488,7 +507,7 @@ DebugDemux <- function(seuratObj, assay = 'HTO', reportKmeans = FALSE) {
Idents(object = seuratObj, cells = names(x = init.clusters$clustering), drop = TRUE) <- init.clusters$clustering

# Calculate tSNE embeddings with a distance matrix
perplexity <- .InferPerplexity(seuratObj, 100)
perplexity <- .InferPerplexityFromSeuratObj(seuratObj, 100)
seuratObj[['hto_tsne']] <- RunTSNE(dist(t(data)), assay = assay, perplexity = perplexity)
P <- DimPlot(seuratObj, reduction = 'hto_tsne', label = TRUE)
P <- P + ggtitle('Clusters: clara')
Expand Down Expand Up @@ -549,11 +568,15 @@ DoHtoDemux <- function(seuratObj, positive.quantile = 0.99, label = 'Seurat HTOD
#' @description A description
#' @return A modified Seurat object.
#' @import data.table
GenerateCellHashCallsMultiSeq <- function(barcodeData) {
GenerateCellHashCallsMultiSeq <- function(barcodeData, method = 'seurat') {
seuratObj <- CreateSeuratObject(barcodeData, assay = 'HTO')

tryCatch({
seuratObj <- DoMULTIseqDemux(seuratObj)
if (method == 'seurat') {
seuratObj <- DoMULTIseqDemux(seuratObj)
} else {
stop(pate0('Unknown method: ', method))
}

return(data.table(Barcode = as.factor(colnames(seuratObj)), HTO_classification = seuratObj$MULTI_ID, HTO_classification.global = seuratObj$MULTI_classification.global, key = c('Barcode')))
}, error = function(e){
Expand Down Expand Up @@ -585,32 +608,33 @@ GenerateCellHashingCalls <- function(barcodeData, positive.quantile = 0.99, atte
return(dt)
}

#' @title A Title
#' @title Perform HTO classification using Seurat's implementation of MULTI-seq classification
#'
#' @description A description
#' @description Perform HTO classification using Seurat's implementation of MULTI-seq classification
#' @param seuratObj, A Seurat object.
#' @return A modified Seurat object.
DoMULTIseqDemux <- function(seuratObj, assay = 'HTO', autoThresh = TRUE, quantile = NULL, maxiter = 20, qrange = seq(from = 0.2, to = 0.95, by = 0.05)) {

## Normalize Data: Log2 Transform, mean-center
counts <- GetAssayData(seuratObj, assay = assay, slot = 'counts')
log2Scaled <- as.data.frame(log2(counts))
for (i in 1:ncol(counts)) {
log2Scaled <- as.data.frame(log2(Matrix::t(counts)))
for (i in 1:ncol(log2Scaled)) {
ind <- which(is.finite(log2Scaled[,i]) == FALSE)
log2Scaled[ind,i] <- 0
log2Scaled[,i] <- log2Scaled[,i]-mean(log2Scaled[,i])
}
seuratObjMS <- CreateSeuratObject(counts, assay = 'MultiSeq')
seuratObjMS[['MultiSeq']]@data <- as.matrix(log2Scaled)
seuratObjMS[['MultiSeq']]@data <- Matrix::t(as.matrix(log2Scaled))

seuratObjMS <- MULTIseqDemux(seuratObjMS, assay = "MultiSeq", quantile = quantile, verbose = TRUE, autoThresh = autoThresh, maxiter = maxiter, qrange = qrange)

seuratObj$MULTI_ID <- as.character(seuratObjMS$MULTI_ID)
seuratObj$MULTI_classification.global <- as.character(seuratObjMS$MULTI_ID)
seuratObj$MULTI_classification.global[!(seuratObjMS$MULTI_ID %in% c('Negative', 'Doublet'))] <- 'Singlet'
seuratObj$MULTI_classification.global <- as.factor(seuratObj$MULTI_classification.global)
seuratObjMS$MULTI_classification.global <- as.character(seuratObjMS$MULTI_ID)
seuratObjMS$MULTI_classification.global[!(seuratObjMS$MULTI_ID %in% c('Negative', 'Doublet'))] <- 'Singlet'
seuratObjMS$MULTI_classification.global <- as.factor(seuratObjMS$MULTI_classification.global)

HtoSummary(seuratObj, label = 'MULTI-SEQ', htoClassificationField = 'MULTI_ID', globalClassificationField = 'MULTI_classification.global')
HtoSummary(seuratObjMS, label = 'MULTI-SEQ', htoClassificationField = 'MULTI_ID', globalClassificationField = 'MULTI_classification.global', assay = 'MultiSeq')

seuratObj$MULTI_ID <- as.character(seuratObjMS$MULTI_ID)
seuratObj$MULTI_classification.global <- seuratObjMS$MULTI_classification.global

return(seuratObj)
}
Expand All @@ -635,7 +659,7 @@ HtoSummary <- function(seuratObj, htoClassificationField, globalClassificationFi
}

if (doTSNE) {
perplexity <- .InferPerplexity(seuratObj, 100)
perplexity <- .InferPerplexityFromSeuratObj(seuratObj, 100)
seuratObj[['hto_tsne']] <- RunTSNE(dist(Matrix::t(GetAssayData(seuratObj, slot = "data", assay = assay))), assay = assay, perplexity = perplexity)
print(DimPlot(seuratObj, reduction = 'hto_tsne', group.by = htoClassificationField, label = TRUE) + ggtitle(label))
print(DimPlot(seuratObj, reduction = 'hto_tsne', group.by = globalClassificationField, label = TRUE) + ggtitle(label))
Expand Down Expand Up @@ -763,6 +787,15 @@ ProcessEnsemblHtoCalls <- function(mc, sc, barcodeData,
ggtitle('Discordance By HTO Call') + ylab('Seurat') + xlab('MULTI-seq')
)

discord <- merged[!merged$HasSeuratCall | !merged$HasMultiSeqCall,]
discord <- discord[discord$HasSeuratCall | discord$HasMultiSeqCall,]
discord <- discord %>% group_by(HTO_classification.MultiSeq, HTO_classification.Seurat) %>% summarise(Count = dplyr::n())
print(qplot(x=HTO_classification.MultiSeq, y=HTO_classification.Seurat, data=discord, fill=Count, geom="tile") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_fill_gradient2(low = "blue", mid = "white", high = "red") +
ggtitle('Calls Made By Single Caller') + ylab('Seurat') + xlab('MULTI-seq')
)

# These calls should be identical, except for possibly negatives from one method that are non-negative in the other
# For the time being, accept those as correct.
merged$FinalCall <- as.character(merged$HTO_classification.MultiSeq)
Expand Down Expand Up @@ -1023,7 +1056,7 @@ DownloadAndAppendCellHashing <- function(seuratObject, outPath = '.'){
cellHashingId <- FindMatchedCellHashing(barcodePrefix)
if (is.null(cellHashingId)){
print(paste0('Cell hashing not used for prefix: ', barcodePrefix, ', skipping'))
next(seuratObject)
next
} else if (is.na(cellHashingId)){
stop(paste0('Unable to find cellHashing calls table file for prefix: ', barcodePrefix))
}
Expand Down
2 changes: 1 addition & 1 deletion R/GeneNames.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ QueryEnsemblSymbolAndHumanHomologs <- function(ensemblIds, biomart = "ensembl",
ret <- ret[order(ret$SortOrder),]

ret$Label <- as.character(ret$ensembl_gene_id)
ret$Label[!is.na(ret$hsapiens_homolog_associated_gene_name)] <- ret$hsapiens_homolog_associated_gene_name[!is.na(ret$hsapiens_homolog_associated_gene_name)]
ret$Label[!is.na(ret$hsapiens_homolog_associated_gene_name)] <- paste0(ret$hsapiens_homolog_associated_gene_name[!is.na(ret$hsapiens_homolog_associated_gene_name)], '(hs)')
ret$Label[!is.na(ret$external_gene_name)] <- ret$external_gene_name[!is.na(ret$external_gene_name)]

return(ret)
Expand Down
13 changes: 1 addition & 12 deletions R/Seurat_III.R
Original file line number Diff line number Diff line change
Expand Up @@ -828,7 +828,7 @@ FindClustersAndDimRedux <- function(seuratObj, dimsToUse = NULL, saveFile = NULL
}

if (forceReCalc | !HasStepRun(seuratObj, 'RunTSNE', forceReCalc = forceReCalc)) {
perplexity <- .InferPerplexity(seuratObj)
perplexity <- .InferPerplexityFromSeuratObj(seuratObj)
seuratObj <- RunTSNE(object = seuratObj, dims.use = dimsToUse, check_duplicates = FALSE, perplexity = perplexity)
seuratObj <- MarkStepRun(seuratObj, 'RunTSNE', saveFile)
}
Expand Down Expand Up @@ -1558,14 +1558,3 @@ PlotMarkerSet <- function(seuratObj, reduction, title, features) {
print(AddTitleToMultiPlot(FeaturePlot(seuratObj, features = featuresToPlot, reduction = reduction), title))
}

.InferPerplexity <- function(seuratObj, perplexity = 30) {
if (ncol(seuratObj) - 1 < 3 * perplexity) {
print(paste0('Perplexity is too large for the number of samples: ', ncol(seuratObj)))
perplexityNew <- floor((ncol(seuratObj) - 1) / 3)
print(paste0('lowering from ', perplexity, ' to: ', perplexityNew))
perplexity <- perplexityNew
}

return(perplexity)
}

7 changes: 6 additions & 1 deletion R/Seurat_III_Fixes.R
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ HTODemux2 <- function(
idents = levels(x = Idents(object = object))[[minNonZero]]
)]

doSkip <- FALSE
tryCatch({
fit <- suppressWarnings(fitdist(data = values.use, distr = "nbinom"))
if (plotDist) {
Expand All @@ -108,8 +109,12 @@ HTODemux2 <- function(
print(paste0('Error fitting nbinom, skipping: ', hto))
print(e)
saveRDS(values.use, file = paste0('./', hto, '.nbinom.data.rds'))
next
doSkip <- TRUE
})

if (doSkip) {
next
}
}

discrete[hto, names(x = which(x = values > cutoff))] <- 1
Expand Down
16 changes: 16 additions & 0 deletions R/Utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -340,3 +340,19 @@ find_peaks <- function (x, m = 4){
pks
}


.InferPerplexityFromSeuratObj <- function(seuratObj, perplexity = 30) {
return(.InferPerplexity(ncol(seuratObj), perplexity))
}


.InferPerplexity <- function(sampleNumber, perplexity = 30) {
if (sampleNumber - 1 < 3 * perplexity) {
print(paste0('Perplexity is too large for the number of samples: ', sampleNumber))
perplexityNew <- floor((sampleNumber - 1) / 3)
print(paste0('lowering from ', perplexity, ' to: ', perplexityNew))
perplexity <- perplexityNew
}

return(perplexity)
}
4 changes: 2 additions & 2 deletions man/DoMULTIseqDemux.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/GenerateCellHashCallsMultiSeq.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 12 additions & 8 deletions tests/testthat/test-cellhashing.R
Original file line number Diff line number Diff line change
@@ -1,23 +1,25 @@
context("scRNAseq")

tests <- list(
'282-1' = list(
'282-1' = list(
input = '../testdata/cellHashing/282-1-HTO_cellHashingRawCounts.txt',
htos = c(2:3, 8, 10, 12),
gexBarcodeFile = '../testdata/cellHashing/282-1-whitelist.txt',
CalledCells = 6953,
Singlet = 4791,
MultiSeq = 4790,
CalledCells = 6032,
Singlet = 4084,
MultiSeq = 5860,
Discordant = 1968,
Seurat = 4289,
TotalRows = 8000,
DoRowFilter = T
),
'283' = list(
'283' = list(
input = '../testdata/cellHashing/283-cellbarcodeToHTO.calls.citeSeqCounts.txt', htos = c(2:6),
gexBarcodeFile = '../testdata/cellHashing/283-validCellIndexes.csv',
CalledCells = 4970,
Singlet = 3889,
MultiSeq = 4576,
CalledCells = 4733,
Singlet = 3363,
MultiSeq = 4786,
Discordant = 1294,
Seurat = 3581,
TotalRows = 6027,
DoRowFilter = T
Expand Down Expand Up @@ -93,6 +95,8 @@ test_that("Cell hashing works", {
expect_equal(test[['Singlet']], sum(dt$HTO_Classification == 'Singlet'))
expect_equal(test[['Seurat']], sum(dt$Seurat))
expect_equal(test[['MultiSeq']], sum(dt$MultiSeq))
expect_equal(test[['Discordant']], sum(dt$HTO == 'Discordant'))
expect_equal(test[['Discordant']], sum(dt$HTO_Classification == 'Discordant'))

d <- read.table(callsFile, header = T, sep = '\t')
expect_equal(test[['TotalRows']], nrow(d))
Expand Down
6 changes: 3 additions & 3 deletions tests/testthat/test-genealias.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,16 @@ test_that("All aliases preserved", {
# ENSMMUG00000008350: resolved by homolog
# CD8: Already symbol, not resolved, return original
homologs <- QueryEnsemblSymbolAndHumanHomologs(c('ENSMMUG00000040244', 'ENSMMUG00000008350', 'CD8'), dataset = 'mmulatta_gene_ensembl', version = '97')
expect_equal(homologs$Label, c('TRAV1-1', 'MDK', 'CD8'))
expect_equal(homologs$Label, c('TRAV1-1', 'MDK(hs)', 'CD8'))

# This is a private method, but test directly anyway
cdg <- OOSAP:::RenameGenesUsingCD(c('PTPR', '12345', 'DPP4', 'ENSMMUG00000040244'))
expect_equal(cdg, c('PTPR', '12345', 'DPP4 (CD26,ADCP2)', 'ENSMMUG00000040244'))

aliased <- AliasGeneNames(c('ENSMMUG00000040244', 'ENSMMUG00000008350', 'CD8', 'DPP4'), ensemblVersion = '97')
expect_equal(aliased, c('TRAV1-1', 'MDK', 'CD8', 'DPP4 (CD26,ADCP2)'))
expect_equal(aliased, c('TRAV1-1', 'MDK(hs)', 'CD8', 'DPP4 (CD26,ADCP2)'))

#verify concat works when it returns two hits:
aliased <- AliasGeneNames(c('ENSMMUG00000029821'), ensemblVersion = '97')
expect_equal(aliased, c('HSPA1A,HSPA1B'))
expect_equal(aliased, c('HSPA1A,HSPA1B(hs)'))
})

0 comments on commit 5f024d8

Please sign in to comment.