From 7a79ab135caab74642fe5aacb95f417769d3835f Mon Sep 17 00:00:00 2001 From: rfriedman22 Date: Thu, 28 Nov 2024 14:49:55 -0800 Subject: [PATCH 1/4] Aggregate genes using the same method as cells --- R/cluster_genes.R | 46 ++++++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/R/cluster_genes.R b/R/cluster_genes.R index 7b812e6..91a83f5 100644 --- a/R/cluster_genes.R +++ b/R/cluster_genes.R @@ -479,33 +479,39 @@ aggregate_gene_expression <- function(cds, if (any(short_name_mask)) { geneids <- as.character(gene_group_df[[1]]) geneids[short_name_mask] <- row.names(fData(cds))[match( - geneids[short_name_mask], fData(cds)$gene_short_name)] + geneids[short_name_mask], fData(cds)$gene_short_name + )] gene_group_df[[1]] <- geneids } + + agg_mat <- agg_mat[gene_group_df[, 1], , drop = FALSE] + agg_mat <- my.aggregate.Matrix(agg_mat, + as.factor(gene_group_df[, 2]), + fun = gene_agg_fun) # gene_group_df = gene_group_df[row.names(fData(cds)),] # FIXME: this should allow genes to be part of multiple groups. group_by # over the second column with a call to colSum should do it. - gene_groups = unique(gene_group_df[,2]) - agg_gene_groups = lapply(gene_groups, function(gene_group){ - genes_in_group = unique(gene_group_df[gene_group_df[,2] == gene_group,1]) - gene_expr_mat = agg_mat[genes_in_group,] - if (length(dn <- dim(gene_expr_mat)) < 2L) - return(NA) - if (gene_agg_fun == "mean"){ - res = Matrix::colMeans(agg_mat[genes_in_group,]) - }else if (gene_agg_fun == "sum"){ - res = Matrix::colSums(agg_mat[genes_in_group,]) - } - return(res) - }) - - agg_mat_colnames = colnames(agg_mat) - agg_mat = do.call(rbind, agg_gene_groups) - row.names(agg_mat) = gene_groups - agg_mat = agg_mat[is.na(agg_gene_groups) == FALSE, , drop=FALSE] - colnames(agg_mat) = agg_mat_colnames + # gene_groups = unique(gene_group_df[,2]) + # agg_gene_groups = lapply(gene_groups, function(gene_group){ + # genes_in_group = unique(gene_group_df[gene_group_df[,2] == gene_group,1]) + # gene_expr_mat = agg_mat[genes_in_group,] + # if (length(dn <- dim(gene_expr_mat)) < 2L) + # return(NA) + # if (gene_agg_fun == "mean"){ + # res = Matrix::colMeans(agg_mat[genes_in_group,]) + # }else if (gene_agg_fun == "sum"){ + # res = Matrix::colSums(agg_mat[genes_in_group,]) + # } + # return(res) + # }) + + # agg_mat_colnames = colnames(agg_mat) + # agg_mat = do.call(rbind, agg_gene_groups) + # row.names(agg_mat) = gene_groups + # agg_mat = agg_mat[is.na(agg_gene_groups) == FALSE, , drop=FALSE] + # colnames(agg_mat) = agg_mat_colnames } if (is.null(cell_group_df) == FALSE){ From 2a27eab2ff18b7e5c6d81c059c9e9872af504975 Mon Sep 17 00:00:00 2001 From: rfriedman22 Date: Thu, 19 Dec 2024 16:50:04 -0800 Subject: [PATCH 2/4] Handle cases where a gene is represented by multiple groups --- R/cluster_genes.R | 42 ++++++++++++++---------------------------- 1 file changed, 14 insertions(+), 28 deletions(-) diff --git a/R/cluster_genes.R b/R/cluster_genes.R index 91a83f5..275824c 100644 --- a/R/cluster_genes.R +++ b/R/cluster_genes.R @@ -484,34 +484,20 @@ aggregate_gene_expression <- function(cds, gene_group_df[[1]] <- geneids } - agg_mat <- agg_mat[gene_group_df[, 1], , drop = FALSE] - agg_mat <- my.aggregate.Matrix(agg_mat, - as.factor(gene_group_df[, 2]), - fun = gene_agg_fun) - - # gene_group_df = gene_group_df[row.names(fData(cds)),] - - # FIXME: this should allow genes to be part of multiple groups. group_by - # over the second column with a call to colSum should do it. - # gene_groups = unique(gene_group_df[,2]) - # agg_gene_groups = lapply(gene_groups, function(gene_group){ - # genes_in_group = unique(gene_group_df[gene_group_df[,2] == gene_group,1]) - # gene_expr_mat = agg_mat[genes_in_group,] - # if (length(dn <- dim(gene_expr_mat)) < 2L) - # return(NA) - # if (gene_agg_fun == "mean"){ - # res = Matrix::colMeans(agg_mat[genes_in_group,]) - # }else if (gene_agg_fun == "sum"){ - # res = Matrix::colSums(agg_mat[genes_in_group,]) - # } - # return(res) - # }) - - # agg_mat_colnames = colnames(agg_mat) - # agg_mat = do.call(rbind, agg_gene_groups) - # row.names(agg_mat) = gene_groups - # agg_mat = agg_mat[is.na(agg_gene_groups) == FALSE, , drop=FALSE] - # colnames(agg_mat) = agg_mat_colnames + unique_gene_ids <- unique(gene_group_df[, 1]) + agg_mat <- agg_mat[unique_gene_ids, , drop = FALSE] + gene_groups <- unique(gene_group_df[, 2]) + X <- Matrix::sparseMatrix( + i = gene_group_df[, 2], + j = match(gene_group_df[, 1], unique_gene_ids), + x = 1, + dims = c(length(gene_groups), length(unique_gene_ids)), + ) + agg_mat <- X %*% agg_mat + if (gene_agg_fun == "mean") { + agg_mat <- agg_mat / Matrix::rowSums(X) + } + row.names(agg_mat) <- gene_groups } if (is.null(cell_group_df) == FALSE){ From f02532fdb4b02051560ce4b7680fac59de7dce34 Mon Sep 17 00:00:00 2001 From: rfriedman22 Date: Thu, 19 Dec 2024 16:57:53 -0800 Subject: [PATCH 3/4] Need to use `match` when setting both rows and cols of the sparse Matrix --- R/cluster_genes.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/cluster_genes.R b/R/cluster_genes.R index 275824c..1ccaf30 100644 --- a/R/cluster_genes.R +++ b/R/cluster_genes.R @@ -484,11 +484,12 @@ aggregate_gene_expression <- function(cds, gene_group_df[[1]] <- geneids } + # browser() unique_gene_ids <- unique(gene_group_df[, 1]) agg_mat <- agg_mat[unique_gene_ids, , drop = FALSE] gene_groups <- unique(gene_group_df[, 2]) X <- Matrix::sparseMatrix( - i = gene_group_df[, 2], + i = match(gene_group_df[, 2], gene_groups), j = match(gene_group_df[, 1], unique_gene_ids), x = 1, dims = c(length(gene_groups), length(unique_gene_ids)), From b04f2a8d73633dafba04b70d0448b545060bf9b6 Mon Sep 17 00:00:00 2001 From: rfriedman22 Date: Thu, 19 Dec 2024 16:58:39 -0800 Subject: [PATCH 4/4] Remove a browser call --- R/cluster_genes.R | 1 - 1 file changed, 1 deletion(-) diff --git a/R/cluster_genes.R b/R/cluster_genes.R index 1ccaf30..06c0c56 100644 --- a/R/cluster_genes.R +++ b/R/cluster_genes.R @@ -484,7 +484,6 @@ aggregate_gene_expression <- function(cds, gene_group_df[[1]] <- geneids } - # browser() unique_gene_ids <- unique(gene_group_df[, 1]) agg_mat <- agg_mat[unique_gene_ids, , drop = FALSE] gene_groups <- unique(gene_group_df[, 2])