From 17f4ff2752249125332f3fab7a396449e4e7b154 Mon Sep 17 00:00:00 2001 From: Dave Weaver Date: Tue, 19 Jan 2021 10:12:03 -0500 Subject: [PATCH] got bootstrap_null working about starting top-level function --- .gitignore | 1 + DESCRIPTION | 7 +- NAMESPACE | 5 ++ R/RandomWalkRepeats.R | 65 ++++++++++++++++++ R/compute_crosstalk.R | 18 +++++ R/create_null.R | 142 ++++++++++++++++++++++++++++++++++++++- R/ppi_ingest.R | 38 +++++------ man/bootstrap_null.Rd | 19 ++++++ man/compute_crosstalk.Rd | 46 +++++++++++++ man/dist_calc.Rd | 17 +++++ man/ensembl_convert.Rd | 19 ++++++ man/match_seeds.Rd | 19 ++++++ man/norm1.Rd | 17 +++++ man/norm_colsum.Rd | 14 ++++ man/prep_biogrid.Rd | 17 +++++ man/prep_stringdb.Rd | 19 ++++++ man/setup_init.Rd | 17 +++++ man/sparseRWR.Rd | 24 +++++++ 18 files changed, 478 insertions(+), 26 deletions(-) create mode 100644 R/RandomWalkRepeats.R create mode 100644 R/compute_crosstalk.R create mode 100644 man/bootstrap_null.Rd create mode 100644 man/compute_crosstalk.Rd create mode 100644 man/dist_calc.Rd create mode 100644 man/ensembl_convert.Rd create mode 100644 man/match_seeds.Rd create mode 100644 man/norm1.Rd create mode 100644 man/norm_colsum.Rd create mode 100644 man/prep_biogrid.Rd create mode 100644 man/prep_stringdb.Rd create mode 100644 man/setup_init.Rd create mode 100644 man/sparseRWR.Rd diff --git a/.gitignore b/.gitignore index 565f2b6..bb5eab9 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ .Rdata .httr-oauth .DS_Store +./test_data diff --git a/DESCRIPTION b/DESCRIPTION index 985c43c..e6cb8cb 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -15,10 +15,15 @@ Imports: tidyr, tibble, igraph, - RANKS, + Matrix, ensembldb, + foreach, + doParallel, EnsDb.Hsapiens.v79 Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) RoxygenNote: 7.1.1 +Suggests: + testthat (>= 2.0.0) +Config/testthat/edition: 2 diff --git a/NAMESPACE b/NAMESPACE index 6ae9268..fe2a573 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,2 +1,7 @@ # Generated by roxygen2: do not edit by hand +export(bootstrap_null) +export(compute_crosstalk) +export(sparseRWR) +importFrom(foreach,"%dopar%") +importFrom(magrittr,"%>%") diff --git a/R/RandomWalkRepeats.R b/R/RandomWalkRepeats.R new file mode 100644 index 0000000..b17a74d --- /dev/null +++ b/R/RandomWalkRepeats.R @@ -0,0 +1,65 @@ +#' Perform random walk with repeats on a sparse matrix +#' +#' This function borrows heavily from the RWR function in the RANKS package (cite here) +#' +#' @export +#' +#' @param seed_proteins user defined seed proteins +#' @param w The adjacency matrix of a given graph in sparse format - dgCMatrix +#' @param gamma restart probability +#' @param eps maximum allowed difference between the computed probabilities at the steady state +#' @param norm if True, w is normalized by dividing each value by the column sum. +#' @param tmax the maximum number of iterations for the RWR +#' +sparseRWR <- function(w, seed_proteins, gamma = 0.6, eps = 1e-10, tmax = 1000, norm = TRUE) { + + # divide the values in each column by the within-column sum + if(norm == TRUE) { + w <- norm_colsum(w) + } + + ##The code below is reproduced from the RWR function of the RANKS package. + #setup intial p vector + n <- nrow(w) + p0 <- p <- numeric(n) + names(p) <- names(p0) <- rownames(w) + num_seeds <- length(seed_proteins) + p0[seed_proteins] <- 1/num_seeds + p <- p0 + + for (t in 1:tmax) { + # pi+1 = (1 − d)Wpi + dr + pold <- p + p <- ((1-gamma) * as.vector(pold %*% w)) + gamma * p0 + + #check if the exit condition has been met. + if (norm1(p-pold) < eps) { + break + } + } + return(list(p=p, seed_proteins=seed_proteins, n.iter=t)); + +} + +#' Function to normalize adjacency matrix by dividing each value by the colsum. +#' +#' @inheritParams sparseRWR +#' +norm_colsum <- function(w) { + colsums <- Matrix::colSums(w) + w <- w/colsums + return(w) +} + + +#' Function that computes the norm 1 of a numeric vector +#' +#' This function is reproduced from the source code of the RANKS package (not exported). +#' +#' @param x : numeric vector +#' @return a single real value (the norm1 of the input vector) +norm1 <- function(x) { + return(sum(abs(x))); +} + + diff --git a/R/compute_crosstalk.R b/R/compute_crosstalk.R new file mode 100644 index 0000000..fe5445c --- /dev/null +++ b/R/compute_crosstalk.R @@ -0,0 +1,18 @@ +#' Identify proteins with a statistically significant relationship to user-provided seeds. +#' +#' @inheritParams bootstrap_null +#' +#' @inheritParams sparseRWR +#' +#' @inheritParams setup_init +#' +#' +#' @export + +compute_crosstalk <- function(seed_proteins, ppi = "stringdb", n = 10000, + gamma=0.6, eps = 1e-10, tmax = 1000, + norm = TRUE, set_seed, + cache, seed_name = NULL, + ncores = 1) { + +} diff --git a/R/create_null.R b/R/create_null.R index d78b2b5..53a6cb2 100644 --- a/R/create_null.R +++ b/R/create_null.R @@ -1,10 +1,146 @@ #' Bootstrap null distribution for significance testing #' +#' This function will generate a bootstrapped null distribution to identify +#' signficant vertices in a PPI given a set of user-defined seed proteins. +#' Bootstrapping is done by performing random walk with repeats repeatedly over "random" +#' sets of seed proteins. Degree distribution of user-provided seeds is used to inform sampling. +#' +#' @importFrom foreach %dopar% +#' +#' +#' @param ppi specifies which protein-protein interaction network will be used - currently only "stringdb" is supported +#' @param n number of random walks with repeats +#' @param set_seed integer to set random number seed - for reproducibility +#' @param ncores Number of cores to use - defaults to 1. Significant speedup can be achieved by using multiple cores for computation. +#' @param seed_name Name to give the cached null distribution - must be a character string +#' +#' @inheritDotParams sparseRWR seed_proteins gamma tmax eps norm +#' @inheritDotParams setup_init cache +#' +#' @export + +bootstrap_null <- function(seed_proteins, ppi = "stringdb", n = 10000, + gamma=0.6, eps = 1e-10, tmax = 1000, + norm = TRUE, set_seed, + cache, seed_name = NULL, + ncores = 1) { + + #If file was cached from a previous run - use that- else go through the whole calculation + if(file.exists(paste0(cache, "/", seed_name, "null_dist.Rda"))) { + load(file = paste0(cache, "/", seed_name, "null_dist.Rda")) + } else{ + + if(ppi == "biogrid") { + g <- prep_biogrid(cache = cache) + } else if (ppi == "stringdb") { + g <- prep_stringdb(cache = cache) + } else { + stop("ppi must be either 'biogrid' or 'stringdb'") + } + + w <- igraph::as_adjacency_matrix(g) #sparse adjacency matrix. + + #generate list of degree-similar seed protein vectors. + seeds <- match_seeds(g = g, seed_proteins = seed_proteins, n = n) + + if(ncores > 1) { + cl <- parallel::makeCluster(ncores) + doParallel::registerDoParallel(cl) + null_dist <- + foreach::foreach(i = 1:n, .combine = rbind) %dopar% { + seeds_i <- unlist(seeds[[i]]) + crosstalkr::sparseRWR(w = w, seed_proteins = seeds_i)[[1]] + } + parallel::stopCluster(cl) + null_dist <- tibble::as_tibble(null_dist, rownames = "run_number") + } else { + null_dist <- list() + for(i in 1:n) { + seeds_i <- unlist(seeds[[i]]) + null_dist[[i]] <- sparseRWR(w = w, seed_proteins = seeds_i)[[1]] + } + null_dist <- dplyr::bind_rows(null_dist) + } + + null_dist <- dist_calc(null_dist, ncores = ncores) + + out <- list(null_dist, seed_proteins) + + if(is.character(cache) & is.character(seed_name)) { + save(out, file = paste0(cache, "/", seed_name, "null_dist.Rda")) + } else { + message("Please provide character string designating a filepath and seed name if you would like to save these results to speed up future analysis.") + } + } + return(out) +} + +#' Identify random sets of seeds with similar degree distribution to parent seed proteins +#' +#' This function will generate n character vectors of seeds to be passed to sparseRWR +#' as part of the construction of a boostrapped null distribution for significance testing. +#' @param g igraph object representing the network under study. specified by "ppi" in bootstrap_null +#' +#' @inheritParams bootstrap_null +#' + +match_seeds <- function(g, seed_proteins, n, set_seed = NULL) { + if(is.numeric(set_seed)) { + set.seed(set_seed) + } + num_seeds <- length(seed_proteins) + + #Add degree as a character for each seed_protein. + degree_seeds <- (degree = igraph::degree(g, v = seed_proteins)) + degree_bins <- ggplot2::cut_number(degree_seeds, num_seeds) + + #Identify the single number breaks from degree_bins + bins <- levels(degree_bins) -bootstrap_null <- function(ppi = "biogrid", n = 1000, seed_proteins, gamma, - eps = 1e-10, tmax = 1000, norm = TRUE) { - if(ppi == "biogrid") { + #extract the first side of each bin to pass to cut + breaks <- base::as.numeric(stringr::str_extract(bins, "\\d*(?=\\.)|\\d*(?=,)")) + degree_all <- igraph::degree(g) + degree_all <- tibble::tibble(gene = names(degree_all), + degree = degree_all, + degree_bins = cut(degree_all, breaks = breaks)) + + + #group by breaks + degree_grouped <- dplyr::group_by(degree_all, degree_bins) + + sample_seeds <- list() + + for(i in 1:n) { + samp <- dplyr::slice_sample(degree_grouped, n = 1) #n = 1 because the number of groups will always be equal to the number of seeds + sample_seeds[[i]] <- samp$gene } + return(sample_seeds) + +} + +#' Internal function that computes the mean/stdev for each gene from a wide-format data frame. +#' +#' This function is called by the high-level function "bootstrap_null". +#' +#' @importFrom magrittr %>% +#' +#' @param df : numeric vector +#' @return a 3-column dataframe (gene, ) + +dist_calc <- function(df, ncores) { + if(ncores > 1){ + null_dist <- tidyr::pivot_longer(df, cols = -run_number, names_to = "gene_id", values_to = "p") + } else { + null_dist <- tidyr::pivot_longer(df, cols = tidyr::everything(), names_to = "gene_id", values_to = "p") + } + null_dist <- null_dist %>% + dplyr::group_by(gene_id) %>% + dplyr::summarise(mean_p = mean(p), + stdev_p = sd(p), + nobs = dplyr::n()) + + return(null_dist) + } diff --git a/R/ppi_ingest.R b/R/ppi_ingest.R index 49c72db..1f6e71f 100644 --- a/R/ppi_ingest.R +++ b/R/ppi_ingest.R @@ -2,9 +2,12 @@ #' #' @param cache A filepath to a folder downloaded files should be stored, inherits from user-available functions #' @param edb ensemble database object +#' @param min_score minimum connectivity score for each edge in the network. #' @return list containing Adjacency matrix from stringdb dataset and igraph object built from the adjacency matrix. -prep_stringdb <- function(cache = NULL, edb = EnsDb.Hsapiens.v79::EnsDb.Hsapiens.v79){ +prep_stringdb <- function(cache = NULL, + edb = EnsDb.Hsapiens.v79::EnsDb.Hsapiens.v79, + min_score = NULL){ if(!file.exists(paste0(cache, "/stringdb.Rda"))) { message("Downloading stringdb Homo Sapiens v11.0") @@ -15,28 +18,28 @@ prep_stringdb <- function(cache = NULL, edb = EnsDb.Hsapiens.v79::EnsDb.Hsapiens df <- dplyr::mutate(df, protein1 = ensembl_convert(protein1, edb = edb), protein2 = ensembl_convert(protein2, edb = edb)) - #pivot to adjacency matrix - #fill in missing values with zero - adj_df <- tidyr::pivot_wider(df, names_from = protein2, - values_from = combined_score, - values_fn = max, - values_fill = 0) + + #filter out nodes below a given min score + if(is.numeric(min_score)) { + df <- dplyr::filter(df, combined_score > min_score) + } + #Convert to igraph object g <- igraph::graph_from_data_frame(df, directed = FALSE) g <- igraph::simplify(g, remove.multiple = TRUE, remove.loops = TRUE) - output <- list(g, adj_df) + if(!is.null(cache)) { - save(output, file = paste0(cache, "/stringdb.Rda")) + save(g, file = paste0(cache, "/stringdb.Rda")) } } else { message("using cached version of stringdb Homo Sapeins v11.0") load(file = paste0(cache, "/stringdb.Rda")) } - return(output) + return(g) } #' Prepare biogrid for use in analyses @@ -75,25 +78,15 @@ prep_biogrid <- function(cache = NULL) { g <- igraph::simplify(g, remove.multiple = TRUE, remove.loops = TRUE) - biogrid$adjacency <- 1 - - #Pivot to create an adjacency matrix - adj_df <- tidyr::pivot_wider(biogrid, names_from = Official.Symbol.Interactor.B, - values_from = adjacency, - values_fn = max, - values_fill = 0) - - output <- list(g, adj_df) - if(!is.null(cache)) { - save(output, file = paste0(cache, "/biogrid.Rda")) + save(g, file = paste0(cache, "/biogrid.Rda")) } } else { message("using cached version of biogrid v3.5.171") load(file = paste0(cache, "/biogrid.Rda")) } - return(output) + return(g) } #' Helper function for first-time use of crosstalkr package @@ -108,3 +101,4 @@ setup_init <- function(cache = NULL) { tmp_var2 <- prep_stringdb(cache = cache) } + diff --git a/man/bootstrap_null.Rd b/man/bootstrap_null.Rd new file mode 100644 index 0000000..ff95631 --- /dev/null +++ b/man/bootstrap_null.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_null.R +\name{bootstrap_null} +\alias{bootstrap_null} +\title{Bootstrap null distribution for significance testing} +\usage{ +bootstrap_null( + ppi = "biogrid", + n = 1000, + seed_proteins, + gamma, + eps = 1e-10, + tmax = 1000, + norm = TRUE +) +} +\description{ +Bootstrap null distribution for significance testing +} diff --git a/man/compute_crosstalk.Rd b/man/compute_crosstalk.Rd new file mode 100644 index 0000000..93f1c64 --- /dev/null +++ b/man/compute_crosstalk.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/compute_crosstalk.R +\name{compute_crosstalk} +\alias{compute_crosstalk} +\title{Identify proteins with a statistically significant relationship to user-provided seeds.} +\usage{ +compute_crosstalk( + seed_proteins, + ppi = "stringdb", + n = 10000, + gamma = 0.6, + eps = 1e-10, + tmax = 1000, + norm = TRUE, + set_seed, + cache, + seed_name = NULL, + ncores = 1 +) +} +\arguments{ +\item{seed_proteins}{user defined seed proteins} + +\item{ppi}{specifies which protein-protein interaction network will be used - currently only "stringdb" is supported} + +\item{n}{number of random walks with repeats} + +\item{gamma}{restart probability} + +\item{eps}{maximum allowed difference between the computed probabilities at the steady state} + +\item{tmax}{the maximum number of iterations for the RWR} + +\item{norm}{if True, w is normalized by dividing each value by the column sum.} + +\item{set_seed}{integer to set random number seed - for reproducibility} + +\item{cache}{A filepath to a folder downloaded files should be stored, inherits from user-available functions} + +\item{seed_name}{Name to give the cached null distribution - must be a character string} + +\item{ncores}{= 1} +} +\description{ +Identify proteins with a statistically significant relationship to user-provided seeds. +} diff --git a/man/dist_calc.Rd b/man/dist_calc.Rd new file mode 100644 index 0000000..f500e4f --- /dev/null +++ b/man/dist_calc.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_null.R +\name{dist_calc} +\alias{dist_calc} +\title{Internal function that computes the mean/stdev for each gene from a wide-format data frame.} +\usage{ +dist_calc(df, ncores) +} +\arguments{ +\item{df}{: numeric vector} +} +\value{ +a 3-column dataframe (gene, ) +} +\description{ +This function is called by the high-level function "bootstrap_null". +} diff --git a/man/ensembl_convert.Rd b/man/ensembl_convert.Rd new file mode 100644 index 0000000..5d72fd2 --- /dev/null +++ b/man/ensembl_convert.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{ensembl_convert} +\alias{ensembl_convert} +\title{Convert from ensembl.gene to gene.symbol} +\usage{ +ensembl_convert(x, edb = EnsDb.Hsapiens.v79::EnsDb.Hsapiens.v79) +} +\arguments{ +\item{x}{vector of ensemble.gene ids} + +\item{edb}{ensemble database object} +} +\value{ +vector of gene.symbol +} +\description{ +Convert from ensembl.gene to gene.symbol +} diff --git a/man/match_seeds.Rd b/man/match_seeds.Rd new file mode 100644 index 0000000..2f20ebc --- /dev/null +++ b/man/match_seeds.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/create_null.R +\name{match_seeds} +\alias{match_seeds} +\title{Identify random sets of seeds with similar degree distribution to parent seed proteins} +\usage{ +match_seeds(g, seed_proteins, n, set_seed = NULL) +} +\arguments{ +\item{g}{igraph object representing the network under study. specified by "ppi" in bootstrap_null} + +\item{n}{number of random walks with repeats} + +\item{set_seed}{integer to set random number seed - for reproducibility} +} +\description{ +This function will generate n character vectors of seeds to be passed to sparseRWR +as part of the construction of a boostrapped null distribution for significance testing. +} diff --git a/man/norm1.Rd b/man/norm1.Rd new file mode 100644 index 0000000..5d915a1 --- /dev/null +++ b/man/norm1.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RandomWalkRepeats.R +\name{norm1} +\alias{norm1} +\title{Function that computes the norm 1 of a numeric vector} +\usage{ +norm1(x) +} +\arguments{ +\item{x}{: numeric vector} +} +\value{ +a single real value (the norm1 of the input vector) +} +\description{ +This function is reproduced from the source code of the RANKS package (not exported). +} diff --git a/man/norm_colsum.Rd b/man/norm_colsum.Rd new file mode 100644 index 0000000..890bdeb --- /dev/null +++ b/man/norm_colsum.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RandomWalkRepeats.R +\name{norm_colsum} +\alias{norm_colsum} +\title{Function to normalize adjacency matrix by dividing each value by the colsum.} +\usage{ +norm_colsum(w) +} +\arguments{ +\item{w}{The adjacency matrix of a given graph in sparse format - dgCMatrix} +} +\description{ +Function to normalize adjacency matrix by dividing each value by the colsum. +} diff --git a/man/prep_biogrid.Rd b/man/prep_biogrid.Rd new file mode 100644 index 0000000..3792133 --- /dev/null +++ b/man/prep_biogrid.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ppi_ingest.R +\name{prep_biogrid} +\alias{prep_biogrid} +\title{Prepare biogrid for use in analyses} +\usage{ +prep_biogrid(cache = NULL) +} +\arguments{ +\item{cache}{A filepath to a folder downloaded files should be stored, inherits from user-available functions} +} +\value{ +list containing Adjacency matrix from stringdb dataset and igraph object built from the adjacency matrix. +} +\description{ +Prepare biogrid for use in analyses +} diff --git a/man/prep_stringdb.Rd b/man/prep_stringdb.Rd new file mode 100644 index 0000000..ee3d331 --- /dev/null +++ b/man/prep_stringdb.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ppi_ingest.R +\name{prep_stringdb} +\alias{prep_stringdb} +\title{Prepare Stringdb for use in analyses} +\usage{ +prep_stringdb(cache = NULL, edb = EnsDb.Hsapiens.v79::EnsDb.Hsapiens.v79) +} +\arguments{ +\item{cache}{A filepath to a folder downloaded files should be stored, inherits from user-available functions} + +\item{edb}{ensemble database object} +} +\value{ +list containing Adjacency matrix from stringdb dataset and igraph object built from the adjacency matrix. +} +\description{ +Prepare Stringdb for use in analyses +} diff --git a/man/setup_init.Rd b/man/setup_init.Rd new file mode 100644 index 0000000..a14b87b --- /dev/null +++ b/man/setup_init.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ppi_ingest.R +\name{setup_init} +\alias{setup_init} +\title{Helper function for first-time use of crosstalkr package} +\usage{ +setup_init(cache = NULL) +} +\arguments{ +\item{cache}{A filepath to a folder downloaded files should be stored, inherits from user-available functions} +} +\value{ +directory on users computer containing the different adjacency matrices for future use. +} +\description{ +Helper function for first-time use of crosstalkr package +} diff --git a/man/sparseRWR.Rd b/man/sparseRWR.Rd new file mode 100644 index 0000000..fb50864 --- /dev/null +++ b/man/sparseRWR.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RandomWalkRepeats.R +\name{sparseRWR} +\alias{sparseRWR} +\title{Perform random walk with repeats on a sparse matrix} +\usage{ +sparseRWR(w, seed_proteins, gamma = 0.6, eps = 1e-10, tmax = 1000, norm = TRUE) +} +\arguments{ +\item{w}{The adjacency matrix of a given graph in sparse format - dgCMatrix} + +\item{seed_proteins}{user defined seed proteins} + +\item{gamma}{restart probability} + +\item{eps}{maximum allowed difference between the computed probabilities at the steady state} + +\item{tmax}{the maximum number of iterations for the RWR} + +\item{norm}{if True, w is normalized by dividing each value by the column sum.} +} +\description{ +This function borrows heavily from the RWR function in the RANKS package (cite here) +}