Skip to content

Commit

Permalink
got bootstrap_null working about starting top-level function
Browse files Browse the repository at this point in the history
  • Loading branch information
DavisWeaver committed Jan 19, 2021
1 parent 339b0b3 commit 17f4ff2
Show file tree
Hide file tree
Showing 18 changed files with 478 additions and 26 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
.Rdata
.httr-oauth
.DS_Store
./test_data
7 changes: 6 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,15 @@ Imports:
tidyr,
tibble,
igraph,
RANKS,
Matrix,
ensembldb,
foreach,
doParallel,
EnsDb.Hsapiens.v79
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.1
Suggests:
testthat (>= 2.0.0)
Config/testthat/edition: 2
5 changes: 5 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,2 +1,7 @@
# Generated by roxygen2: do not edit by hand

export(bootstrap_null)
export(compute_crosstalk)
export(sparseRWR)
importFrom(foreach,"%dopar%")
importFrom(magrittr,"%>%")
65 changes: 65 additions & 0 deletions R/RandomWalkRepeats.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#' Perform random walk with repeats on a sparse matrix
#'
#' This function borrows heavily from the RWR function in the RANKS package (cite here)
#'
#' @export
#'
#' @param seed_proteins user defined seed proteins
#' @param w The adjacency matrix of a given graph in sparse format - dgCMatrix
#' @param gamma restart probability
#' @param eps maximum allowed difference between the computed probabilities at the steady state
#' @param norm if True, w is normalized by dividing each value by the column sum.
#' @param tmax the maximum number of iterations for the RWR
#'
sparseRWR <- function(w, seed_proteins, gamma = 0.6, eps = 1e-10, tmax = 1000, norm = TRUE) {

# divide the values in each column by the within-column sum
if(norm == TRUE) {
w <- norm_colsum(w)
}

##The code below is reproduced from the RWR function of the RANKS package.
#setup intial p vector
n <- nrow(w)
p0 <- p <- numeric(n)
names(p) <- names(p0) <- rownames(w)
num_seeds <- length(seed_proteins)
p0[seed_proteins] <- 1/num_seeds
p <- p0

for (t in 1:tmax) {
# pi+1 = (1 − d)Wpi + dr
pold <- p
p <- ((1-gamma) * as.vector(pold %*% w)) + gamma * p0

#check if the exit condition has been met.
if (norm1(p-pold) < eps) {
break
}
}
return(list(p=p, seed_proteins=seed_proteins, n.iter=t));

}

#' Function to normalize adjacency matrix by dividing each value by the colsum.
#'
#' @inheritParams sparseRWR
#'
norm_colsum <- function(w) {
colsums <- Matrix::colSums(w)
w <- w/colsums
return(w)
}


#' Function that computes the norm 1 of a numeric vector
#'
#' This function is reproduced from the source code of the RANKS package (not exported).
#'
#' @param x : numeric vector
#' @return a single real value (the norm1 of the input vector)
norm1 <- function(x) {
return(sum(abs(x)));
}


18 changes: 18 additions & 0 deletions R/compute_crosstalk.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#' Identify proteins with a statistically significant relationship to user-provided seeds.
#'
#' @inheritParams bootstrap_null
#'
#' @inheritParams sparseRWR
#'
#' @inheritParams setup_init
#'
#'
#' @export

compute_crosstalk <- function(seed_proteins, ppi = "stringdb", n = 10000,
gamma=0.6, eps = 1e-10, tmax = 1000,
norm = TRUE, set_seed,
cache, seed_name = NULL,
ncores = 1) {

}
142 changes: 139 additions & 3 deletions R/create_null.R
Original file line number Diff line number Diff line change
@@ -1,10 +1,146 @@
#' Bootstrap null distribution for significance testing
#'
#' This function will generate a bootstrapped null distribution to identify
#' signficant vertices in a PPI given a set of user-defined seed proteins.
#' Bootstrapping is done by performing random walk with repeats repeatedly over "random"
#' sets of seed proteins. Degree distribution of user-provided seeds is used to inform sampling.
#'
#' @importFrom foreach %dopar%
#'
#'
#' @param ppi specifies which protein-protein interaction network will be used - currently only "stringdb" is supported
#' @param n number of random walks with repeats
#' @param set_seed integer to set random number seed - for reproducibility
#' @param ncores Number of cores to use - defaults to 1. Significant speedup can be achieved by using multiple cores for computation.
#' @param seed_name Name to give the cached null distribution - must be a character string
#'
#' @inheritDotParams sparseRWR seed_proteins gamma tmax eps norm
#' @inheritDotParams setup_init cache
#'
#' @export

bootstrap_null <- function(seed_proteins, ppi = "stringdb", n = 10000,
gamma=0.6, eps = 1e-10, tmax = 1000,
norm = TRUE, set_seed,
cache, seed_name = NULL,
ncores = 1) {

#If file was cached from a previous run - use that- else go through the whole calculation
if(file.exists(paste0(cache, "/", seed_name, "null_dist.Rda"))) {
load(file = paste0(cache, "/", seed_name, "null_dist.Rda"))
} else{

if(ppi == "biogrid") {
g <- prep_biogrid(cache = cache)
} else if (ppi == "stringdb") {
g <- prep_stringdb(cache = cache)
} else {
stop("ppi must be either 'biogrid' or 'stringdb'")
}

w <- igraph::as_adjacency_matrix(g) #sparse adjacency matrix.

#generate list of degree-similar seed protein vectors.
seeds <- match_seeds(g = g, seed_proteins = seed_proteins, n = n)

if(ncores > 1) {
cl <- parallel::makeCluster(ncores)
doParallel::registerDoParallel(cl)
null_dist <-
foreach::foreach(i = 1:n, .combine = rbind) %dopar% {
seeds_i <- unlist(seeds[[i]])
crosstalkr::sparseRWR(w = w, seed_proteins = seeds_i)[[1]]
}
parallel::stopCluster(cl)
null_dist <- tibble::as_tibble(null_dist, rownames = "run_number")
} else {
null_dist <- list()
for(i in 1:n) {
seeds_i <- unlist(seeds[[i]])
null_dist[[i]] <- sparseRWR(w = w, seed_proteins = seeds_i)[[1]]
}
null_dist <- dplyr::bind_rows(null_dist)
}

null_dist <- dist_calc(null_dist, ncores = ncores)

out <- list(null_dist, seed_proteins)

if(is.character(cache) & is.character(seed_name)) {
save(out, file = paste0(cache, "/", seed_name, "null_dist.Rda"))
} else {
message("Please provide character string designating a filepath and seed name if you would like to save these results to speed up future analysis.")
}
}
return(out)
}

#' Identify random sets of seeds with similar degree distribution to parent seed proteins
#'
#' This function will generate n character vectors of seeds to be passed to sparseRWR
#' as part of the construction of a boostrapped null distribution for significance testing.
#' @param g igraph object representing the network under study. specified by "ppi" in bootstrap_null
#'
#' @inheritParams bootstrap_null
#'

match_seeds <- function(g, seed_proteins, n, set_seed = NULL) {
if(is.numeric(set_seed)) {
set.seed(set_seed)
}
num_seeds <- length(seed_proteins)

#Add degree as a character for each seed_protein.
degree_seeds <- (degree = igraph::degree(g, v = seed_proteins))
degree_bins <- ggplot2::cut_number(degree_seeds, num_seeds)

#Identify the single number breaks from degree_bins
bins <- levels(degree_bins)

bootstrap_null <- function(ppi = "biogrid", n = 1000, seed_proteins, gamma,
eps = 1e-10, tmax = 1000, norm = TRUE) {
if(ppi == "biogrid") {
#extract the first side of each bin to pass to cut
breaks <- base::as.numeric(stringr::str_extract(bins, "\\d*(?=\\.)|\\d*(?=,)"))
degree_all <- igraph::degree(g)

degree_all <- tibble::tibble(gene = names(degree_all),
degree = degree_all,
degree_bins = cut(degree_all, breaks = breaks))


#group by breaks
degree_grouped <- dplyr::group_by(degree_all, degree_bins)

sample_seeds <- list()

for(i in 1:n) {
samp <- dplyr::slice_sample(degree_grouped, n = 1) #n = 1 because the number of groups will always be equal to the number of seeds
sample_seeds[[i]] <- samp$gene
}

return(sample_seeds)

}

#' Internal function that computes the mean/stdev for each gene from a wide-format data frame.
#'
#' This function is called by the high-level function "bootstrap_null".
#'
#' @importFrom magrittr %>%
#'
#' @param df : numeric vector
#' @return a 3-column dataframe (gene, )

dist_calc <- function(df, ncores) {
if(ncores > 1){
null_dist <- tidyr::pivot_longer(df, cols = -run_number, names_to = "gene_id", values_to = "p")
} else {
null_dist <- tidyr::pivot_longer(df, cols = tidyr::everything(), names_to = "gene_id", values_to = "p")
}
null_dist <- null_dist %>%
dplyr::group_by(gene_id) %>%
dplyr::summarise(mean_p = mean(p),
stdev_p = sd(p),
nobs = dplyr::n())

return(null_dist)

}
38 changes: 16 additions & 22 deletions R/ppi_ingest.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@
#'
#' @param cache A filepath to a folder downloaded files should be stored, inherits from user-available functions
#' @param edb ensemble database object
#' @param min_score minimum connectivity score for each edge in the network.
#' @return list containing Adjacency matrix from stringdb dataset and igraph object built from the adjacency matrix.

prep_stringdb <- function(cache = NULL, edb = EnsDb.Hsapiens.v79::EnsDb.Hsapiens.v79){
prep_stringdb <- function(cache = NULL,
edb = EnsDb.Hsapiens.v79::EnsDb.Hsapiens.v79,
min_score = NULL){

if(!file.exists(paste0(cache, "/stringdb.Rda"))) {
message("Downloading stringdb Homo Sapiens v11.0")
Expand All @@ -15,28 +18,28 @@ prep_stringdb <- function(cache = NULL, edb = EnsDb.Hsapiens.v79::EnsDb.Hsapiens
df <- dplyr::mutate(df,
protein1 = ensembl_convert(protein1, edb = edb),
protein2 = ensembl_convert(protein2, edb = edb))
#pivot to adjacency matrix
#fill in missing values with zero
adj_df <- tidyr::pivot_wider(df, names_from = protein2,
values_from = combined_score,
values_fn = max,
values_fill = 0)

#filter out nodes below a given min score
if(is.numeric(min_score)) {
df <- dplyr::filter(df, combined_score > min_score)
}


#Convert to igraph object
g <- igraph::graph_from_data_frame(df, directed = FALSE)
g <-
igraph::simplify(g, remove.multiple = TRUE, remove.loops = TRUE)

output <- list(g, adj_df)


if(!is.null(cache)) {
save(output, file = paste0(cache, "/stringdb.Rda"))
save(g, file = paste0(cache, "/stringdb.Rda"))
}
} else {
message("using cached version of stringdb Homo Sapeins v11.0")
load(file = paste0(cache, "/stringdb.Rda"))
}
return(output)
return(g)
}

#' Prepare biogrid for use in analyses
Expand Down Expand Up @@ -75,25 +78,15 @@ prep_biogrid <- function(cache = NULL) {
g <-
igraph::simplify(g, remove.multiple = TRUE, remove.loops = TRUE)

biogrid$adjacency <- 1

#Pivot to create an adjacency matrix
adj_df <- tidyr::pivot_wider(biogrid, names_from = Official.Symbol.Interactor.B,
values_from = adjacency,
values_fn = max,
values_fill = 0)

output <- list(g, adj_df)

if(!is.null(cache)) {
save(output, file = paste0(cache, "/biogrid.Rda"))
save(g, file = paste0(cache, "/biogrid.Rda"))
}

} else {
message("using cached version of biogrid v3.5.171")
load(file = paste0(cache, "/biogrid.Rda"))
}
return(output)
return(g)
}

#' Helper function for first-time use of crosstalkr package
Expand All @@ -108,3 +101,4 @@ setup_init <- function(cache = NULL) {
tmp_var2 <- prep_stringdb(cache = cache)
}


19 changes: 19 additions & 0 deletions man/bootstrap_null.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 17f4ff2

Please sign in to comment.