From dc0ab8c77bab9792e4e3a161c9438ddc80df5759 Mon Sep 17 00:00:00 2001 From: Marmzy Date: Tue, 29 Nov 2022 21:26:23 +0900 Subject: [PATCH 1/2] Rewrote superenhancers cutoff (R -> Python) --- src/superenhancers/super_enhancer.py | 54 ++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 src/superenhancers/super_enhancer.py diff --git a/src/superenhancers/super_enhancer.py b/src/superenhancers/super_enhancer.py new file mode 100644 index 0000000..5728960 --- /dev/null +++ b/src/superenhancers/super_enhancer.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python + +import math +import numpy as np + +from scipy.optimize import minimize_scalar + + +def calculate_cutoff( + vector: np.ndarray +) -> np.float: + """Calculate superenhancer signal density cutoff value + + Args: + vector (np.ndarray): Array of (control corrected) stitched enhancer loci density signals + + Returns: + np.float: Density signal cutoff value to delineate superenhancers from normal enhaners + """ + + #Get the slope of the line to slide + vector.sort() + slope = (max(vector) - min(vector)) / len(vector) + + #Minimising the (control corrected) stitched enhancer loci density signal function (aka finding the tangent of the function) + x_min = math.floor(minimize_scalar(numPts_below_line, bounds=(1, len(vector)), args=(vector, slope), method="bounded")["x"]) + y_cutoff = vector[x_min] + + return y_cutoff + + +def numPts_below_line( + x: float, + vector: np.ndarray, + slope: np.float64 +) -> int: + """Stitched enhancer loci density signal function to minimise + + Args: + x (float): Signal density value to perform the calculation at + vector (np.ndarray): Signal density values vector + slope (np.float64): Slope coefficient + + Returns: + int: Number of signal density values equal to or smaller than a line going through point x + """ + + #Calculate line equation at point x (y = ax+b) + y = vector[int(x)-1] + b = y-(slope*x) + + #Calculate y values of points on the line and the number of points that are larger than those of the vector + xPts = np.array(range(1, len(vector)+1)) + return sum(vector <= (xPts*slope+b)) \ No newline at end of file From ba590ad2b23fc81efe8efef860b5694e55917392 Mon Sep 17 00:00:00 2001 From: Marmzy Date: Wed, 7 Dec 2022 21:10:33 +0900 Subject: [PATCH 2/2] Rewrote remaining R code to Python --- README.md | 4 +- ROSE.sh | 7 ++ src/ROSE_callSuper.R | 83 ---------------- src/ROSE_callSuper.py | 90 +++++++++++++++++ src/superenhancers/output.R | 140 --------------------------- src/superenhancers/output.py | 115 ++++++++++++++++++++++ src/superenhancers/super_enhancer.R | 43 -------- src/superenhancers/super_enhancer.py | 6 +- 8 files changed, 217 insertions(+), 271 deletions(-) delete mode 100644 src/ROSE_callSuper.R create mode 100644 src/ROSE_callSuper.py delete mode 100644 src/superenhancers/output.R create mode 100644 src/superenhancers/output.py delete mode 100644 src/superenhancers/super_enhancer.R diff --git a/README.md b/README.md index 0ede20d..5553193 100644 --- a/README.md +++ b/README.md @@ -2,9 +2,9 @@ ### Description -Rank Ordering of Super-Enhancers aka ROSE is a tool for identifying super-enhancers. It does this by separating super-enhancers from typical enhancers using sequencing data (.bam) given a file of previsously identified constituent enhancers (.gff). The original ROSE tool was developed by Charles Y. Lin, David A. Orlando and Brian J. Abraham at Young Lab Whitehead Institute/MIT. This new ROSE version is an attempt to update the code from Python 2 to 3, use newer versions of tools, make the code more readable to allow for better in-depth understanding of the algorithm and to increase the computational speed. +Rank Ordering of Super-Enhancers aka ROSE is a tool for identifying super-enhancers. It does this by separating super-enhancers from typical enhancers using sequencing data (.bam) given a file of previsously identified constituent enhancers (.gff). The original ROSE tool was developed by Charles Y. Lin, David A. Orlando and Brian J. Abraham at Young Lab Whitehead Institute/MIT. This new ROSE version is an attempt to update the code from Python 2 to 3, convert the few R code to Python, use newer versions of tools, make the code more readable to allow for better in-depth understanding of the algorithm and to increase the computational speed. -This version of ROSE was developed using `Python 3.8.10`, `R 4.2.1`, and `SAMtools 1.10` +This version of ROSE was developed using `Python 3.8.10`, and `SAMtools 1.10` --- diff --git a/ROSE.sh b/ROSE.sh index 8b631a1..386d10f 100644 --- a/ROSE.sh +++ b/ROSE.sh @@ -106,4 +106,11 @@ if [ "$VALUE_C" ]; then Rscript src/ROSE_callSuper.R -o ${PWD}/${VALUE_O} -d $DENSITY -g $VALUE_I -c $VALUE_C else Rscript src/ROSE_callSuper.R -o ${PWD}/${VALUE_O} -d $DENSITY -g $VALUE_I +fi + +#Identifing and visualising superenhancers +if [ "$VALUE_C" ]; then + python3 src/ROSE_callSuper.py -o ${PWD}/${VALUE_O} -d $DENSITY -g $VALUE_I -c $VALUE_C +else + python3 src/ROSE_callSuper.py -o ${PWD}/${VALUE_O} -d $DENSITY -g $VALUE_I fi \ No newline at end of file diff --git a/src/ROSE_callSuper.R b/src/ROSE_callSuper.R deleted file mode 100644 index f330a19..0000000 --- a/src/ROSE_callSuper.R +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env Rscript - -suppressMessages(library(data.table)) -suppressMessages(library(docstring)) -suppressMessages(library(optparse)) - - -get_path <- function() { - #' Get path to ROSE tool directory - #' - #' @return Path to ROSE tool directory - - path <- strsplit(getwd(), "/") - return(paste(unlist(path)[1:which(unlist(path)=="ROSE")], collapse="/")) -} - - -option_list <- list( - make_option(c("-o", "--output"), action="store", type="character", default=NULL, help="Output directory name"), - make_option(c("-d", "--density"), action="store", type="character", default=NULL, help="Stitched enhancer loci signal density file"), - make_option(c("-g", "--gff"), action="store", type="character", default=NULL, help="Constituent enhancers peak file"), - make_option(c("-c", "--control"), action="store", type="character", default=NULL, help="Control .bam file") -) - -#Parse arguments from command line -opt <- parse_args(OptionParser(option_list=option_list)) - -#Initialising variables -path <- get_path() -densityName <- tail(unlist(strsplit(opt$density, "/"))) - -#Load necessary functions -source(paste(c(path, "src/superenhancers/output.R"), collapse="/")) -source(paste(c(path, "src/superenhancers/super_enhancer.R"), collapse="/")) - -#Read stitched enhancer loci density signal file as dataframe -stitched_regions <- fread(opt$density, sep="\t") - -#Subtract background signal if control is available -if (is.null(opt$control)) { - rankBy_vector = stitched_regions[,7] -} else { - rankBy_vector = stitched_regions[,7] - stitched_regions[,8] -} - -#Setting negative values to 0 -rankBy_vector[rankBy_vector < 0] <- 0 - -#Calculating the superenhancer signal cut-off value -cutoff_options <- calculate_cutoff(as.matrix(rankBy_vector)) - -#Get superenhancers based on cut-off -superEnhancerRows <- which(rankBy_vector > cutoff_options$absolute) -typicalEnhancers <- setdiff(1:nrow(stitched_regions), superEnhancerRows) -enhancerDescription <- paste(basename(opt$gff), "Enhancers\nCreated from ", tail(unlist(strsplit(opt$density, "/")), n=1), "\nRanked by ", colnames(stitched_regions)[7], "\nUsing cutoff of ", cutoff_options$absolute, " for Super-Enhancers", sep="", collapse="") - -#Get base .gff file name -gff <- tools::file_path_sans_ext(basename(opt$gff)) - -#Creating hockey stick plot -hockey_stick_plot(rankBy_vector, cutoff_options, superEnhancerRows, opt$output, gff, opt$control, colnames(stitched_regions)[7]) - -#Rank stitched enhancer loci by background signal corrected density signal and output to .bed file -bedFileName = paste(opt$output, "/", gff, "_Enhancers_withSuper.bed", sep="") -convert_stitched_to_bed(stitched_regions, paste(colnames(stitched_regions)[7], "Enhancers"), enhancerDescription, bedFileName, densitySignal=as.matrix(rankBy_vector), splitSuper=TRUE, superRows=superEnhancerRows) - -#Output stitched enhancer loci ranking -convert_stitched_to_gateway_bed(stitched_regions, paste(opt$output, "/", gff, sep=""), splitSuper=TRUE, densitySignal=as.matrix(rankBy_vector), superRows=superEnhancerRows) - -#Select super enhancers -true_super_enhancers <- stitched_regions[superEnhancerRows,] - -#Create matrix of stitched enhancer loci rankings and super status -data = c(nrow(stitched_regions)-rank(rankBy_vector, ties.method="first")+1, ifelse(1:nrow(stitched_regions) %in% superEnhancerRows, 1, 0)) -additionalTableData <- matrix(data=data, ncol=2, nrow=nrow(stitched_regions)) -colnames(additionalTableData) <- c("enhancerRank", "isSuper") - -#Output rankings and status matrix -enhancerTableFile = paste(opt$output, "/", gff, "_AllEnhancers.table.txt", sep="") -superTableFile = paste(opt$output, "/", gff, "_SuperEnhancers.table.txt", sep="") - -writeSuperEnhancer_table(stitched_regions, enhancerDescription, enhancerTableFile, additionalData=additionalTableData) -writeSuperEnhancer_table(true_super_enhancers, enhancerDescription, superTableFile, additionalData=additionalTableData[superEnhancerRows,]) diff --git a/src/ROSE_callSuper.py b/src/ROSE_callSuper.py new file mode 100644 index 0000000..ad0e0cc --- /dev/null +++ b/src/ROSE_callSuper.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python + +import argparse +import numpy as np +import pandas as pd + +from pathlib import Path +from scipy.stats import rankdata +from superenhancers.output import convert_stitched_to_bed, hockey_stick_plot, write_enhancer_table +from superenhancers.super_enhancer import calculate_cutoff +from utils.file_helper import check_file, check_path + + +def parseArgs() -> argparse.Namespace: + """Parse arguments from CLI + + Returns: + argparse.Namespace: Argparse space containing parsed arguments + """ + + parser = argparse.ArgumentParser(description="") + + #Required arguments + parser.add_argument("-o", "--output", type=str, help="Output directory name") + parser.add_argument("-d", "--density", type=str, help="Stitched enhancer loci signal density file") + parser.add_argument("-g", "--gff", type=str, help="File (.bed, .gff or .gtf) containing binding sites to make enhancers") + + #Optional arguments + parser.add_argument("-c", "--control", type=str, nargs="?", help="Control (.bam) file") + + + #Printing arguments to the command line + args = parser.parse_args() + + print("Called with args:") + print(f"{args}\n") + + return args + + +def main() -> None: + + #Parse arguments from the command line + args = parseArgs() + + #Read stitched enhancer loci density signal file as dataframe + stitched_regions = pd.read_csv(check_file(args.density), sep="\t") + + #Subtract control signal if control is available + if args.control: + rankBy_vector = stitched_regions.iloc[:, 6] - stitched_regions.iloc[:, 7] + else: + rankBy_vector = stitched_regions.iloc[:, 6] + + #Setting negative values to 0 + rankBy_vector[rankBy_vector < 0] = 0 + + #Calculate the superenhancer density signal cut-off value + y_cutoff = calculate_cutoff(np.asarray(rankBy_vector.copy())) + + #Get superenhancers based on their density signal + superEnhancerRows = np.where(rankBy_vector > y_cutoff)[0] + + #Create output file header + enhancerDescription = f"{Path(args.gff).name} Enhancers\nCreated from {Path(args.density).name}" + enhancerDescription += f"\nRanked by {stitched_regions.columns[6]}\nUsing cutoff of {y_cutoff} for Super-Enhancers" + + #Creating hockey stick plot + hockey_stick_plot(np.asarray(rankBy_vector), y_cutoff, superEnhancerRows, args.output, args.gff, args.control, stitched_regions.columns[6]) + + #Rank stitched enhancer loci by control corrected density signal and output to .bed file + bedName = str(Path(args.output, str(Path(args.gff).stem) + "_enhancers_withSuper.bed")) + convert_stitched_to_bed(stitched_regions, enhancerDescription, bedName, np.asarray(rankBy_vector), superEnhancerRows) + + #Calculate stitched enhancer loci rankings and super status + enhancer_rank = len(stitched_regions)-rankdata(rankBy_vector, method="ordinal")+1 + super_status = [1 if sr in superEnhancerRows else 0 for sr in range(0, len(stitched_regions))] + additional_data = pd.DataFrame({"enhancerRank": enhancer_rank, "isSuper": super_status}) + + #Output rankings and status dataframe + enhancer_file = check_path(Path(args.output, f"{str(Path(args.gff).stem)}_AllEnhancers.table.txt")) + write_enhancer_table(stitched_regions, enhancerDescription, enhancer_file, additional_data) + + super_file = check_path(Path(args.output, f"{str(Path(args.gff).stem)}_SuperEnhancers.table.txt")) + write_enhancer_table(stitched_regions.iloc[superEnhancerRows, :], enhancerDescription, super_file, additional_data.iloc[superEnhancerRows, :]) + + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/superenhancers/output.R b/src/superenhancers/output.R deleted file mode 100644 index 69bb8d2..0000000 --- a/src/superenhancers/output.R +++ /dev/null @@ -1,140 +0,0 @@ -#!/usr/bin/env Rscript - -suppressMessages(library(docstring)) - - -convert_stitched_to_bed <- function(stitchedRegions, trackName, trackDescription, output, splitSuper=TRUE, densitySignal=c(), superRows=c()) { - #' Create a .bed file ranking stitched enhancer loci by corrected density signal - #' - #' @param stitchedRegions list Stitched enhancer loci signal density dataframe - #' @param trackName character Target .bam file - #' @param trackDescription character Analysis description - #' @param output character Output file name - #' @param splitSuper logical Boolean whether to add a superenhancers only section to the .bed file - #' @param densitySignal double Background signal corrected density signal values - #' @param superRows integer Vector of superenancer indices - #' - #' @return .bed file containing stitched enhancer loci information and their ranking - - #Create output matrix - outMatrix <- matrix(data="", ncol=4+ifelse(nrow(densitySignal)==nrow(stitchedRegions),1,0), nrow=nrow(stitchedRegions)) - outMatrix[,1] <- as.character(stitchedRegions$CHROM) - outMatrix[,2] <- as.character(stitchedRegions$START) - outMatrix[,3] <- as.character(stitchedRegions$STOP) - outMatrix[,4] <- as.character(stitchedRegions$REGION_ID) - - #Rank density signal-corrected stitched enhancer loci from smallest to largest - if (nrow(densitySignal) == nrow(stitchedRegions)) { - densitySignal <- rank(densitySignal, ties.method="first") - densitySignal <- length(densitySignal)-densitySignal+1 - outMatrix[,5] <- as.character(densitySignal) - } - - #Create description track for output .bed file - trackDescription <- paste(trackDescription, "\nCreated on ", format(Sys.time(), "%b %d %Y"), collapse="", sep="") - trackDescription <- gsub("\n", "\t", trackDescription) - tName <- gsub(" ", "_", trackName) - cat('track name="', tName, '" description="', trackDescription, '" itemRGB=On color=0,0,0\n', sep="", file=output) - fwrite(file=output, outMatrix, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE, append=TRUE) - - if (splitSuper==TRUE) { - cat("\ntrack name=\"Super_", tName, '" description="Super ', trackDescription, '" itemRGB=On color=255,0,0\n', sep="", file=output, append=TRUE) - fwrite(file=output, outMatrix[superRows,], sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE, append=TRUE) - } -} - - -convert_stitched_to_gateway_bed <- function(stitchedRegions, output, splitSuper=TRUE, densitySignal=c(), superRows=c()) { - #' Create a .bed file ranking stitched enhancer loci by corrected density signal - #' - #' @param stitchedRegions list Stitched enhancer loci signal density dataframe - #' @param output character Output file name - #' @param splitSuper logical Boolean whether to add a superenhancers only section to the .bed file - #' @param densitySignal double Background signal corrected density signal values - #' @param superRows integer Vector of superenancer indices - #' - #' @return .bed file containing stitched enhancer loci information and their ranking - - #Create output matrix - outMatrix <- matrix(data="", ncol=6, nrow=nrow(stitchedRegions)) - outMatrix[,1] <- as.character(stitchedRegions$CHROM) - outMatrix[,2] <- as.character(stitchedRegions$START) - outMatrix[,3] <- as.character(stitchedRegions$STOP) - outMatrix[,4] <- as.character(stitchedRegions$REGION_ID) - - #Rank stitched enhancer loci corrected density signal from smallest to largest - if (nrow(densitySignal) == nrow(stitchedRegions)) { - densitySignal <- rank(densitySignal, ties.method="first") - densitySignal <- length(densitySignal)-densitySignal+1 - outMatrix[,5] <- as.character(densitySignal) - } - outMatrix[,6] <- as.character(rep(".", nrow(outMatrix))) - - #Output matrix to .bed file - outputFile1 = paste(output, "_Gateway_Enhancers.bed", sep="") - fwrite(file=outputFile1, outMatrix, sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE, append=FALSE) - - if (splitSuper==TRUE) { - outputFile2 = paste(output, "_Gateway_SuperEnhancers.bed", sep="") - fwrite(file=outputFile2, outMatrix[superRows,], sep="\t", quote=FALSE, row.names=FALSE, col.names=FALSE, append=FALSE) - } -} - - -hockey_stick_plot <- function(vector, cutoff_options, super_enhancer_rows, out_folder, density, control, bam) { - #' Visualise superenhancer and regular enhancer signals - #' - #' @param vector list Background subtracted stitched enhancer loci density signal list - #' @param cutoff_options list Superenhancer signal cutoff values list - #' @param super_enhancer_rows integer Superenhancer row integers - #' @param out_folder character Output folder name - #' @param density character Stitched enahncer loci signal density file name - #' @param control character Control .bam file name - #' @param bam character Target .bam file name - #' - #' @return Hockey stick plot - - #Ordering stitched enhancer loci signal density vector from low to high - vector <- as.matrix(vector[order(vector)]) - - #Creating the hockey stick plot - png(filename=paste(out_folder, "/", density, "_Plot_points.png", sep=""), height=600, width=600) - if (is.null(control)) { - plot(1:length(vector), vector, col="red", xlab=paste(bam, "_enhancers"), ylab=paste(bam, " Signal"), pch=19, cex=2) - } else { - plot(1:length(vector), vector, col="red", xlab=paste(bam, "_enhancers"), ylab=paste(bam, " Signal - ", control), pch=19, cex=2) - } - abline(h=cutoff_options$absolute, col="grey", lty=2) - abline(v=length(vector)-length(super_enhancer_rows), col="grey", lty=2) - lines(1:length(vector), vector, lwd=4, col="red") - text(0, 0.8*max(vector), paste("Cutoff used: ", cutoff_options$absolute, "\n", "Super-Enhancers identified: ", length(super_enhancer_rows)), pos=4) - dev.off() -} - - -writeSuperEnhancer_table <- function(superEnhancer, description, outputFile, additionalData=NA) { - #' Output stitched enhancer loci regions data - #' - #' @param superEnhancer list Stitched enhancer loci data table - #' @param description character Data description - #' @param outputFile character Output file name - #' @param additionalData double Matrix of stitched enhancer loci rankings and super status - #' - #' @return Output regions data to tab-delimited file - - #Create descriptive header - description <- paste("#", description, "\nCreated on ", format(Sys.time(), "%b %d %Y"), collapse="", sep="") - description <- gsub("\n", "\n#", description) - cat(description, "\n", file=outputFile) - - #Merge stitched enhancer loci regions data with superenhancer rankings data - if (is.matrix(additionalData)) { - if (nrow(additionalData)!=nrow(superEnhancer)) { - warning("Additional data does not have the same number of rows as the number of super enhancers.\n--->>> ADDITIONAL DATA NOT INCLUDED <<<---\n") - } else { - superEnhancer <- cbind(superEnhancer, additionalData) - superEnhancer = superEnhancer[order(superEnhancer$enhancerRank),] - } - } - fwrite(file=outputFile, superEnhancer, sep="\t", quote=FALSE, col.names=TRUE, row.names=FALSE, append=TRUE) -} \ No newline at end of file diff --git a/src/superenhancers/output.py b/src/superenhancers/output.py new file mode 100644 index 0000000..db7d456 --- /dev/null +++ b/src/superenhancers/output.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd + +from datetime import datetime +from pathlib import Path +from scipy.stats import rankdata +from typing import Union +from utils.file_helper import check_path + + +def convert_stitched_to_bed( + stitchedRegions: pd.core.frame.DataFrame, + trackDescription: str, + output: str, + densitySignal: np.ndarray, + superRows: np.ndarray +) -> None: + """Create a .bed file ranking stitched enhancer loci by their (corrected) density signal + + Args: + stitchedRegions (pd.core.frame.DataFrame): Stitched enhancer loci signal density dataframe + trackDescription (str): Analysis description + output (str): Output file name + densitySignal (np.ndarray): (Control corrected) density signal values + superRows (np.ndarray): Vector of superenhancer indices + """ + + #Create output dataframe + df = stitchedRegions.loc[:, ["CHROM", "START", "STOP", "REGION_ID"]] + + #Rank density signal-corrected stitched enhancer loci from smallest to largest + if len(densitySignal) == len(stitchedRegions): + densitySignal = rankdata(densitySignal, method="ordinal") + densitySignal = len(densitySignal) - densitySignal +1 + df["RANK"] = densitySignal + + #Create a description track and output the dataframe to a .bed file + trackDescription = f"{trackDescription}\nCreated on {datetime.now().strftime('%Y-%m-%d')}".replace("\n", "\t") + with open(output, "w") as f_out: + f_out.write(f"track name='{stitchedRegions.columns[7]}_Enhancers' description='{trackDescription}' itemRGB=On color=0,0,0\n") + df.iloc[superRows, :].to_csv(f_out, sep="\t", header=False, index=False, mode="a") + + +def hockey_stick_plot( + vector: np.ndarray, + y_cutoff: np.float, + super_enhancer_rows: np.ndarray, + out: str, + gff: str, + control: Union[str, None], + bam: str +) -> None: + """Visualise stitched enhancer categorisation with a hockey-stick plot + + Args: + vector (np.ndarray): Array of density signal values + y_cutoff (np.float): Density signal cut-off value + super_enhancer_rows (np.ndarray): Array of super-enhancer row indices + out (str): Output directory name + gff (str): .gff file name + control (Union[str, None]): Control file name + bam (str): .bam file name + """ + + #Ordering the stitched enhancer loci signal density vector from low to high + vector.sort() + + #Creating the hockey stick plot + fig, ax = plt.subplots(figsize=(10, 10)) + ax.plot(list(range(len(vector))), vector, "o-", color="red") + ax.axhline(y_cutoff, color="grey", linestyle="dashed") + ax.axvline(len(vector)-len(super_enhancer_rows), color="grey", linestyle="dashed") + ax.set_xlabel(f"{bam} Stitched Enhancers") + ax.ticklabel_format(axis="y", style="sci", scilimits=(1,4)) + if control: + ax.set_ylabel(f"{bam} - {Path(control).name} Signal") + else: + ax.set_ylabel(f"{bam} Signal") + ax.set_title(f"Cut-off used: {y_cutoff}\nSuper-Enhancers identified: {len(super_enhancer_rows)}") + fig.tight_layout() + fig.savefig(check_path(Path(out, f"{Path(gff).stem}_plot_points.png"))) + + +def write_enhancer_table( + enhancer_df: pd.core.frame.DataFrame, + description: str, + output: str, + additional_data: pd.core.frame.DataFrame +) -> None: + """Output sorted stitched enhancer loci regions data + + Args: + enhancer_df (pd.core.frame.DataFrame): Stitched enhancer loci dataframe + description (str): Data description + output (str): Output file name + additional_data (pd.core.frame.DataFrame): Stitched enhancer loci rankings and super status dataframe + + Raises: + ValueError: If the lengths of the two dataframes don't match + """ + + #Creating a descriptive header + description = f"#{description}\nCreated on {datetime.now().strftime('%Y-%m-%d')}".replace("\n", "\n#") + + #Merge stitched enhancer loci regions data with superenhancer ranking data + if len(additional_data) == len(enhancer_df): + out_df = pd.concat([enhancer_df, additional_data], axis=1).sort_values("enhancerRank") + with open(output, "w") as f_out: + f_out.write(f"{description}\n") + out_df.to_csv(f_out, sep="\t", index=False, mode="a") + else: + raise ValueError("Stitched enhancer loci dataframe and additional data have differing lengths") \ No newline at end of file diff --git a/src/superenhancers/super_enhancer.R b/src/superenhancers/super_enhancer.R deleted file mode 100644 index a377d45..0000000 --- a/src/superenhancers/super_enhancer.R +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env Rscript - -suppressMessages(library(data.table)) -suppressMessages(library(docstring)) - - - calculate_cutoff <- function(vector) { - #' Calculate superenhancer signal cut-off by sliding a diagonal line until the tangent is found - #' - #' @param vector matrix Background corrected stitched enhancer signal values matrix - #' - #' @return List of superenhancer signal cut-off values - - - #Get the slope of the line to slide - vector <- vector[order(vector)] - slope <- (max(vector) - min(vector)) / length(vector) - - #Find the x-axis point where a line passing through that point has the minimum number of points below it (ie. tangent) - xPt <- floor(optimize(numPts_below_line, lower=1, upper=length(vector), vector=vector, slope=slope)$minimum) - y_cutoff <- vector[xPt] - - return(list(absolute=y_cutoff, overMedian=y_cutoff/median(vector), overMean=y_cutoff/mean(vector))) -} - - -numPts_below_line <- function(vector, slope, x) { - #' Calculate linear equation of diagonal at point [x,y] and the number of signal values below the diagonal - #' - #' @param vector matrix Background corrected stitched enhancer signal values matrix - #' @param slope double Diagonal slope - #' @param x double X point at which to calculate the linear equation - #' - #' @return Number of signal values below the diagonal at point [x,y] - - #Calculate lienar equation (y = ax+b) - y <- vector[x] - b <- y-(slope*x) - - #Calculate number of signal values below diagonal - xPts <- 1:length(vector) - return(sum(vector <= (xPts*slope+b))) -} diff --git a/src/superenhancers/super_enhancer.py b/src/superenhancers/super_enhancer.py index 5728960..ba6cf21 100644 --- a/src/superenhancers/super_enhancer.py +++ b/src/superenhancers/super_enhancer.py @@ -9,13 +9,13 @@ def calculate_cutoff( vector: np.ndarray ) -> np.float: - """Calculate superenhancer signal density cutoff value + """Calculate superenhancer signal density cut-off value Args: vector (np.ndarray): Array of (control corrected) stitched enhancer loci density signals Returns: - np.float: Density signal cutoff value to delineate superenhancers from normal enhaners + np.float: Density signal cut-off value to delineate superenhancers from normal enhaners """ #Get the slope of the line to slide @@ -24,7 +24,7 @@ def calculate_cutoff( #Minimising the (control corrected) stitched enhancer loci density signal function (aka finding the tangent of the function) x_min = math.floor(minimize_scalar(numPts_below_line, bounds=(1, len(vector)), args=(vector, slope), method="bounded")["x"]) - y_cutoff = vector[x_min] + y_cutoff = vector[x_min-1] return y_cutoff