From f65a45cba6d5db3648ba546563fea6a86480350f Mon Sep 17 00:00:00 2001 From: Donghoon Lee Date: Thu, 8 Aug 2019 11:18:43 -0400 Subject: [PATCH 1/3] cleaned arguments --- starrpeaker/3_procBam.py | 5 +---- starrpeaker/core.py | 15 ++++----------- 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/starrpeaker/3_procBam.py b/starrpeaker/3_procBam.py index ef6e448..79d1cca 100644 --- a/starrpeaker/3_procBam.py +++ b/starrpeaker/3_procBam.py @@ -25,7 +25,6 @@ ### optional args parser.add_argument('--min', help='Minimum Insert Size', required=False, default=100) parser.add_argument('--max', help='Maximum Insert Size', required=False, default=1000) -parser.add_argument('--pseudocount', help='Pseudocount for Input Normalization', required=False, default=1) args = parser.parse_args() @@ -34,6 +33,4 @@ chromSize=args.chromsize, fileOut=args.prefix + ".bam.bct", minSize=args.min, - maxSize=args.max, - normalize=True, - pseudocount=args.pseudocount) + maxSize=args.max) diff --git a/starrpeaker/core.py b/starrpeaker/core.py index 2e0737b..e2e3e4a 100644 --- a/starrpeaker/core.py +++ b/starrpeaker/core.py @@ -115,7 +115,7 @@ def count_total_proper_templates(bam, minSize, maxSize): return proper_template_count -def proc_bam(bamFiles, bedFile, chromSize, fileOut, minSize, maxSize, normalize=False, pseudocount=1): +def proc_bam(bamFiles, bedFile, chromSize, fileOut, minSize, maxSize): ''' Args: @@ -125,8 +125,6 @@ def proc_bam(bamFiles, bedFile, chromSize, fileOut, minSize, maxSize, normalize= fileOut: output file minSize: minimum size of fragment insert to consider maxSize: maximum size of fragment insert to consider - normalize: if True, normalized input count is added to additional column - pseudocount: pseudocount for input normalization Returns: writes bin count output file @@ -238,14 +236,9 @@ def proc_bam(bamFiles, bedFile, chromSize, fileOut, minSize, maxSize, normalize= bdg2bw(bdgFile=fileOut + "." + str(j) + ".bdg", bwFile=fileOut + "." + str(j) + ".bw", chromSize=chromSize) safe_remove(fileOut + "." + str(j) + ".bdg") - if normalize: - ### normalize input count - normalized_input = mat[:, 0] * (tct[1] / tct[0]) - # nonzero = normalized_input != 0 - # normalized_input[nonzero] += float(pseudocount) - np.savetxt(fileOut, np.concatenate((mat, normalized_input.reshape(-1, 1)), axis=1), fmt='%.5f', delimiter="\t") - else: - np.savetxt(fileOut, mat, fmt='%i', delimiter="\t") + ### normalize input count, normalized input count is added to additional column + normalized_input = mat[:, 0] * (tct[1] / tct[0]) + np.savetxt(fileOut, np.concatenate((mat, normalized_input.reshape(-1, 1)), axis=1), fmt='%.5f', delimiter="\t") del a, mat, tct, normalized_input print("[%s] Done" % (timestamp())) From 343edfa9c5ad7ce99425140305c55d37ba064025 Mon Sep 17 00:00:00 2001 From: Donghoon Lee Date: Thu, 8 Aug 2019 12:27:08 -0400 Subject: [PATCH 2/3] argument clean up --- starrpeaker/1_makeBin.py | 10 ++++---- starrpeaker/2_procCov.py | 7 +++--- starrpeaker/3_procBam.py | 7 +++--- starrpeaker/4_callPeak.py | 22 +++++++---------- starrpeaker/core.py | 49 ++++++++++++-------------------------- starrpeaker/starrpeaker.py | 23 ++++++++---------- 6 files changed, 45 insertions(+), 73 deletions(-) diff --git a/starrpeaker/1_makeBin.py b/starrpeaker/1_makeBin.py index 6622d00..3d1ebfb 100644 --- a/starrpeaker/1_makeBin.py +++ b/starrpeaker/1_makeBin.py @@ -16,13 +16,13 @@ parser = argparse.ArgumentParser(description='Make Bin in BED Format') ### required args -parser.add_argument('-p', '--prefix', help='Output File Prefix', required=True) -parser.add_argument('-c', '--chromsize', help='Chrom Sizes', required=True) -parser.add_argument('-x', '--blacklist', help='Blacklist Region in BED format', required=True) +parser.add_argument('--prefix', help='Output File Prefix', required=True) +parser.add_argument('--chromsize', help='Chrom Sizes', required=True) +parser.add_argument('--blacklist', help='Blacklist Region in BED format', required=True) ### optional args -parser.add_argument('-l', '--length', help='Bin Length', required=False, default=500) -parser.add_argument('-s', '--step', help='Step Size', required=False, default=100) +parser.add_argument('--length', help='Bin Length', required=False, default=500) +parser.add_argument('--step', help='Step Size', required=False, default=100) args = parser.parse_args() diff --git a/starrpeaker/2_procCov.py b/starrpeaker/2_procCov.py index 7e702a1..90d69f9 100644 --- a/starrpeaker/2_procCov.py +++ b/starrpeaker/2_procCov.py @@ -17,12 +17,11 @@ ### required args -parser.add_argument('-p', '--prefix', help='Output File Prefix', required=True) -parser.add_argument('-c', '--cov', help='BigWig Files as Covariates', nargs='+', required=True) -parser.add_argument('-b', '--bed', help='Bin BED File', required=True) +parser.add_argument('--prefix', help='Output File Prefix', required=True) +parser.add_argument('--cov', help='BigWig Files as Covariates', nargs='+', required=True) args = parser.parse_args() if __name__ == "__main__": core.proc_cov(bwFiles=args.cov, - bedFile=args.bed, + bedFile=args.prefix + ".bin.bed", fileOut=args.prefix + ".cov.tsv") diff --git a/starrpeaker/3_procBam.py b/starrpeaker/3_procBam.py index 79d1cca..b63e88d 100644 --- a/starrpeaker/3_procBam.py +++ b/starrpeaker/3_procBam.py @@ -16,9 +16,8 @@ parser = argparse.ArgumentParser(description='Process BAM File(s)') ### required args -parser.add_argument('-p', '--prefix', help='Output File Prefix', required=True) -parser.add_argument('-c', '--chromsize', help='Chrom Sizes', required=True) -parser.add_argument('-b', '--bed', help='Bin BED File', required=True) +parser.add_argument('--prefix', help='Output File Prefix', required=True) +parser.add_argument('--chromsize', help='Chrom Sizes', required=True) parser.add_argument('-i', '--input', help='STARR-seq Input BAM File', required=True) parser.add_argument('-o', '--output', help='STARR-seq Output BAM File', required=True) @@ -29,7 +28,7 @@ args = parser.parse_args() if __name__ == "__main__": core.proc_bam(bamFiles=[args.input, args.output], - bedFile=args.bed, + bedFile=args.prefix + ".bin.bed", chromSize=args.chromsize, fileOut=args.prefix + ".bam.bct", minSize=args.min, diff --git a/starrpeaker/4_callPeak.py b/starrpeaker/4_callPeak.py index d02a962..b9a2486 100644 --- a/starrpeaker/4_callPeak.py +++ b/starrpeaker/4_callPeak.py @@ -16,25 +16,21 @@ parser = argparse.ArgumentParser(description='Call Peaks') ### required args -parser.add_argument('-p', '--prefix', help='Output File Prefix', required=True) -parser.add_argument('--bed', help='Bin BED File', required=True) -parser.add_argument('--bct', help='Bincount BCT File', required=True) -parser.add_argument('--cov', help='Covariate File', required=True) -parser.add_argument('--bw', help='STARR-seq Output Coverage BigWig File', required=True) -parser.add_argument('-c', '--chromsize', help='Chrom Sizes', required=True) +parser.add_argument('--prefix', help='Output File Prefix', required=True) +parser.add_argument('--chromsize', help='Chrom Sizes', required=True) ### optional args -parser.add_argument('-t', '--threshold', help='Adjusted P-value Threshold', required=False, default=0.05) -parser.add_argument('-q', '--minquantile', help='Minimum Input Quantile', required=False, default=0.2) -parser.add_argument('-m', '--mode', help='Mode', required=False, default=1) +parser.add_argument('--threshold', help='Adjusted P-value Threshold', required=False, default=0.05) +parser.add_argument('--minquantile', help='Minimum Input Quantile', required=False, default=0.2) +parser.add_argument('--mode', help='Mode', required=False, default=1) args = parser.parse_args() if __name__ == "__main__": core.call_peak(prefix=args.prefix, - bedFile=args.bed, - bctFile=args.bct, - covFile=args.cov, - bwFile=args.bw, + bedFile=args.prefix + ".bin.bed", + bctFile=args.prefix + ".bam.bct", + covFile=args.prefix + ".cov.tsv", + bwFile=args.prefix + ".bam.bct.1.bw", chromSize=args.chromsize, threshold=args.threshold, minInputQuantile=args.minquantile, diff --git a/starrpeaker/core.py b/starrpeaker/core.py index e2e3e4a..c6cadde 100644 --- a/starrpeaker/core.py +++ b/starrpeaker/core.py @@ -244,7 +244,7 @@ def proc_bam(bamFiles, bedFile, chromSize, fileOut, minSize, maxSize): print("[%s] Done" % (timestamp())) -def proc_bam_readstart(bamFiles, bedFile, chromSize, fileOut, normalize=False, pseudocount=1): +def proc_bam_readstart(bamFiles, bedFile, chromSize, fileOut): print("[%s] Counting template depth per bin %s" % (timestamp(), bedFile)) ### initialize numpy array @@ -333,14 +333,9 @@ def proc_bam_readstart(bamFiles, bedFile, chromSize, fileOut, normalize=False, p safe_remove(fileOut + "." + str(j) + ".bdg") del merged, mergedBed, readDepth - if normalize: - ### normalize input count - normalized_input = mat[:, 0] * (tct[1] / tct[0]) - nonzero = normalized_input != 0 - normalized_input[nonzero] += float(pseudocount) - np.savetxt(fileOut, np.concatenate((mat, normalized_input.reshape(-1, 1)), axis=1), fmt='%.5f', delimiter="\t") - else: - np.savetxt(fileOut, mat, fmt='%i', delimiter="\t") + ### normalize input count + normalized_input = mat[:, 0] * (tct[1] / tct[0]) + np.savetxt(fileOut, np.concatenate((mat, normalized_input.reshape(-1, 1)), axis=1), fmt='%.5f', delimiter="\t") print("[%s] Done" % (timestamp())) del a, mat, tct @@ -396,7 +391,7 @@ def theta(y, mu, verbose=False): return t0, se -def call_peak(prefix, bedFile, bctFile, covFile, bwFile, chromSize, threshold, minInputQuantile, mode=1): +def call_peak(prefix, bedFile, bctFile, covFile, bwFile, chromSize, threshold, mode=1): ''' Args: @@ -440,30 +435,16 @@ def call_peak(prefix, bedFile, bctFile, covFile, bwFile, chromSize, threshold, m lastbin = int(bin.split("\t")[2]) nonSliding[i] = True - ### remove bins with input count of zero (i.e., untested region) OR extreme values (top 1%, i.e., sequencing artifacts) - # minInput = np.quantile(mat[(mat[:, 1] > 0), 1], 0.01) - # maxInput = np.quantile(mat[(mat[:, 1] > 0), 1], 0.99) + ### remove bins with input (and output) count of zero (i.e., untested region) OR extreme values (top 1%, i.e., sequencing artifacts) minInput = 0 maxInput = np.quantile(mat[:, 1], 0.99) nonZeroInput = (mat[:, 1] > minInput) & (mat[:, 1] < maxInput) - - # minOutput = np.quantile(mat[:, 0], 0.01) - # maxOutput = np.quantile(mat[:, 0], 0.99) - # nonZeroInput = (mat[:, 1] > 0) & (mat[:, 0] > minOutput) & (mat[:, 0] < maxOutput) - - ### remove bins with normalized input count of zero (i.e., untested region) OR below "minimum threshold" defined by minInputQuantile - # minInput = np.quantile(mat[(mat[:, 1] > 0), 1], float(minInputQuantile)) - # print("[%s] Minimum Input Coverage: %f" % (timestamp(), minInput)) - # nonZeroInput = mat[:, 1] > minInput + nonZeroOutput = (mat[:, 0] > 0) ### calculate fold change fc = np.zeros(mat.shape[0]) fc[mat[:, 1] > 0] = mat[mat[:, 1] > 0, 0] / (mat[mat[:, 1] > 0, 2]) - minOutputThreshold=0.9 - testOutput = mat[:, 0] > np.quantile(mat[:, 0], float(minOutputThreshold)) - # minFC = fc > 1 - ### filtering bins print("[%s] Before filtering: %s" % (timestamp(), mat.shape[0])) @@ -508,8 +489,8 @@ def call_peak(prefix, bedFile, bctFile, covFile, bwFile, chromSize, threshold, m ### predict print("[%s] Predicting expected counts for bins above a minimum threshold: %s" % ( - timestamp(), mat[nonZeroInput & testOutput, :].shape[0])) - df = pd.DataFrame(mat[nonZeroInput & testOutput, :], columns=["y", "exposure"] + x) + timestamp(), mat[nonZeroInput & nonZeroOutput, :].shape[0])) + df = pd.DataFrame(mat[nonZeroInput & nonZeroOutput, :], columns=["y", "exposure"] + x) y_hat = model.predict(df, offset=np.log(df["exposure"])) ### mode 1 uses "input" as covariate (default): @@ -545,15 +526,15 @@ def call_peak(prefix, bedFile, bctFile, covFile, bwFile, chromSize, threshold, m ### predict print("[%s] Predicting expected counts for bins above a minimum threshold: %s" % ( - timestamp(), mat[nonZeroInput & testOutput, :].shape[0])) - df = pd.DataFrame(mat[nonZeroInput & testOutput, :], columns=["y"] + x) + timestamp(), mat[nonZeroInput & nonZeroOutput, :].shape[0])) + df = pd.DataFrame(mat[nonZeroInput & nonZeroOutput, :], columns=["y"] + x) y_hat = model.predict(df) ### calculate P-value print("[%s] Calculating P-value" % (timestamp())) theta_hat = np.repeat(th, len(y_hat)) prob = th / (th + y_hat) ### prob=theta/(theta+mu) - pval = 1 - nbinom.cdf(mat[nonZeroInput & testOutput, 0] - 1, n=theta_hat, p=prob) + pval = 1 - nbinom.cdf(mat[nonZeroInput & nonZeroOutput, 0] - 1, n=theta_hat, p=prob) del mat ### multiple testing correction @@ -566,16 +547,16 @@ def call_peak(prefix, bedFile, bctFile, covFile, bwFile, chromSize, threshold, m ### output peak with open(prefix + ".peak.bed", "w") as out: with open(bedFile, "r") as bed: - for i, bin in enumerate(list(compress(bed.readlines(), nonZeroInput & testOutput))): + for i, bin in enumerate(list(compress(bed.readlines(), nonZeroInput & nonZeroOutput))): if pval[i] <= float(threshold): out.write("%s\t%.3f\t%.3f\t%.3f\t%.5e\t%.5e\n" % ( - bin.strip(), fc[nonZeroInput & testOutput][i], p_score[i], q_score[i], pval[i], pval_adj[i])) + bin.strip(), fc[nonZeroInput & nonZeroOutput][i], p_score[i], q_score[i], pval[i], pval_adj[i])) ### output p-val track print("[%s] Generating P-value bedGraph" % (timestamp())) with open(prefix + ".pval.bdg", "w") as out: with open(bedFile, "r") as bed: - for i, bin in enumerate(list(compress(bed.readlines(), nonZeroInput & testOutput))): + for i, bin in enumerate(list(compress(bed.readlines(), nonZeroInput & nonZeroOutput))): out.write("%s\t%.3f\n" % (bin.strip(), abs(p_score[i]))) del p_score, q_score, pval, pval_adj diff --git a/starrpeaker/starrpeaker.py b/starrpeaker/starrpeaker.py index ddedfdd..3321b47 100644 --- a/starrpeaker/starrpeaker.py +++ b/starrpeaker/starrpeaker.py @@ -16,28 +16,27 @@ parser = argparse.ArgumentParser(description='STARR-Peaker') ### required args -parser.add_argument('-p', '--prefix', help='Output File Prefix', required=True) +parser.add_argument('--prefix', help='Output File Prefix', required=True) parser.add_argument('--chromsize', help='Chrom Sizes', required=True) -parser.add_argument('-l', '--length', help='Bin Length', required=False, default=500) -parser.add_argument('-s', '--step', help='Step Size', required=False, default=100) -parser.add_argument('-x', '--blacklist', help='Blacklist Region in BED format', required=True) +parser.add_argument('--blacklist', help='Blacklist Region in BED format', required=True) parser.add_argument('--cov', help='Covariate BigWig Files', nargs='+', required=True) parser.add_argument('-i', '--input', help='Input BAM File', required=True) parser.add_argument('-o', '--output', help='STARR-seq BAM File', required=True) ### optional args +parser.add_argument('--length', help='Bin Length', required=False, default=500) +parser.add_argument('--step', help='Step Size', required=False, default=100) parser.add_argument('--min', help='Minimum Template Size', required=False, default=100) parser.add_argument('--max', help='Maximum Template Size', required=False, default=1000) -parser.add_argument('-t', '--threshold', help='Adjusted P-value Threshold', required=False, default=0.05) -parser.add_argument('--pseudocount', help='Pseudocount for Input Normalization', required=False, default=1) -parser.add_argument('-q', '--minquantile', help='Minimum Input Quantile', required=False, default=0.2) -parser.add_argument('-m', '--mode', help='Mode', required=False, default=1) +parser.add_argument('--threshold', help='Adjusted P-value Threshold', required=False, default=0.05) +parser.add_argument('--minquantile', help='Minimum Input Quantile', required=False, default=0.2) +parser.add_argument('--mode', help='Mode', required=False, default=1) args = parser.parse_args() def main(): - ### make bin with specified window length and step size + ### make genomic bin with specified window length and step size core.make_bin(chromSize=args.chromsize, binLength=args.length, stepSize=args.step, @@ -49,15 +48,13 @@ def main(): bedFile=args.prefix + ".bin.bed", fileOut=args.prefix + ".cov.tsv") - ### process input, output bam + ### process input, output bam files core.proc_bam(bamFiles=[args.input, args.output], bedFile=args.prefix + ".bin.bed", chromSize=args.chromsize, fileOut=args.prefix + ".bam.bct", minSize=args.min, - maxSize=args.max, - normalize=True, - pseudocount=args.pseudocount) + maxSize=args.max) ### call peaks core.call_peak(prefix=args.prefix, From bbc37f6f8b04c78d95f30d226e7c415bb65ed1ba Mon Sep 17 00:00:00 2001 From: Donghoon Lee Date: Fri, 9 Aug 2019 10:16:24 -0400 Subject: [PATCH 3/3] clean unnecessary args --- .gitignore | 1 + starrpeaker/4_callPeak.py | 2 -- starrpeaker/core.py | 4 ++-- starrpeaker/starrpeaker.py | 1 - 4 files changed, 3 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index e3be0cd..f5e3917 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ *.bedGraph *.bct *.pptx +starrpeaker/.vscode/settings.json diff --git a/starrpeaker/4_callPeak.py b/starrpeaker/4_callPeak.py index b9a2486..2ba0a35 100644 --- a/starrpeaker/4_callPeak.py +++ b/starrpeaker/4_callPeak.py @@ -21,7 +21,6 @@ ### optional args parser.add_argument('--threshold', help='Adjusted P-value Threshold', required=False, default=0.05) -parser.add_argument('--minquantile', help='Minimum Input Quantile', required=False, default=0.2) parser.add_argument('--mode', help='Mode', required=False, default=1) args = parser.parse_args() @@ -33,5 +32,4 @@ bwFile=args.prefix + ".bam.bct.1.bw", chromSize=args.chromsize, threshold=args.threshold, - minInputQuantile=args.minquantile, mode=args.mode) diff --git a/starrpeaker/core.py b/starrpeaker/core.py index c6cadde..e7e8ca4 100644 --- a/starrpeaker/core.py +++ b/starrpeaker/core.py @@ -391,7 +391,7 @@ def theta(y, mu, verbose=False): return t0, se -def call_peak(prefix, bedFile, bctFile, covFile, bwFile, chromSize, threshold, mode=1): +def call_peak(prefix, bedFile, bctFile, covFile, bwFile, chromSize, threshold, mode): ''' Args: @@ -402,7 +402,7 @@ def call_peak(prefix, bedFile, bctFile, covFile, bwFile, chromSize, threshold, m bwFile: fragment coverage in BigWig format chromSize: chromosome sizes threshold: threshold to call peak - minInputQuantile: minimum input coverage + mode: 1 - using input as covariate 2 - using input as offset Returns: peak files (original peaks, merged peaks, and centered final peaks) diff --git a/starrpeaker/starrpeaker.py b/starrpeaker/starrpeaker.py index 3321b47..6e21a22 100644 --- a/starrpeaker/starrpeaker.py +++ b/starrpeaker/starrpeaker.py @@ -64,7 +64,6 @@ def main(): bwFile=args.prefix + ".bam.bct.1.bw", chromSize=args.chromsize, threshold=args.threshold, - minInputQuantile=args.minquantile, mode=args.mode) if __name__ == "__main__": main()