Commit 79f2dab
Merge remote-tracking branch 'upstream/master'
hoondy committed Dec 28, 2019
2 parents 45cafd9 + bbc37f6
Showing 7 changed files with 51 additions and 91 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -8,3 +8,4 @@
 *.bedGraph
 *.bct
 *.pptx
+starrpeaker/.vscode/settings.json
10 changes: 5 additions & 5 deletions starrpeaker/1_makeBin.py
@@ -16,13 +16,13 @@
 parser = argparse.ArgumentParser(description='Make Bin in BED Format')
 
 ### required args
-parser.add_argument('-p', '--prefix', help='Output File Prefix', required=True)
-parser.add_argument('-c', '--chromsize', help='Chrom Sizes', required=True)
-parser.add_argument('-x', '--blacklist', help='Blacklist Region in BED format', required=True)
+parser.add_argument('--prefix', help='Output File Prefix', required=True)
+parser.add_argument('--chromsize', help='Chrom Sizes', required=True)
+parser.add_argument('--blacklist', help='Blacklist Region in BED format', required=True)
 
 ### optional args
-parser.add_argument('-l', '--length', help='Bin Length', required=False, default=500)
-parser.add_argument('-s', '--step', help='Step Size', required=False, default=100)
+parser.add_argument('--length', help='Bin Length', required=False, default=500)
+parser.add_argument('--step', help='Step Size', required=False, default=100)
 
 args = parser.parse_args()
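
The defaults kept above tile each chromosome into 500 bp bins sliding by 100 bp. A minimal pure-Python sketch of that tiling, for orientation only (the repo's core.make_bin additionally subtracts blacklisted regions and writes BED):

def make_bins(chrom, chrom_size, length=500, step=100):
    """Tile [0, chrom_size) with sliding windows, clipping the last bin."""
    bins = []
    for start in range(0, chrom_size, step):
        end = min(start + length, chrom_size)
        bins.append((chrom, start, end))
        if end == chrom_size:
            break
    return bins

print(make_bins("chr1", 1200)[:3])  # [('chr1', 0, 500), ('chr1', 100, 600), ('chr1', 200, 700)]
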
7 changes: 3 additions & 4 deletions starrpeaker/2_procCov.py
@@ -17,12 +17,11 @@
 
 ### required args
 
-parser.add_argument('-p', '--prefix', help='Output File Prefix', required=True)
-parser.add_argument('-c', '--cov', help='BigWig Files as Covariates', nargs='+', required=True)
-parser.add_argument('-b', '--bed', help='Bin BED File', required=True)
+parser.add_argument('--prefix', help='Output File Prefix', required=True)
+parser.add_argument('--cov', help='BigWig Files as Covariates', nargs='+', required=True)
 
 args = parser.parse_args()
 
 if __name__ == "__main__": core.proc_cov(bwFiles=args.cov,
-                                          bedFile=args.bed,
+                                          bedFile=args.prefix + ".bin.bed",
                                           fileOut=args.prefix + ".cov.tsv")
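
With --bed dropped, the covariate step now reads the bin file implied by the prefix. For orientation, a hedged sketch of what a per-bin covariate computation can look like, assuming the pyBigWig library; file names are hypothetical and this is not the repo's proc_cov implementation:

import pyBigWig

bw = pyBigWig.open("gc_content.bw")  # one covariate track (hypothetical path)
with open("mySample.bin.bed") as bed, open("mySample.cov.tsv", "w") as out:
    for line in bed:
        chrom, start, end = line.split()[:3]
        # mean covariate value over the bin; None where the track has no data
        mean = bw.stats(chrom, int(start), int(end), type="mean")[0]
        out.write("%s\n" % (0.0 if mean is None else mean))
bw.close()
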
12 changes: 4 additions & 8 deletions starrpeaker/3_procBam.py
@@ -16,24 +16,20 @@
 parser = argparse.ArgumentParser(description='Process BAM File(s)')
 
 ### required args
-parser.add_argument('-p', '--prefix', help='Output File Prefix', required=True)
-parser.add_argument('-c', '--chromsize', help='Chrom Sizes', required=True)
-parser.add_argument('-b', '--bed', help='Bin BED File', required=True)
+parser.add_argument('--prefix', help='Output File Prefix', required=True)
+parser.add_argument('--chromsize', help='Chrom Sizes', required=True)
 parser.add_argument('-i', '--input', help='STARR-seq Input BAM File', required=True)
 parser.add_argument('-o', '--output', help='STARR-seq Output BAM File', required=True)
 
 ### optional args
 parser.add_argument('--min', help='Minimum Insert Size', required=False, default=100)
 parser.add_argument('--max', help='Maximum Insert Size', required=False, default=1000)
-parser.add_argument('--pseudocount', help='Pseudocount for Input Normalization', required=False, default=1)
 
 args = parser.parse_args()
 
 if __name__ == "__main__": core.proc_bam(bamFiles=[args.input, args.output],
-                                          bedFile=args.bed,
+                                          bedFile=args.prefix + ".bin.bed",
                                           chromSize=args.chromsize,
                                           fileOut=args.prefix + ".bam.bct",
                                           minSize=args.min,
-                                          maxSize=args.max,
-                                          normalize=True,
-                                          pseudocount=args.pseudocount)
+                                          maxSize=args.max)
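
After this change, core.proc_bam is fully determined by the prefix and the two BAM files: input normalization always runs and the pseudocount knob is gone. A hypothetical direct call with the new signature (import path and file names assumed for illustration):

from starrpeaker import core  # assumed package layout

core.proc_bam(bamFiles=["input.bam", "output.bam"],  # [STARR-seq input, output]
              bedFile="mySample.bin.bed",            # written by 1_makeBin.py
              chromSize="hg38.chrom.sizes",
              fileOut="mySample.bam.bct",
              minSize=100,                           # insert-size filter, defaults above
              maxSize=1000)
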
22 changes: 8 additions & 14 deletions starrpeaker/4_callPeak.py
@@ -16,26 +16,20 @@
 parser = argparse.ArgumentParser(description='Call Peaks')
 
 ### required args
-parser.add_argument('-p', '--prefix', help='Output File Prefix', required=True)
-parser.add_argument('--bed', help='Bin BED File', required=True)
-parser.add_argument('--bct', help='Bincount BCT File', required=True)
-parser.add_argument('--cov', help='Covariate File', required=True)
-parser.add_argument('--bw', help='STARR-seq Output Coverage BigWig File', required=True)
-parser.add_argument('-c', '--chromsize', help='Chrom Sizes', required=True)
+parser.add_argument('--prefix', help='Output File Prefix', required=True)
+parser.add_argument('--chromsize', help='Chrom Sizes', required=True)
 
 ### optional args
-parser.add_argument('-t', '--threshold', help='Adjusted P-value Threshold', required=False, default=0.05)
-parser.add_argument('-q', '--minquantile', help='Minimum Input Quantile', required=False, default=0.2)
-parser.add_argument('-m', '--mode', help='Mode', required=False, default=1)
+parser.add_argument('--threshold', help='Adjusted P-value Threshold', required=False, default=0.05)
+parser.add_argument('--mode', help='Mode', required=False, default=1)
 
 args = parser.parse_args()
 
 if __name__ == "__main__": core.call_peak(prefix=args.prefix,
-                                           bedFile=args.bed,
-                                           bctFile=args.bct,
-                                           covFile=args.cov,
-                                           bwFile=args.bw,
+                                           bedFile=args.prefix + ".bin.bed",
+                                           bctFile=args.prefix + ".bam.bct",
+                                           covFile=args.prefix + ".cov.tsv",
+                                           bwFile=args.prefix + ".bam.bct.1.bw",
                                            chromSize=args.chromsize,
                                            threshold=args.threshold,
-                                           minInputQuantile=args.minquantile,
                                            mode=args.mode)
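
The peak caller now derives every intermediate path from the prefix, which is the contract tying the four steps together. A sketch of the naming convention implied by the calls above (prefix value hypothetical):

prefix = "mySample"
intermediates = {
    "bins":        prefix + ".bin.bed",       # step 1, 1_makeBin.py
    "covariates":  prefix + ".cov.tsv",       # step 2, 2_procCov.py
    "bin counts":  prefix + ".bam.bct",       # step 3, 3_procBam.py
    "coverage bw": prefix + ".bam.bct.1.bw",  # output-BAM BigWig, also step 3
}
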
66 changes: 20 additions & 46 deletions starrpeaker/core.py
@@ -115,7 +115,7 @@ def count_total_proper_templates(bam, minSize, maxSize):
     return proper_template_count
 
 
-def proc_bam(bamFiles, bedFile, chromSize, fileOut, minSize, maxSize, normalize=False, pseudocount=1):
+def proc_bam(bamFiles, bedFile, chromSize, fileOut, minSize, maxSize):
     '''
     Args:
@@ -125,8 +125,6 @@ def proc_bam(bamFiles, bedFile, chromSize, fileOut, minSize, maxSize, normalize=
         fileOut: output file
         minSize: minimum size of fragment insert to consider
         maxSize: maximum size of fragment insert to consider
-        normalize: if True, normalized input count is added to additional column
-        pseudocount: pseudocount for input normalization
     Returns:
         writes bin count output file
@@ -238,20 +236,15 @@ def proc_bam(bamFiles, bedFile, chromSize, fileOut, minSize, maxSize, normalize=
         bdg2bw(bdgFile=fileOut + "." + str(j) + ".bdg", bwFile=fileOut + "." + str(j) + ".bw", chromSize=chromSize)
         safe_remove(fileOut + "." + str(j) + ".bdg")
 
-    if normalize:
-        ### normalize input count
-        normalized_input = mat[:, 0] * (tct[1] / tct[0])
-        # nonzero = normalized_input != 0
-        # normalized_input[nonzero] += float(pseudocount)
-        np.savetxt(fileOut, np.concatenate((mat, normalized_input.reshape(-1, 1)), axis=1), fmt='%.5f', delimiter="\t")
-    else:
-        np.savetxt(fileOut, mat, fmt='%i', delimiter="\t")
+    ### normalize input count, normalized input count is added to additional column
+    normalized_input = mat[:, 0] * (tct[1] / tct[0])
+    np.savetxt(fileOut, np.concatenate((mat, normalized_input.reshape(-1, 1)), axis=1), fmt='%.5f', delimiter="\t")
 
     del a, mat, tct, normalized_input
     print("[%s] Done" % (timestamp()))
 
 
-def proc_bam_readstart(bamFiles, bedFile, chromSize, fileOut, normalize=False, pseudocount=1):
+def proc_bam_readstart(bamFiles, bedFile, chromSize, fileOut):
     print("[%s] Counting template depth per bin %s" % (timestamp(), bedFile))
 
     ### initialize numpy array
@@ -340,14 +333,9 @@ def proc_bam_readstart(bamFiles, bedFile, chromSize, fileOut, normalize=False, p
         safe_remove(fileOut + "." + str(j) + ".bdg")
         del merged, mergedBed, readDepth
 
-    if normalize:
-        ### normalize input count
-        normalized_input = mat[:, 0] * (tct[1] / tct[0])
-        nonzero = normalized_input != 0
-        normalized_input[nonzero] += float(pseudocount)
-        np.savetxt(fileOut, np.concatenate((mat, normalized_input.reshape(-1, 1)), axis=1), fmt='%.5f', delimiter="\t")
-    else:
-        np.savetxt(fileOut, mat, fmt='%i', delimiter="\t")
+    ### normalize input count
+    normalized_input = mat[:, 0] * (tct[1] / tct[0])
+    np.savetxt(fileOut, np.concatenate((mat, normalized_input.reshape(-1, 1)), axis=1), fmt='%.5f', delimiter="\t")
 
     print("[%s] Done" % (timestamp()))
     del a, mat, tct
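
The normalization both functions now always apply is a plain library-size correction: input bin counts are rescaled by the ratio of total output to total input templates so the two columns are directly comparable. A self-contained numpy sketch with toy values (all numbers assumed):

import numpy as np

mat = np.array([[10., 25.],   # column 0: input count, column 1: output count
                [ 0.,  5.],
                [ 4., 12.]])
tct = [1000000, 2000000]      # total proper templates: [input, output]

normalized_input = mat[:, 0] * (tct[1] / tct[0])  # scale input to output depth
out = np.concatenate((mat, normalized_input.reshape(-1, 1)), axis=1)
print(out[:, 2])              # [20.  0.  8.] -- depth-matched input column
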
@@ -403,7 +391,7 @@ def theta(y, mu, verbose=False):
     return t0, se
 
 
-def call_peak(prefix, bedFile, bctFile, covFile, bwFile, chromSize, threshold, minInputQuantile, mode=1):
+def call_peak(prefix, bedFile, bctFile, covFile, bwFile, chromSize, threshold, mode):
     '''
     Args:
@@ -414,7 +402,7 @@ def call_peak(prefix, bedFile, bctFile, covFile, bwFile, chromSize, threshold, m
         bwFile: fragment coverage in BigWig format
         chromSize: chromosome sizes
         threshold: threshold to call peak
-        minInputQuantile: minimum input coverage
+        mode: 1 - using input as covariate 2 - using input as offset
     Returns:
         peak files (original peaks, merged peaks, and centered final peaks)
@@ -447,30 +435,16 @@ def call_peak(prefix, bedFile, bctFile, covFile, bwFile, chromSize, threshold, m
                 lastbin = int(bin.split("\t")[2])
                 nonSliding[i] = True
 
-    ### remove bins with input count of zero (i.e., untested region) OR extreme values (top 1%, i.e., sequencing artifacts)
-    # minInput = np.quantile(mat[(mat[:, 1] > 0), 1], 0.01)
-    # maxInput = np.quantile(mat[(mat[:, 1] > 0), 1], 0.99)
+    ### remove bins with input (and output) count of zero (i.e., untested region) OR extreme values (top 1%, i.e., sequencing artifacts)
     minInput = 0
    maxInput = np.quantile(mat[:, 1], 0.99)
     nonZeroInput = (mat[:, 1] > minInput) & (mat[:, 1] < maxInput)
 
-    # minOutput = np.quantile(mat[:, 0], 0.01)
-    # maxOutput = np.quantile(mat[:, 0], 0.99)
-    # nonZeroInput = (mat[:, 1] > 0) & (mat[:, 0] > minOutput) & (mat[:, 0] < maxOutput)
-
-    ### remove bins with normalized input count of zero (i.e., untested region) OR below "minimum threshold" defined by minInputQuantile
-    # minInput = np.quantile(mat[(mat[:, 1] > 0), 1], float(minInputQuantile))
-    # print("[%s] Minimum Input Coverage: %f" % (timestamp(), minInput))
-    # nonZeroInput = mat[:, 1] > minInput
+    nonZeroOutput = (mat[:, 0] > 0)
 
     ### calculate fold change
     fc = np.zeros(mat.shape[0])
     fc[mat[:, 1] > 0] = mat[mat[:, 1] > 0, 0] / (mat[mat[:, 1] > 0, 2])
 
-    minOutputThreshold=0.9
-    testOutput = mat[:, 0] > np.quantile(mat[:, 0], float(minOutputThreshold))
-    # minFC = fc > 1
-
     ### filtering bins
     print("[%s] Before filtering: %s" % (timestamp(), mat.shape[0]))
 
@@ -515,8 +489,8 @@ def call_peak(prefix, bedFile, bctFile, covFile, bwFile, chromSize, threshold, m
 
     ### predict
     print("[%s] Predicting expected counts for bins above a minimum threshold: %s" % (
-        timestamp(), mat[nonZeroInput & testOutput, :].shape[0]))
-    df = pd.DataFrame(mat[nonZeroInput & testOutput, :], columns=["y", "exposure"] + x)
+        timestamp(), mat[nonZeroInput & nonZeroOutput, :].shape[0]))
+    df = pd.DataFrame(mat[nonZeroInput & nonZeroOutput, :], columns=["y", "exposure"] + x)
     y_hat = model.predict(df, offset=np.log(df["exposure"]))
 
     ### mode 1 uses "input" as covariate (default):
@@ -552,15 +526,15 @@ def call_peak(prefix, bedFile, bctFile, covFile, bwFile, chromSize, threshold, m
 
         ### predict
         print("[%s] Predicting expected counts for bins above a minimum threshold: %s" % (
-            timestamp(), mat[nonZeroInput & testOutput, :].shape[0]))
-        df = pd.DataFrame(mat[nonZeroInput & testOutput, :], columns=["y"] + x)
+            timestamp(), mat[nonZeroInput & nonZeroOutput, :].shape[0]))
+        df = pd.DataFrame(mat[nonZeroInput & nonZeroOutput, :], columns=["y"] + x)
         y_hat = model.predict(df)
 
     ### calculate P-value
     print("[%s] Calculating P-value" % (timestamp()))
     theta_hat = np.repeat(th, len(y_hat))
     prob = th / (th + y_hat)  ### prob=theta/(theta+mu)
-    pval = 1 - nbinom.cdf(mat[nonZeroInput & testOutput, 0] - 1, n=theta_hat, p=prob)
+    pval = 1 - nbinom.cdf(mat[nonZeroInput & nonZeroOutput, 0] - 1, n=theta_hat, p=prob)
     del mat
 
     ### multiple testing correction
@@ -573,16 +547,16 @@ def call_peak(prefix, bedFile, bctFile, covFile, bwFile, chromSize, threshold, m
     ### output peak
     with open(prefix + ".peak.bed", "w") as out:
         with open(bedFile, "r") as bed:
-            for i, bin in enumerate(list(compress(bed.readlines(), nonZeroInput & testOutput))):
+            for i, bin in enumerate(list(compress(bed.readlines(), nonZeroInput & nonZeroOutput))):
                 if pval[i] <= float(threshold):
                     out.write("%s\t%.3f\t%.3f\t%.3f\t%.5e\t%.5e\n" % (
-                        bin.strip(), fc[nonZeroInput & testOutput][i], p_score[i], q_score[i], pval[i], pval_adj[i]))
+                        bin.strip(), fc[nonZeroInput & nonZeroOutput][i], p_score[i], q_score[i], pval[i], pval_adj[i]))
 
     ### output p-val track
     print("[%s] Generating P-value bedGraph" % (timestamp()))
     with open(prefix + ".pval.bdg", "w") as out:
         with open(bedFile, "r") as bed:
-            for i, bin in enumerate(list(compress(bed.readlines(), nonZeroInput & testOutput))):
+            for i, bin in enumerate(list(compress(bed.readlines(), nonZeroInput & nonZeroOutput))):
                 out.write("%s\t%.3f\n" % (bin.strip(), abs(p_score[i])))
     del p_score, q_score, pval, pval_adj
 
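The significance test above is an upper-tail negative binomial test: given an expected count mu (y_hat from the regression) and dispersion theta, scipy's parameterization takes p = theta / (theta + mu), and P(Y >= y) = 1 - CDF(y - 1). A self-contained sketch with assumed toy values:

import numpy as np
from scipy.stats import nbinom

y = np.array([30, 6, 12])          # observed output counts per bin (toy)
y_hat = np.array([10., 6., 11.])   # expected counts from the NB regression (toy)
th = 8.0                           # fitted dispersion theta (toy)

prob = th / (th + y_hat)                    # p = theta / (theta + mu)
pval = 1 - nbinom.cdf(y - 1, n=th, p=prob)  # upper tail, P(Y >= y)
print(pval)                                 # small only where y far exceeds y_hat
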
24 changes: 10 additions & 14 deletions starrpeaker/starrpeaker.py
@@ -16,28 +16,27 @@
 parser = argparse.ArgumentParser(description='STARR-Peaker')
 
 ### required args
-parser.add_argument('-p', '--prefix', help='Output File Prefix', required=True)
+parser.add_argument('--prefix', help='Output File Prefix', required=True)
 parser.add_argument('--chromsize', help='Chrom Sizes', required=True)
-parser.add_argument('-l', '--length', help='Bin Length', required=False, default=500)
-parser.add_argument('-s', '--step', help='Step Size', required=False, default=100)
-parser.add_argument('-x', '--blacklist', help='Blacklist Region in BED format', required=True)
+parser.add_argument('--blacklist', help='Blacklist Region in BED format', required=True)
 parser.add_argument('--cov', help='Covariate BigWig Files', nargs='+', required=True)
 parser.add_argument('-i', '--input', help='Input BAM File', required=True)
 parser.add_argument('-o', '--output', help='STARR-seq BAM File', required=True)
 
 ### optional args
+parser.add_argument('--length', help='Bin Length', required=False, default=500)
+parser.add_argument('--step', help='Step Size', required=False, default=100)
 parser.add_argument('--min', help='Minimum Template Size', required=False, default=100)
 parser.add_argument('--max', help='Maximum Template Size', required=False, default=1000)
-parser.add_argument('-t', '--threshold', help='Adjusted P-value Threshold', required=False, default=0.05)
-parser.add_argument('--pseudocount', help='Pseudocount for Input Normalization', required=False, default=1)
-parser.add_argument('-q', '--minquantile', help='Minimum Input Quantile', required=False, default=0.2)
-parser.add_argument('-m', '--mode', help='Mode', required=False, default=1)
+parser.add_argument('--threshold', help='Adjusted P-value Threshold', required=False, default=0.05)
+parser.add_argument('--minquantile', help='Minimum Input Quantile', required=False, default=0.2)
+parser.add_argument('--mode', help='Mode', required=False, default=1)
 
 args = parser.parse_args()
 
 
 def main():
-    ### make bin with specified window length and step size
+    ### make genomic bin with specified window length and step size
     core.make_bin(chromSize=args.chromsize,
                   binLength=args.length,
                   stepSize=args.step,
@@ -49,15 +48,13 @@ def main():
                   bedFile=args.prefix + ".bin.bed",
                   fileOut=args.prefix + ".cov.tsv")
 
-    ### process input, output bam
+    ### process input, output bam files
     core.proc_bam(bamFiles=[args.input, args.output],
                   bedFile=args.prefix + ".bin.bed",
                   chromSize=args.chromsize,
                   fileOut=args.prefix + ".bam.bct",
                   minSize=args.min,
-                  maxSize=args.max,
-                  normalize=True,
-                  pseudocount=args.pseudocount)
+                  maxSize=args.max)
 
     ### call peaks
     core.call_peak(prefix=args.prefix,
@@ -67,7 +64,6 @@ def main():
                    bwFile=args.prefix + ".bam.bct.1.bw",
                    chromSize=args.chromsize,
                    threshold=args.threshold,
-                   minInputQuantile=args.minquantile,
                    mode=args.mode)
 
 if __name__ == "__main__": main()
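
A minimal replica of the refactored interface, showing that the pipeline is now driven by long options plus -i/-o (values hypothetical; removed short flags such as -p or -t now exit with an argparse error):

import argparse

# Assumed subset of the parser defined in starrpeaker.py after this commit.
parser = argparse.ArgumentParser(description='STARR-Peaker')
parser.add_argument('--prefix', required=True)
parser.add_argument('--chromsize', required=True)
parser.add_argument('--blacklist', required=True)
parser.add_argument('--cov', nargs='+', required=True)
parser.add_argument('-i', '--input', required=True)
parser.add_argument('-o', '--output', required=True)

args = parser.parse_args(['--prefix', 'mySample',
                          '--chromsize', 'hg38.chrom.sizes',
                          '--blacklist', 'blacklist.bed',
                          '--cov', 'gc.bw', 'mappability.bw',
                          '-i', 'input.bam', '-o', 'output.bam'])
print(args.prefix)  # mySample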
