Added code for streamlining feature extraction.
Added a pre-compiled model of CoRE-ATAC.
ajt986 committed Feb 3, 2020
1 parent ee2e28e commit f4d3b06
Showing 28 changed files with 5,739,635 additions and 37 deletions.
Binary file added BAMExtractor.jar
15 changes: 11 additions & 4 deletions BAMExtractor/src/org/jax/bamextractor/BAMSplitter.java
@@ -21,17 +21,24 @@
public class BAMSplitter {

public static void main(String[] args) {
if(args.length == 3) {
boolean useduplicateflag = true;
if(args.length >= 3) {
BAMSplitter b = new BAMSplitter();
String bamfile = args[0];
String chromosomefile = args[1];
String outdir = args[2];
if(args.length > 3 && args[3].equals("--keepduplicates")) {
useduplicateflag = false;
}
try {
b.writeThreshold(outdir, b.getThresholdAndSplitBAM(bamfile, outdir, b.readChromosomes(chromosomefile)));
b.writeThreshold(outdir, b.getThresholdAndSplitBAM(bamfile, outdir, b.readChromosomes(chromosomefile), useduplicateflag));
} catch (IOException e) {
e.printStackTrace();
}
}
else {
System.out.println("Usage: bamfile chromosomefile outputdirectory --keepduplicates(optional)");
}
}

private void writeThreshold(String outdir, int thresh) throws IOException {
@@ -51,7 +58,7 @@ private String[] readChromosomes(String file) throws IOException {
return rv.toArray(new String[0]);
}

private int getThresholdAndSplitBAM(String bamfile, String outdir, String[] chromosomes){
private int getThresholdAndSplitBAM(String bamfile, String outdir, String[] chromosomes, boolean useduplicateflag){
SamReaderFactory factory = SamReaderFactory.makeDefault()
.enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS)
.validationStringency(ValidationStringency.SILENT);
@@ -71,7 +78,7 @@ private int getThresholdAndSplitBAM(String bamfile, String outdir, String[] chro

while(it.hasNext()){
SAMRecord next = it.next();
stats.addRecord(next);
stats.addRecord(next, useduplicateflag);
SAMFileWriter w = writers.get(next.getReferenceName());
if(w != null){
w.addAlignment(next);
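The new optional --keepduplicates argument switches off duplicate filtering in the splitter; the flag is threaded through to ReadStatistics.addRecord. Below is a minimal invocation sketch based on the usage message above, assuming the class is run from the BAMExtractor.jar classpath with htsjdk available (this commit does not show the jar's entry point); all file names are placeholders.

# Sketch only: the class name comes from the source above, paths are placeholders.
# Omit --keepduplicates to keep the default behaviour of filtering duplicate-flagged reads.
java -cp BAMExtractor.jar org.jax.bamextractor.BAMSplitter \
  sample_sorted.bam chromosomes.txt split_out/ --keepduplicates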
InsertSizeThreshold.java
@@ -14,7 +14,7 @@
public class InsertSizeThreshold {


public int getInsertSizeThreshold(String chrbamfile) throws IOException{
public int getInsertSizeThreshold(String chrbamfile, boolean useduplicateflag) throws IOException{
SamReaderFactory factory = SamReaderFactory.makeDefault()
.enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS)
.validationStringency(ValidationStringency.SILENT);
@@ -33,7 +33,7 @@ public int getInsertSizeThreshold(String chrbamfile) throws IOException{

while(it.hasNext()){
SAMRecord next = it.next();
stats.addRecord(next);
stats.addRecord(next, useduplicateflag);
}

int thresh = 20;
21 changes: 12 additions & 9 deletions BAMExtractor/src/org/jax/bamextractor/ReadCountExtractor.java
@@ -24,9 +24,13 @@ public class ReadCountExtractor {
//TODO make the code reusable instead of copying, only copying due to time limitations

public static void main(String[] args){

boolean useduplicateflag = true;
if(args.length >= 3) {

if(args[args.length-1].equals("--keepduplicates")) {
useduplicateflag = false;
}

String bamfile = args[0];
String locifile = args[1];
String outfile = args[2];
@@ -35,15 +39,14 @@ public static void main(String[] args){
ATACReadProcessor[] processors = new ATACReadProcessor[1];
processors[0] = new ReadCounts(outfile);


int thresh = Integer.MAX_VALUE;
if(args.length > 3) {
thresh = Integer.parseInt(args[3]);
}
else {
InsertSizeThreshold ist = new InsertSizeThreshold();
try {
thresh = ist.getInsertSizeThreshold(bamfile);
thresh = ist.getInsertSizeThreshold(bamfile, useduplicateflag);
} catch (IOException e) {
e.printStackTrace();
System.exit(1);
@@ -54,7 +57,7 @@ public static void main(String[] args){

try {
ReadCountExtractor t = new ReadCountExtractor();
t.extractReadCounts(bamfile, locifile, processors, thresh);
t.extractReadCounts(bamfile, locifile, processors, thresh, useduplicateflag);
} catch (IOException e) {
e.printStackTrace();
}
@@ -63,12 +66,12 @@ public static void main(String[] args){
}
}
else {
System.out.println("Usage: ReadCountExtractor.jar <bamfile> <peakfile> <outfile> <insertsize threshold>");
System.out.println("Usage: ReadCountExtractor.jar <bamfile> <peakfile> <outfile> [insertsize threshold] [--keepduplicates]");
}

}

public void extractReadCounts(String chrbamfile, String peakfile, ATACReadProcessor[] processors, int insertthresh) throws IOException{
public void extractReadCounts(String chrbamfile, String peakfile, ATACReadProcessor[] processors, int insertthresh, boolean useduplicateflag) throws IOException{

SamReaderFactory factory = SamReaderFactory.makeDefault()
.enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS)
@@ -142,7 +145,7 @@ public void extractReadCounts(String chrbamfile, String peakfile, ATACReadProces
totalreads++;
boolean isnegative = next.getReadNegativeStrandFlag();

if(!isValidRead(next, insertthresh)){
if(!isValidRead(next, insertthresh, useduplicateflag)){
invalidreads++;
continue;
}
@@ -281,12 +284,12 @@ private void addCurrentToNext(LinkedList<SAMRecord> currentlist, LinkedList<SAMR
}
}

private boolean isValidRead(SAMRecord record, int thresh){
private boolean isValidRead(SAMRecord record, int thresh, boolean useduplicateflag){
if ((!record.getReadPairedFlag() ||
record.getReadUnmappedFlag() ||
record.getMateUnmappedFlag() ||
record.isSecondaryOrSupplementary() ||
record.getDuplicateReadFlag())) {
(record.getDuplicateReadFlag() && useduplicateflag) )) {
return false;
}

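Per the updated usage message, the insert-size threshold and --keepduplicates are both optional, and the flag is detected by inspecting the last argument. Note that when --keepduplicates is supplied, the threshold should also be given explicitly, because the fourth argument is parsed with Integer.parseInt. A hedged sketch with placeholder paths follows; the usage string suggests a standalone ReadCountExtractor.jar, which this commit does not include.

# Sketch only; paths and jar packaging are assumptions.
# The threshold (900 here) is passed explicitly so that args[3] stays numeric.
java -jar ReadCountExtractor.jar sample_sorted.bam peaks.txt sample_counts.txt 900 --keepduplicates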
23 changes: 16 additions & 7 deletions BAMExtractor/src/org/jax/bamextractor/ReadExtractor.java
@@ -32,9 +32,14 @@
public class ReadExtractor {

public static void main(String[] args){

boolean useduplicateflag = true;
if(args.length >= 6) {

if(args[args.length-1].equals("--keepduplicates")) {
useduplicateflag = false;
System.out.println("Keeping duplicates.");
}

String bamfile = args[0];
String locifile = args[1];
Map<String, String> fastafiles = null;
@@ -70,7 +75,7 @@ public static void main(String[] args){
else {
InsertSizeThreshold ist = new InsertSizeThreshold();
try {
thresh = ist.getInsertSizeThreshold(bamfile);
thresh = ist.getInsertSizeThreshold(bamfile, useduplicateflag);
} catch (IOException e) {
e.printStackTrace();
System.exit(1);
@@ -81,7 +86,7 @@ public static void main(String[] args){

try {
ReadExtractor t = new ReadExtractor();
t.extractFeatures(bamfile, locifile, fastafiles, charperline, processors, thresh);
t.extractFeatures(bamfile, locifile, fastafiles, charperline, processors, thresh, useduplicateflag);
} catch (IOException e) {
e.printStackTrace();
}
@@ -92,7 +97,7 @@ public static void main(String[] args){

}

public void extractFeatures(String chrbamfile, String peakfile, Map<String, String> fastafiles, int charperline, ATACReadProcessor[] processors, int insertthresh) throws IOException{
public void extractFeatures(String chrbamfile, String peakfile, Map<String, String> fastafiles, int charperline, ATACReadProcessor[] processors, int insertthresh, boolean useduplicateflag) throws IOException{

SamReaderFactory factory = SamReaderFactory.makeDefault()
.enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS)
@@ -129,6 +134,10 @@ public void extractFeatures(String chrbamfile, String peakfile, Map<String, Stri
String curchr = null;
while(it.hasNext()){
SAMRecord next = it.next();

if(!fastafiles.containsKey(next.getReferenceName())){
continue;
}

if(first || !curchr.equals(next.getReferenceName())) {

@@ -172,7 +181,7 @@ public void extractFeatures(String chrbamfile, String peakfile, Map<String, Stri
totalreads++;
boolean isnegative = next.getReadNegativeStrandFlag();

if(!isValidRead(next, insertthresh)){
if(!isValidRead(next, insertthresh, useduplicateflag)){
invalidreads++;
continue;
}
@@ -330,12 +339,12 @@ private void addCurrentToNext(LinkedList<SAMRecord> currentlist, LinkedList<SAMR
}
}

private boolean isValidRead(SAMRecord record, int thresh){
private boolean isValidRead(SAMRecord record, int thresh, boolean useduplicateflag){
if ((!record.getReadPairedFlag() ||
record.getReadUnmappedFlag() ||
record.getMateUnmappedFlag() ||
record.isSecondaryOrSupplementary() ||
record.getDuplicateReadFlag())) {
(record.getDuplicateReadFlag() && useduplicateflag) )) {
return false;
}

4 changes: 2 additions & 2 deletions BAMExtractor/src/org/jax/bamextractor/ReadStatistics.java
@@ -39,7 +39,7 @@ public int getFlagRemovedCount(){
return _flagremoved;
}

public void addRecord(SAMRecord record){
public void addRecord(SAMRecord record, boolean useduplicatereadflag){
_total++;
int is = record.getInferredInsertSize();
boolean removed = false;
@@ -60,7 +60,7 @@ public void addRecord(SAMRecord record){
removed = true;
}

if(record.getDuplicateReadFlag()){
if(record.getDuplicateReadFlag() && useduplicatereadflag){
_duplicate++;
removed = true;
}
Binary file added CoRE-ATAC_bulk_hg19_Dec_2019.h5
81 changes: 81 additions & 0 deletions DeepLearningPEAS.sh
@@ -0,0 +1,81 @@
#!/bin/bash
args=()
((index=0))
for i in "$@"
do
args[${index}]="$i"
((index++))
done

inDir="${args[0]}"
prefix="${args[1]}"
outDir="${args[2]}"
fasta="${args[3]}"
homerref="${args[4]}"
homermotifs="${args[5]}"
conservation="${args[6]}"
ctcfmotifs="${args[7]}"

inputpeaks="${args[8]}"

cd "${outDir}"

mkdir peak_features
cd peak_features

jarpath="${args[9]}/"


##Start with peaks and extract features.
##Sort BAM #TODO add option to skip this step if already sorted
echo "--- Sorting bam file. ---"
#echo "${inDir}/${prefix}.bam"
samtools sort -T PEASEXTRACT_${prefix} -o ${prefix}_sorted.bam "${inDir}/${prefix}.bam"
samtools index ${prefix}_sorted.bam

#cp "${inDir}/${prefix}.bam" ${prefix}_sorted.bam

#

echo "--- Calling annotations & known motifs. ---"
#HOMER Annotations
annotatePeaks.pl "${inputpeaks}" "${homerref}" -m "${homermotifs}" -nmotifs > ${prefix}_peaks_annotated.bed

#call denovo motifs
echo "--- Calling denovo motifs. ---"
findMotifsGenome.pl "${inputpeaks}" "${fasta}" "${outDir}/denovo"

mkdir "${outDir}/denovo/merge"
cp "${outDir}/denovo/homerResults/"*.motif "${outDir}/denovo/merge"
rm "${outDir}/denovo/merge/"*.similar*
rm "${outDir}/denovo/merge/"*RV.motif
cat "${outDir}/denovo/merge/"*.motif >> "${outDir}/denovo/merge/merged.motifs"
#call motifs with homer again using denovo motifs file homerMotifs.all.motifs
annotatePeaks.pl "${inputpeaks}" "${homerref}" -m "${outDir}/denovo/merge/merged.motifs" -nmotifs > ${prefix}_peaks_denovo.bed

echo "--- Calling CTCF motifs. ---"
annotatePeaks.pl "${inputpeaks}" "${homerref}" -m "${ctcfmotifs}" -nmotifs > ${prefix}_peaks_ctcf.bed

#Get the insert size threshold to remove outlier inserts
echo "--- Getting insert size threshold. ---"
java -jar "${jarpath}PEASTools.jar" insertsizethresh "${prefix}_sorted.bam" "${outDir}/peak_features" --keepduplicates #!!!TODO!!!#
thresh=$(cat "thresh.txt")


#Get Insert features
echo "--- Getting insert features. ---"
for i in {1..22}
do
chr=chr$i
java -jar "${jarpath}PEASTools.jar" insertmetrics "${chr}" "${chr}.bam" "${inputpeaks}" "${prefix}_${chr}_insertmetrics.txt" "$thresh" --keepduplicates #!!!TODO!!!#
rm ${chr}.bam
cat ${prefix}_${chr}_insertmetrics.txt >> ${prefix}_insertmetrics.txt
rm "${prefix}_${chr}_insertmetrics.txt"
done

echo "--- Getting conservation scores. ---"
#Get Conservation Scores
java -jar "${jarpath}PEASTools.jar" conservation "${inputpeaks}" "${conservation}" "${prefix}_conservation.txt"

echo "--- Merging features. ---"
java -jar "${jarpath}PEASTools.jar" mergedl "${inputpeaks}" "${prefix}_peaks_annotated.bed" "${prefix}_insertmetrics.txt" "${prefix}_conservation.txt" "${prefix}_peaks_denovo.bed" "${prefix}_peaks_ctcf.bed" "${prefix}_features.txt" "MERGED"
49 changes: 49 additions & 0 deletions FeatureExtractor.sh
@@ -0,0 +1,49 @@
#!/bin/bash

args=()
((index=0))
for i in "$@"
do
args[${index}]="$i"
((index++))
done

bamfile="${args[0]}"
peakfile="${args[1]}"
outdir="${args[2]}"
fasta="${args[3]}"
sepfasta="${args[4]}"
path="${args[5]}/"

prefix=$(basename ${bamfile} | sed 's/.bam//g')
inDir=$(dirname ${bamfile})
reformattedpeaks=${outdir}${prefix}_peaks.txt


INSERTTHRESH=900
CHARPERLINE=50
HOMERREF="hg19"
HOMERMOTIFS=${path}PEAS/humantop_Nov2016_HOMER.motifs
CONSERVATION=${path}PEAS/phastCons46wayPlacental.bed
CTCFMOTIFS=${path}PEAS/CTCF.motifs


cd "${outDir}"

#Step 1: Reformat peaks for 600 bp windows
python ${path}PeakFormatter.py ${peakfile} ${sepfasta} ${reformattedpeaks}


#Step 2: Extract features from BAM
java -jar ${path}BAMExtractor.jar ${bamfile} "${reformattedpeaks}" ${sepfasta} ${CHARPERLINE} ${outdir} ${prefix} ${INSERTTHRESH}


#Step 3: Extract PEAS features
${path}DeepLearningPEAS.sh ${inDir} ${prefix} ${outdir} ${fasta} ${HOMERREF} ${HOMERMOTIFS} ${CONSERVATION} ${CTCFMOTIFS} "${reformattedpeaks}_original.txt" ${path}PEAS


#Step 4: Move PEAS features to directory
#mv ${outDir}/peak_features/${prefix}_features.txt ${outDir}

#Done!
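FeatureExtractor.sh is the top-level wrapper: it reformats the peaks, extracts BAM features with BAMExtractor.jar, and then calls DeepLearningPEAS.sh with the reformatted peak file. A hedged invocation sketch with placeholder paths follows; note that the script concatenates outdir and the prefix directly, so outdir should end with a slash, and sepfasta is assumed to be a per-chromosome FASTA resource (it is handed to both PeakFormatter.py and BAMExtractor.jar but is not documented in this commit). The final argument is the CoRE-ATAC install directory, expected to contain PeakFormatter.py, BAMExtractor.jar, DeepLearningPEAS.sh, and the PEAS files.

# Sketch only; every path is a placeholder.
./FeatureExtractor.sh \
  /data/atac/sample1.bam \
  /data/atac/sample1_peaks.bed \
  /results/sample1/ \
  /ref/hg19.fa \
  /ref/hg19_by_chromosome \
  /opt/CoRE-ATAC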
