Skip to content

Commit

Permalink
Complete kmer generation logic
Browse files Browse the repository at this point in the history
  • Loading branch information
gayaldassanayake committed Nov 16, 2021
1 parent eb8bc86 commit 709ff81
Show file tree
Hide file tree
Showing 13 changed files with 343 additions and 54 deletions.
278 changes: 278 additions & 0 deletions biomarker_dbs/inc-types.fasta

Large diffs are not rendered by default.

Binary file added biomarker_dbs/orit.nhr
Binary file not shown.
Binary file added biomarker_dbs/orit.nin
Binary file not shown.
Binary file added biomarker_dbs/orit.nsq
Binary file not shown.
Binary file added biomarker_dbs/rRNA.i1f
Binary file not shown.
Binary file added biomarker_dbs/rRNA.i1i
Binary file not shown.
Binary file added biomarker_dbs/rRNA.i1m
Binary file not shown.
Binary file added biomarker_dbs/rRNA.i1p
Binary file not shown.
47 changes: 10 additions & 37 deletions constants.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,30 @@
# default args
default_threads = 8
default_coverage = 5
# batch_size = 1000

# int constants
frag_len = 5000
k = 7
err_file = "error.txt"
log_file = "log.txt"

max_frag = 10**6
# chrom_max = 10**6
# ex_plas_max = 10**5

# progress bar descriptions
frag_bar_desc = "fragments "

# # sequence filter constants
# plas_filter_val = 0.99
# chrom_filter_val = 0.01
# plas_filter_file = "/media/gayal/Programming/FYP/databases/DNAML-plasmid.fasta.probs.out"
# chrom_filter_file = "/media/gayal/Programming/FYP/databases/DNAML_bacterial_batch0.fasta.probs.out"
kmer_bar_desc = "kmers "

# Databases
# plas_db_path = "/media/gayal/Programming/FYP/databases/DNA-ML_FYP_2021/plasmid_refs"
# chrom_db_path = '/media/gayal/Programming/FYP/databases/DNA-ML_FYP_2021/bacterial_references'
# plas_db_path = "/media/gayal/Programming/FYP/databases/DNA-ML_FYP_2021/plas-ref-small"
# chrom_db_path = '/media/gayal/Programming/FYP/databases/DNA-ML_FYP_2021/bact-ref-tiny'
db_path = "../../references/biomarkerdbs"
db_path = "./biomarker_dbs"
cmscan_path = "cmscan"
blastn_path = "blastn"
seq2vec_path = "seq2vec"
nucmer_path = "nucmer"

# # label files
# plas_label_path = "results/plasmid/Label"
# chrom_label_path = "results/chromosome/Label"
# ex_plas_label_path = "results/extra-plasmid/Label"

# # target csv files
# plas_target_path = "results/plasmid/target.csv"
# chrom_target_path = "results/chromosome/target.csv"
# ex_plas_target_path = "results/extra-plasmid/target.csv"
# log files
err_file = "error.txt"
log_file = "log.txt"

# results paths
frag_write_path = "results/fragments"
kmer_write_path = "results/kmers"

# # seq target csv files
# seq_plas_target_path = "results/plasmid/seq_target.csv"
Expand All @@ -52,17 +36,6 @@
# seq_chrom_write_path = "results/chromosome/Data/seq"
# seq_extra_plasmid_write_path = "results/extra-plasmid/Data/seq"

# # fragment txt files
# plas_txt_write_path = "results/plasmid/extra/frag-txt"
# chrom_txt_write_path = "results/chromosome/extra/frag-txt"
# extra_plasmid_txt_write_path = "results/extra-plasmid/extra/frag-txt"

# # kmer files

# plas_7mer_write_path = "results/plasmid/Data/7mers"
# chrom_7mer_write_path = "results/chromosome/Data/7mers"
# ex_plas_7mer_write_path = "results/extra-plasmid/Data/7mers"

# # circularity files
# plas_circ_write_path = "results/plasmid/Data/circular"
# chrom_circ_write_path = "results/chromosome/Data/circular"
Expand Down
16 changes: 10 additions & 6 deletions fragment.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from progress_bar import *
from constants import *
from helpers import create_fragment_dirs, timestamp, print_error, print_log
from helpers import create_fragment_dirs, print_error, print_log

def frag_generator(files, coverage):
''' Iterate through the input files and generate fragments
Expand All @@ -29,7 +29,7 @@ def frag_generator(files, coverage):
yield record.id, fragments

except Exception as err:
print_error(f"{timestamp()} Error reading fasta file {filename}: {err}\n")
print_error(f"Error reading fasta file {filename}: {err}")

def write_frags(seq_id, seq_frags):
''' Write the generated fragments into files
Expand All @@ -39,22 +39,26 @@ def write_frags(seq_id, seq_frags):
for frag in seq_frags:
fout.write(f'>{frag["n"]} {frag["id"]}\n{frag["seq"]}\n')
except Exception as err:
print_error(f"{timestamp()} Error writing fragment fasta file: {err}\n")
print_error(f"Error writing fragment fasta file: {err}")

def fragment(input_files, coverage):
''' Handler function for metagenomic fragment generation
'''
print_log("Generating metagenomic fragments...")
create_fragment_dirs()
frag_count = {}

frag_gen = frag_generator(input_files, coverage)

prog_bar = create_progress_bar(frag_bar_desc)
progress_bar = create_progress_bar(frag_bar_desc)
for input in enumerate(frag_gen):
_, seqs = input
seq_id, seq_frags = seqs
seq_frag_count = len(seq_frags)
write_frags(seq_id, seq_frags)
update_progress_bar(prog_bar, len(seq_frags))
update_progress_bar(progress_bar, seq_frag_count)
frag_count[seq_id] = seq_frag_count

close_progress_bar(prog_bar)
close_progress_bar(progress_bar)
print_log("Generating metagenomic fragments completed.")
return frag_count
13 changes: 5 additions & 8 deletions helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ def timestamp():
def print_error(err_msg):
print(err_msg)
with open(err_file, 'a') as fout:
fout.write(err_msg)
fout.write(f"{timestamp()} {err_msg}\n")

def print_log(log_msg):
print(log_msg)
with open(log_file, 'a') as fout:
fout.write(log_msg)
fout.write(f"{timestamp()} {log_msg}\n")

# def read_filter_data():
# plas_filter = { line[0]: float(line[1]) for line in [line.strip("\n").split("\t") for line in open(plas_filter_file).readlines()] }
Expand All @@ -37,6 +37,9 @@ def delete_dir_if_exist(path):
def create_fragment_dirs():
create_dir_if_needed(frag_write_path)

def create_kmer_dirs():
create_dir_if_needed(kmer_write_path)

# def create_sequence_dirs():
# # sequences
# create_dir_if_needed(seq_plas_write_path)
Expand All @@ -57,12 +60,6 @@ def create_fragment_dirs():
# create_dir_if_needed(chrom_frag_split_path)
# create_dir_if_needed(ex_plas_frag_split_path)

# def create_kmer_files():
# # kmer files
# create_dir_if_needed(plas_7mer_write_path)
# create_dir_if_needed(chrom_7mer_write_path)
# create_dir_if_needed(ex_plas_7mer_write_path)

# def create_inc_factor_dirs():
# # inc factor files
# create_dir_if_needed(plas_inc_out_path)
Expand Down
35 changes: 35 additions & 0 deletions kmer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import subprocess as sp

from progress_bar import *
from helpers import create_kmer_dirs, print_error, print_log
from constants import *
import os

def count_kmers(threads, frag_count):
''' Generates the kmer counts for each fragment using seq2vec
'''
try:
print_log("Generating kmer counts...")
create_kmer_dirs()
progress_bar = create_progress_bar(kmer_bar_desc)
for filename in os.listdir(frag_write_path):
name = os.path.splitext(filename)[0]
frag_file = os.path.join(frag_write_path, filename)
write_file = os.path.join(kmer_write_path, name)
args = [
seq2vec_path,
'-f', str(frag_file),
'-o', str(write_file),
'-t', str(threads),
'-k', str(k),
]
sp.run(args, stdout=sp.PIPE, stderr=sp.PIPE)

update_val = frag_count[name] if name in frag_count.keys() else 0
update_progress_bar(progress_bar, update_val)

close_progress_bar(progress_bar)
except Exception as err:
print_error(f"Error generating kmer counts {err}")

print_log("Generating kmer counts completed.")
8 changes: 5 additions & 3 deletions preprocess.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import os

from fragment import fragment
from kmer import count_kmers
from constants import *
from helpers import timestamp, print_error
from helpers import print_error

def preprocess(args):
''' Preprocess sequences t0 generate kmer, biomarker features
Expand All @@ -21,7 +22,8 @@ def preprocess(args):
files = os.listdir(args.directory)
files = [os.path.join(args.directory, file) for file in files]
except Exception as err:
print_error(f"{timestamp()} Error listing input directory files: {err}\n")
print_error(f"Error listing input directory files: {err}\n")
return -1
fragment(files, args.coverage)
frag_count = fragment(files, args.coverage)
count_kmers(args.threads, frag_count)
return 0

0 comments on commit 709ff81

Please sign in to comment.