Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Dev #47

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open

Dev #47

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 18 additions & 10 deletions bin/cctyper
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@ import argparse
import logging
import os
import sys
import pkg_resources

from importlib.metadata import version
import inspect
from cctyper.controller import Controller
from cctyper.prodigal import Prodigal
from cctyper.gff import GFF
from cctyper.hmmer import HMMER
from cctyper.castyping import Typer
from cctyper.minced import Minced
Expand All @@ -16,8 +17,9 @@ from cctyper.crisprcas import CRISPRCas
from cctyper.plot import Map
from cctyper.blast import RepeatMatch


########## Arguments ##########
ap = argparse.ArgumentParser(description='CRISPRCasTyper version {}'.format(pkg_resources.require("cctyper")[0].version))
ap = argparse.ArgumentParser(description=f'CRISPRCasTyper version {version("cctyper")}')

# Required
ap.add_argument('input', help='Input fasta file')
Expand All @@ -31,11 +33,13 @@ ap.add_argument('--keep_tmp', help='Keep temporary files (prodigal, hmmer, mince
ap.add_argument('--log_lvl', help='Logging level [%(default)s].', default='INFO', type=str, choices=['DEBUG','INFO','WARNING','ERROR'])
ap.add_argument('--redo_typing', help='Redo the typing. Skip prodigal and HMMER and load the hmmer.tab from the output dir.', action='store_true')
ap.add_argument('--simplelog', help='No color or progress bar in log.', action='store_true')

ap.add_argument('--gff', help='Path to GFF file.', default=None, type=str)
ap.add_argument('--prot', help='Path to protein FASTA file.', default=None, type=str)
# Data
apd = ap.add_argument_group('data arguments')
apd.add_argument('--db', help='Path to database. Only needed if CCTYPER_DB environment variable is not set.', default='', type=str)


# Thresholds
apt = ap.add_argument_group('cas arguments')
apt.add_argument('--dist', help='Max allowed number of unknown genes between cas genes in operon [%(default)s].', default=3, type=int)
Expand Down Expand Up @@ -69,14 +73,18 @@ app.add_argument('--no_plot', help='Do not draw a map of CRISPR-Cas.', action='s
app.add_argument('--no_grid', help='Do not add grid to plot.', action='store_true')

# Workflow starts here

args = ap.parse_args()

########## Initialize ##########
master = Controller(ap.parse_args())

########## Prodigal ##########
proteins = Prodigal(master)
proteins.run_prod()
master = Controller(args)

########## Prodigal/GFF read ##########
if args.gff and args.prot:
proteins = GFF(master)
proteins.get_genes()
else:
proteins = Prodigal(master)
proteins.run_prod()

########## Hmmer ##########
hmmeri = HMMER(proteins)
Expand Down
1 change: 0 additions & 1 deletion cctyper/blast.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import pandas as pd
import statistics as st

from Bio import pairwise2
from Bio import SeqIO
from Bio.Seq import Seq
from joblib import Parallel, delayed
Expand Down
7 changes: 4 additions & 3 deletions cctyper/castyping.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,9 @@ def type_operon(self, operon):
tmpX['Hmm'] = [re.sub("_.*","",x) for x in tmpX['Hmm']]
tmpX.drop_duplicates('Hmm', inplace=True)

start = list(tmp['start'])
end = list(tmp['end'])

start = list(tmp['start'].astype(int))
end = list(tmp['end'].astype(int))
if operon in self.circ_operons:
gene_end = np.argmax(np.diff(list(tmp['Pos'])))
start_operon = start[gene_end+1]
Expand Down Expand Up @@ -187,6 +187,7 @@ def type_operon(self, operon):
"Best_type": best_type,
"Best_score": best_score,
"Genes": list(tmp['Hmm']),
"Prot_IDs": list(tmp['ORF']),
"Positions": list(tmp['Pos']),
"E-values": ['{:0.2e}'.format(x) for x in list(tmp['Eval'])],
"CoverageSeq": [round(x,3) for x in list(tmp['Cov_seq'])],
Expand Down
15 changes: 14 additions & 1 deletion cctyper/controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ def __init__(self, args):
self.out = args.output
self.threads = args.threads
self.dist = args.dist
self.gff = args.gff
self.prot = args.prot
self.prod = args.prodigal
self.db = args.db
self.circular = args.circular
Expand Down Expand Up @@ -107,6 +109,15 @@ def check_input(self):
else:
logging.error('Could not find input file')
sys.exit()
if self.prot and self.gff:
shutil.copy(self.prot, self.prot_path)
subprocess.run(['sed', '-i', 's/ .*$//', self.prot_path])
elif self.gff and (not self.gff):
logging.error('Gff file provided but no protein file. Please provide both')
sys.exit()
elif (not self.gff) and self.gff:
logging.error('Protein file provided but no GFF file. Please provide both')
sys.exit()

def check_fasta(self):

Expand Down Expand Up @@ -159,7 +170,6 @@ def clean(self):
shutil.rmtree(self.out+'hmmer')

os.remove(self.out+'minced.out')
os.remove(self.out+'prodigal.log')
os.remove(self.out+'proteins.faa')

if os.path.exists(self.out+'blast.tab'):
Expand All @@ -168,6 +178,9 @@ def clean(self):
os.remove(self.out+'Flank.nhr')
os.remove(self.out+'Flank.nin')
os.remove(self.out+'Flank.nsq')

if os.path.exists(self.out+'prodigal.log'):
os.remove(self.out+'prodigal.log')

def check_db(self):

Expand Down
2 changes: 1 addition & 1 deletion cctyper/crisprcas.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def dist_ll(x,ll,ss,co):
circ_op = False

# Find distances between operon and crisprs
dists = dist_ll((int(cas_operon['Start']), int(cas_operon['End'])), zip(crispr_sub['Start'], crispr_sub['End']), seq_size, circ_op)
dists = dist_ll((int(cas_operon['Start'].iloc[0]), int(cas_operon['End'].iloc[0])), zip(crispr_sub['Start'], crispr_sub['End']), seq_size, circ_op)

cc_circs = [x[1] for x in dists]
distances = [x[0] for x in dists]
Expand Down
71 changes: 71 additions & 0 deletions cctyper/gff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import os
import subprocess
import logging
import sys
import re

import pandas as pd

class GFF(object):

def __init__(self, obj):
self.master = obj
for key, val in vars(obj).items():
setattr(self, key, val)

def get_genes(self):

def unwrap_attributes(df):
attribute_names = set()
for attributes in df['attributes']:
attributes_list = [n for n in attributes.split(';') if n and n!='']
for attr in attributes_list:
attribute_name = attr.split('=')[0]
attribute_names.add(attribute_name)

# Create new columns with attribute names and default values
for attribute_name in attribute_names:
df = df.assign(**{attribute_name: None})

# Function to extract attribute values and assign them to columns
def extract_attribute_values(row):
attributes_list = [n for n in row['attributes'].split(';') if n and n!='']
for attr in attributes_list:
attribute_name, attribute_value = attr.split('=')
row[attribute_name] = attribute_value
return row

# Apply the function to each row
df = df.apply(lambda row: extract_attribute_values(row), axis=1)

# Drop the original "attributes" columns
df = df.drop(columns=['attributes'])
return df

custom_header = ['Contig', 'source', 'type','Start','End','score','Strand','phase','attributes']

gff = pd.read_csv(self.gff,sep="\t",comment="#",names=custom_header,engine="c")
gff = gff[gff['type'] == 'CDS']
gff['Strand'] = gff['Strand'].map({'+': 1, '-': -1})

gff=unwrap_attributes(gff)
cols = [col for col in gff.columns if col in [n for n in custom_header if n[0].isupper()]]
if "ID" in gff.columns:
if gff['ID'].str.startswith('cds-').all():
gff["protein_id"]=gff["ID"].str.split("-").apply(lambda x: x[1])
elif gff["ID"].str.match(r'^\D+_\d+$').all():
gff["protein_id"]=gff["ID"]
elif gff["ID"].str.match(r'^\d+_\d+$').all():
gff["protein_id"] = gff["Contig"] + "_" +gff["ID"].str.split("_").apply(lambda x: x[1])
else:
prot_ids = subprocess.run(['sed', '-n', 's/^>//p', self.prot_path],capture_output=True).stdout.decode().splitlines()
if set(prot_ids).issubset(gff['ID']):
gff["protein_id"]=gff["ID"]
else:
logging.error("GFF file does not match expected format. Please check that it meets the format requirements.")
sys.exit()
gff=gff[cols+["protein_id"]].dropna(subset=["protein_id"])
gff = gff.sort_values(['Contig', 'Start'])
gff['Pos'] = gff.groupby('Contig').cumcount() + 1
self.genes = gff[cols+["Pos","protein_id"]]
self.genes.to_csv(self.out+'genes.tab', index=False, sep='\t')
42 changes: 27 additions & 15 deletions cctyper/hmmer.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,29 +75,37 @@ def load_hmm(self):

# Get files
hmm_files = glob.glob(os.path.join(self.out+'hmmer', '*.tab'))

# Parse externally
with open(self.out+'hmmer.tab', 'w') as hmmer_tab:
subprocess.run(['grep', '-v', '^#']+hmm_files, stdout=hmmer_tab)
subprocess.run(['sed', '-i', 's/:/ /', self.out+'hmmer.tab'])

# Load
usecols = [0,1,3,6,7,
8,16,17,18,19,
20,21,22]
names = ['Hmm','ORF','tlen','qlen','Eval',
'score','hmm_from','hmm_to','ali_from','ali_to',
'env_from','env_to','pprop']
if not (self.gff and self.prot):
usecols = usecols + [24,26,28]
names= names + ['start','end','strand']
hmm_df = pd.read_csv(self.out+'hmmer.tab', sep='\s+', header=None,
usecols=(0,1,3,6,7,
8,16,17,18,19,
20,21,22,24,26,28),
names=('Hmm','ORF','tlen','qlen','Eval',
'score','hmm_from','hmm_to','ali_from','ali_to',
'env_from','env_to','pprop','start','end','strand'))

usecols=usecols,
names=names)
# Parse HMM names
hmm_df['Hmm'] = [re.sub('\.tab', '',
re.sub(os.path.join(self.out, 'hmmer', ''), '', x))
for x in hmm_df['Hmm']]

# Add columns
hmm_df['Acc'] = [re.sub("_[0-9]*$","",x) for x in hmm_df['ORF']]
hmm_df['Pos'] = [int(re.sub(".*_","",x)) for x in hmm_df['ORF']]

if self.gff and self.prot:
self.genes = pd.read_csv(self.out+'genes.tab', sep='\t')
hmm_df = pd.merge(hmm_df,self.genes[["Start","End","Strand","Contig","Pos","protein_id"]],
left_on="ORF",right_on="protein_id",how="left").drop("protein_id",axis=1)
hmm_df.rename(columns={'Contig': 'Acc','Strand': 'strand', 'Start': 'start', 'End': 'end'}, inplace=True)
else:
hmm_df['Acc'] = [re.sub("_[0-9]*$","",x) for x in hmm_df['ORF']]
hmm_df['Pos'] = [int(re.sub(".*_","",x)) for x in hmm_df['ORF']]

# Coverages of aligments
def covs(df_sub):
Expand All @@ -115,7 +123,6 @@ def covs(df_sub):
hmm_df = hmm_df.groupby(['Hmm','ORF']).apply(covs)
hmm_df.reset_index(drop=True, inplace=True)
self.hmm_df = hmm_df.drop_duplicates()

# Write to file
def write_hmm(self):
self.hmm_df.to_csv(self.out+'hmmer.tab', sep='\t', index=False)
Expand Down Expand Up @@ -182,6 +189,11 @@ def load_custom_hmm(self):
self.custom_hmm_df.drop_duplicates('Target', inplace=True)

# New columns
self.custom_hmm_df['Contig'] = [re.sub("_[0-9]*$","",x) for x in self.custom_hmm_df['Target']]
self.custom_hmm_df['Pos'] = [int(re.sub(".*_","",x)) for x in self.custom_hmm_df['Target']]
if self.gff and self.prot:
self.genes = pd.read_csv(self.out+'genes.tab', sep='\t')
self.custom_hmm_df = pd.merge(self.custom_hmm_df,self.genes[["Contig","Pos","protein_id"]],
left_on="Target",right_on="protein_id",how="left").drop("protein_id",axis=1)
else:
self.custom_hmm_df['Contig'] = [re.sub("_[0-9]*$","",x) for x in self.custom_hmm_df['Target']]
self.custom_hmm_df['Pos'] = [int(re.sub(".*_","",x)) for x in self.custom_hmm_df['Target']]

15 changes: 12 additions & 3 deletions cctyper/minced.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import statistics as st

from Bio import pairwise2
from Bio import Align
from joblib import Parallel, delayed

# Define the CRISPR class
Expand All @@ -33,8 +33,17 @@ def addSpacer(self, spacer):
def getConsensus(self):
self.cons = max(set(self.repeats), key = self.repeats.count)
def identity(self, i, j, sqlst):
align = pairwise2.align.globalxx(sqlst[i], sqlst[j])
return(align[0][2]/align[0][4]*100)
aligner = Align.PairwiseAligner()
aligner.mode = 'global' # Use global alignment
aligner.match_score = 1 # Match score is 1
aligner.mismatch_score = 0 # Mismatch score is 0
aligner.open_gap_score = 0 # Gap opening score is 0
aligner.extend_gap_score = 0 # Gap extension score is 0
seq1 = sqlst[i]
seq2 = sqlst[j]
alignments = aligner.align(seq1, seq2)
best_alignment = alignments[0]
return best_alignment.score / max(len(seq1), len(seq2)) * 100
def identLoop(self, seqs, threads):
if self.exact:
sqr = range(len(seqs))
Expand Down