-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathparse_acr_aca_with_db.py
executable file
·190 lines (152 loc) · 7.86 KB
/
parse_acr_aca_with_db.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#!/usr/bin/python3
'''
*****************************************************************************************************
Purpose: Parses Aca/Acr result file with the aide of gi/pai database information.
Limits the number of Aca/Acr results to only include the ones found near (lax) or strictly within (strict) either a known gi or a pai.
Supports various options so user can specifiy the type of output (lax/strict) and which database to use.
Author:
Javi Gomez - https://github.com/rtomyj
Haidong Yi - https://github.com/HaidYi
*****************************************************************************************************
'''
from collections import defaultdict
'''
Purpose: fetch the number of intervals in list 2 that intersect with list 1
Arguments: list_Interval_1: the first interval list
list_Interval_2: the second interval list
Returns: set of elements in list 2 that intersect with elements in list 1
'''
def intervalIntersection(list_Interval_1, list_Interval_2):
locus_hit_set = set()
i = j = 0
while i < len(list_Interval_1) and j < len(list_Interval_2):
l_start = max(list_Interval_1[i][0], list_Interval_2[j][0])
r_end = min(list_Interval_1[i][1], list_Interval_2[j][1])
if l_start < r_end:
locus_hit_set.add(list_Interval_2[j])
if list_Interval_1[i][1] < list_Interval_2[j][1]:
i += 1
else:
j += 1
return locus_hit_set
'''
Purpose: Parses GI DB file.
Creates a dict using unique ncid's to map to start and end positions of regions found in GI DB.
Arguments: GI_NC_ID_maps_START_END = defaultdict with an empty list as its default value. Stores ncids found in GI DB and all the GI's found within that NCID
GI_DB_FILE = file name of the GI DB file
Returns: None
'''
def parse_gi_file(GI_NC_ID_maps_START_END, GI_DB_FILE):
with open(GI_DB_FILE, 'r', 512) as handle:
'''
Cycles through GI DB file
'''
for line in handle:
if line.startswith('Accession_number'):
continue
cols = line.split('\t')
ncid, start, end = cols[0], cols[1], cols[2] # extracts an assumed NC ID and its start and end position
tupStartEnd = (int(start), int(end)) # converts start and end from string to int and makes a tuple out of it
GI_NC_ID_maps_START_END[ncid].append(tupStartEnd) # appends start/end tuple to ncid dict with using the tuples ID as key
'''
Purpose: Parses PAI DB file
Creates a dict using unique ncid's to map to start and end positions of regions found in PAI DB.
Arguments: PAI_NC_ID_maps_START_END = defaultdict with an empty list as its default value. Stores ncids found in PAI DB and all the PAI's found within that NCID
PAI_DB_FILE = file name of the PAI DB file
Returns: None
'''
def parse_pai_file(PAI_NC_ID_maps_START_END, PAI_DB_FILE):
from regex import Regex # contains often used regex's
with open(PAI_DB_FILE) as handle:
'''
Cycles through PAI DB file.
'''
for line in handle:
ncid = ""
match = Regex.NC_REGEX.search(line) # tries to find NC ID in line
if match != None: # if NC regex matches something in the line then extraction of start and end can continue
ncid = match.group(0)
match = Regex.START_END_REGEX2.search(line) # uses a regex to find the start and end position of PAI region
start, end = int(match.group(1)), int(match.group(2)) # obtains start and end from regex match and converts from string to int
startEnd = (start, end) # creates tuple of start and end
PAI_NC_ID_maps_START_END[ncid].append(startEnd) # appends tuple to dict
'''
Purpose: Reads in Acr/Aca results containted in IN_FILE.
Picks out only the loci that are near or within (lax/strict) GI or PAI DB.
Arguments: IN_FILE = Acr/Aca results file user wants parsed
OUT_FILE = file name to use for output (might not be exactly as they entered)
NC_ID_maps_START_END = dictionary contianing ncids found either in GI or PAI
uniqueNcids = unique ncids found either in GI or PAI
PRINT_STRICT = whether user wants to strict results
PRINT_LAX = whether user wants to lax results
_type = string identifying which DB is being used eg) 'PAI', 'GI'
Returns: None
'''
def limit_acr(candidateAcrs, NC_ID_maps_START_END, uniqueNcids, PRINT_STRICT, PRINT_LAX, queue, _type):
'''
Using Interval Overlapping Algorithm to decide the Acr hit against 'PAI' and 'GI' Database.
'''
for position, locus in enumerate(candidateAcrs):
locusNcid = locus[0].nc # nc id of the locus
locus_start_end_list = list()
'''
We only care about loci found in a DB
Therefore if the ncid of the loci doesn't appear in the keys of the dict that contains unique ncids then we don't need to parse that locus. It won't be in a region the DB identified.
'''
if locusNcid in uniqueNcids:
startEndList = NC_ID_maps_START_END[locusNcid]
isLaxHit, isStrictHit = False, False
for protein in locus:
locus_start_end_list.append( (protein.start, protein.end) ) # add (protein.start, protein.end) into list
# Sort the list using the start of different intervals
locus_start_end_list = sorted(locus_start_end_list, key = lambda x: x[0])
startEndList = sorted(startEndList, key = lambda x: x[0])
locus_hit_set = intervalIntersection(startEndList, locus_start_end_list)
# isLaxHit = true if there exists a protein in the loci is hitted in the db
if len(locus_hit_set) > 0:
isLaxHit = True
# isStrictHit = true if all proteins in the loci are hitted in the db
if len(locus) == len(locus_hit_set):
isStrictHit = True
if PRINT_STRICT: # Strict Mode
if isStrictHit:
lociHits = queue.get()
lociHits.add(position)
queue.put(lociHits)
elif PRINT_LAX: # Lax Mode
if isLaxHit:
lociHits = queue.get()
lociHits.add(position)
queue.put(lociHits)
'''
Purpose: Builds dict of ncids that contain GI's using the GI DB file.
Uses dict to limit the acr results of input file.
Arguments: IN_FILE = Acr/Aca results file user wants parsed
OUT_FILE = file name to use for output (might not be exactly as they entered)
PRINT_STRICT = whether user wants to strict results
PRINT_LAX = whether user wants to lax results
queue - contains synchronized data (such as a list of loci indices that should be kept)
Returns: None
'''
def use_gi_db_on_acr(candidateAcrs, PRINT_STRICT, PRINT_LAX, GI_DB_FILE, queue):
GI_NC_ID_maps_START_END = defaultdict(lambda: list())
parse_gi_file(GI_NC_ID_maps_START_END, GI_DB_FILE) # parses GI DB file to build GI dict
mobilomeNcids = list(GI_NC_ID_maps_START_END.keys())
print('Total number of ncids found in gi db: ' + str(len(mobilomeNcids))) # prints number of unique ncids found in GI file
limit_acr(candidateAcrs, GI_NC_ID_maps_START_END, mobilomeNcids, PRINT_STRICT, PRINT_LAX, queue, 'gi') # parses the input file and creates new files (lax/strict) with only the results that lie within/strictly in a known GI
'''
Purpose: Builds dict of ncids that contain GI's using the GI DB file.
Uses dict to limit the acr results of input file.
Arguments: IN_FILE = Acr/Aca results file user wants parsed
OUT_FILE = file name to use for output (might not be exactly as they entered)
PRINT_STRICT = whether user wants to strict results
PRINT_LAX = whether user wants to lax results
queue - contains synchronized data (such as a list of loci indices that should be kept)
Returns: None
'''
def use_pai_db_on_acr(candidateAcrs, PRINT_STRICT, PRINT_LAX, PAI_DB_FILE, queue):
PAI_NC_ID_maps_START_END = defaultdict(lambda: list())
parse_pai_file(PAI_NC_ID_maps_START_END, PAI_DB_FILE) # parses PAI DB file to build PAI dict
paiNcids = list(PAI_NC_ID_maps_START_END.keys())
print('Total number of ncids found in PAI db: ' + str(len(paiNcids))) # prints number of unique ncids found in PAI file
limit_acr(candidateAcrs, PAI_NC_ID_maps_START_END, paiNcids, PRINT_STRICT, PRINT_LAX, queue, 'pai') # parses the input file and creates new files (lax/strict) with only the results that lie within/strictly in a known PAI