-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprepare_enum_conf_lists.py
executable file
·118 lines (90 loc) · 4.48 KB
/
prepare_enum_conf_lists.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env python
# Copyright 2022 Informatics Matters Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse, os
import utils
from utils import get_path_from_digest
from dm_job_utilities.dm_log import DmLog
def _open_output(path):
    """Run ``utils.expand_path`` on *path*, then open it for writing (truncating)."""
    # NOTE(review): expand_path presumably creates missing parent directories
    # for the output file - confirm against utils.
    utils.expand_path(path)
    return open(path, 'w')


def prepare_lists(infile, outfile_enum, outfile_le_confs, data_dir, interval=0):
    """Work out which input molecules still need enumeration and/or conformers.

    Every line of ``infile`` is expected to hold at least three tab-separated
    fields: SMILES, id and digest (extra fields are ignored). The digest is
    mapped to a directory below ``data_dir`` via ``get_path_from_digest`` and
    that directory is checked for the expected result files:

    * ``<digest>.smi`` and ``<digest>.sdf.gz`` - both must exist, otherwise the
      record is written to ``outfile_enum``.
    * ``<digest>_le_confs.sdf.gz`` - must exist, otherwise the record is
      written to ``outfile_le_confs``.

    Records are written as ``smiles<TAB>id<TAB>digest``. Both output files are
    always (re)created, even when nothing is written to them.

    :param infile: path of the tab-separated input file
    :param outfile_enum: output path for records that need enumeration
    :param outfile_le_confs: output path for records that need low energy conformers
    :param data_dir: root directory that the digest-derived paths are joined onto
    :param interval: emit a progress event every ``interval`` input lines;
        ``0`` or ``None`` disables progress reporting
    :return: tuple ``(total, existing_enum, existing_le_confs, count_enum,
        count_le_confs, duplicates, errors)`` where ``total`` is the number of
        lines read, ``duplicates`` counts repeated SMILES and ``errors`` counts
        lines that were malformed or whose data directory does not exist
    """
    total = 0
    existing_enum = 0
    existing_le_confs = 0
    count_enum = 0
    count_le_confs = 0
    duplicates = 0
    errors = 0
    seen_smiles = set()

    DmLog.emit_event('Processing file', infile)

    # Opened left to right: the input first, then each output is prepared and
    # opened in turn, so a missing input fails before any output is touched.
    with open(infile) as inf, \
            _open_output(outfile_enum) as outenum, \
            _open_output(outfile_le_confs) as outleconfs:
        for total, line in enumerate(inf, start=1):
            if interval and total % interval == 0:
                DmLog.emit_event("Processed {} records".format(total))

            tokens = line.strip().split('\t')
            if len(tokens) < 3:
                # A blank or truncated line used to raise IndexError and abort
                # the whole run; treat it like any other unusable record.
                utils.log('WARNING, malformed record at line', total, 'skipped')
                errors += 1
                continue
            smi, uid, digest = tokens[:3]

            path = os.path.join(data_dir, *get_path_from_digest(digest))
            if not os.path.isdir(path):
                utils.log('WARNING, path', path, 'not found')
                errors += 1
                continue

            # Only the first occurrence of a SMILES is considered. This check
            # deliberately comes after the directory check so that the error
            # and duplicate counts keep their existing meaning.
            if smi in seen_smiles:
                duplicates += 1
                continue
            seen_smiles.add(smi)

            record = '\t'.join((smi, uid, digest)) + '\n'
            tgt_enum_smi = os.path.join(path, digest + '.smi')
            tgt_enum_sdf = os.path.join(path, digest + '.sdf.gz')
            tgt_le_confs = os.path.join(path, digest + '_le_confs.sdf.gz')

            if os.path.exists(tgt_enum_smi) and os.path.exists(tgt_enum_sdf):
                existing_enum += 1
            else:
                outenum.write(record)
                count_enum += 1

            if os.path.exists(tgt_le_confs):
                existing_le_confs += 1
            else:
                outleconfs.write(record)
                count_le_confs += 1

    return total, existing_enum, existing_le_confs, count_enum, count_le_confs, duplicates, errors
def main():
    """Parse the command line, run ``prepare_lists`` and report the outcome.

    The summary counts are emitted as a single ``DmLog`` event and the number
    of input records read is passed to ``DmLog.emit_cost``.
    """
    # Example:
    #   python3 prepare_enum_conf_lists.py -i foo.smi --outfile-enum need-enum.smi --outfile-confs need-confs.smi

    # Command line argument definitions
    parser = argparse.ArgumentParser(description='Prepare enumeration and conformer lists')
    parser.add_argument('-i', '--input', required=True, help="File with inputs")
    parser.add_argument('--outfile-enum', default='need-enum.smi',
                        help="Output file for molecules needing enumeration")
    parser.add_argument('--outfile-confs', default='need-confs.smi',
                        help="Output file for molecules needing low energy 3D conformer generation")
    parser.add_argument('-d', '--data-dir', default='molecules/sha256', help="Directory with sharded data")
    # No default: an omitted --interval arrives as None, which prepare_lists
    # treats as "no progress reporting".
    parser.add_argument("--interval", type=int, help="Reporting interval")

    args = parser.parse_args()
    utils.log("prepare_enum_conf_lists.py: ", args)

    total, existing_enum, existing_confs, count_enum, count_confs, duplicates, errors = prepare_lists(
        args.input, args.outfile_enum, args.outfile_confs, args.data_dir, interval=args.interval)

    # Adjacent literals with explicit separators. A backslash continuation
    # inside one literal would make the message depend on how the source
    # lines happen to be indented.
    tmpl = ('Processed {} records. {} duplicates, {} errors. '
            '{} already enumerated, {} already have low energy conformers, '
            '{} need enumeration, {} need low energy conformers generated')
    DmLog.emit_event(tmpl.format(
        total, duplicates, errors, existing_enum, existing_confs, count_enum, count_confs))
    DmLog.emit_cost(total)
# Run the command-line entry point only when this file is executed as a
# script; importing the module has no side effects.
if __name__ == "__main__":
    main()