forked from corinnabrungs/msn_tree_library
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpubchem_client.py
516 lines (424 loc) · 16.8 KB
/
pubchem_client.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
import datetime as dt
import logging
# from diskcache import Cache
# cache = Cache("tmpcache")
import pandas as pd
import pubchempy
from joblib import Memory
from pubchempy import Compound, get_compounds
import pandas_utils
import synonyms
from date_utils import create_expired_entries_dataframe, iso_datetime_now
from meta_constants import MetaColumns
from pandas_utils import (
notnull,
isnull,
update_dataframes,
get_or_else,
make_str_floor_to_int_number,
get_first_or_else,
notnull_not_empty,
isnull_or_empty,
)
from tqdm import tqdm
from rdkit_mol_identifiers import ensure_smiles_column
# Register tqdm with pandas so that the `progress_apply` calls used throughout this module are available
tqdm.pandas()
# Set the log level of the "pubchempy" logger to DEBUG
logging.getLogger("pubchempy").setLevel(logging.DEBUG)
# joblib cache created with the location "memcache"; it backs every `@memory.cache` decorator in this module
memory = Memory("memcache")
def prepend_pubchem_synonyms(df: pd.DataFrame) -> pd.DataFrame:
    """
    Retrieve the synonyms for every entry of the "pubchem" column and prepend them to the synonyms column.

    This is best effort: any error is logged and the data frame is returned in the state it had when
    the error occurred.
    :param df: data frame with a "pubchem" column (presumably pubchempy compounds - see pubchem_get_synonyms)
    :return: the data frame, with PubChem synonyms prepended when the retrieval succeeded
    """
    try:
        pc_synonyms = df["pubchem"].progress_apply(pubchem_get_synonyms)
        df = synonyms.ensure_synonyms_column(df)
        df = synonyms.add_synonyms_columns(df, new_synonyms=pc_synonyms, prepend=True)
    except Exception:
        # Exception rather than a bare except: KeyboardInterrupt and SystemExit must still be able
        # to stop a long running retrieval
        logging.exception("No synonyms")
    return df
# columns that may contain a PubChem CID, tried in this order
cid_search_columns = [MetaColumns.pubchem_cid, MetaColumns.input_pubchem_cid]


def pubchem_search_by_cid(row):
    """
    Resolve the PubChem compound of a row by one of its CID columns.

    A non-null value that is already present in row["pubchem"] is returned unchanged. Otherwise the
    columns listed in cid_search_columns are tried in order and the first non-null result wins.
    :param row: data frame row with a "pubchem" entry
    :return: the compound or None if no CID column led to a result
    """
    known = row["pubchem"]
    if notnull(known):
        return known

    for cid_column in cid_search_columns:
        if cid_column not in row:
            continue
        found = pubchem_by_cid(row[cid_column])
        if notnull(found):
            return found
    return None
# columns that are searched as a PubChem name, tried in this order
name_search_columns = [
    MetaColumns.compound_name,
    MetaColumns.cas,
    MetaColumns.input_name,
    "product_name",
]


def pubchem_search_by_names(row) -> Compound | None:
    """
    Resolve the PubChem compound of a row by searching its name-like columns.

    A non-null value that is already present in row["pubchem"] is returned unchanged. Otherwise the
    columns listed in name_search_columns are tried in order; only non-empty strings are searched and
    the first non-null result wins.
    :param row: data frame row with a "pubchem" entry
    :return: the compound (not a string, as the previous annotation claimed) or None if nothing matched
    """
    known = row["pubchem"]
    if notnull(known):
        return known
    for col in name_search_columns:
        value = get_or_else(row, col)
        if isinstance(value, str) and len(value) > 0:
            compound = search_pubchem_by_name(value)
            if notnull(compound):
                return compound
    return None
def transform_pubchem_columns(
    filtered: pd.DataFrame, apply_structures: bool
) -> pd.DataFrame:
    """
    Spread the attributes of the entries in the "pubchem" column over metadata columns.

    Writes the CID, IUPAC name and xlogp, prepends the PubChem synonyms and uses the first synonym as
    compound name. Every row is expected to hold a compound; the passed frame is modified.
    :param filtered: data frame whose "pubchem" column holds a compound in every row
    :param apply_structures: also write the isomeric and canonical SMILES columns
    :return: the data frame with the additional columns
    """

    def attribute_values(attribute: str) -> list:
        # reads the column on every call because `filtered` is rebound between the steps below
        return [getattr(entry, attribute) for entry in filtered["pubchem"]]

    filtered[MetaColumns.pubchem_cid] = attribute_values("cid")
    filtered = make_str_floor_to_int_number(filtered, MetaColumns.pubchem_cid)
    filtered[MetaColumns.iupac] = attribute_values("iupac_name")
    filtered["pubchem_logp"] = attribute_values("xlogp")

    filtered = prepend_pubchem_synonyms(filtered)
    filtered = synonyms.use_first_synonym_as_compound_name(filtered)

    if not apply_structures:
        return filtered

    filtered[MetaColumns.isomeric_smiles] = attribute_values("isomeric_smiles")
    filtered[MetaColumns.canonical_smiles] = attribute_values("canonical_smiles")
    return filtered
def split_label_structure_sources(df: pd.DataFrame, source_name: str) -> pd.DataFrame:
    """
    Label the structures that came from PubChem with their source and keep the input structures.

    If no row has an input SMILES, the whole frame is passed through ensure_smiles_column and labeled
    with source_name. Otherwise the rows with PubChem SMILES are copied, labeled, their SMILES reset and
    refilled by ensure_smiles_column, and placed in front of the rows with input SMILES.
    :param df: data frame with the smiles, isomeric_smiles and canonical_smiles columns
    :param source_name: value written to the structure source column
    :return: the labeled data frame; rows may occur twice (PubChem version first, input version second)
    """
    with_input_smiles = df[MetaColumns.smiles].notnull()
    input_smiles_df = df[with_input_smiles].copy()

    if len(input_smiles_df) == 0:
        # no structures were available before - just copy the structures over
        df = ensure_smiles_column(df)
        df[MetaColumns.structure_source] = source_name
        return df

    # concat the new structures with the old ones
    with_pubchem_smiles = (
        df[MetaColumns.isomeric_smiles].notnull()
        | df[MetaColumns.canonical_smiles].notnull()
    )
    pubchem_smiles_df = df[with_pubchem_smiles].copy()
    pubchem_smiles_df[MetaColumns.structure_source] = source_name
    pubchem_smiles_df[MetaColumns.smiles] = None
    pubchem_smiles_df = ensure_smiles_column(pubchem_smiles_df)

    # pubchem rows first: priority pubchem over input later when drop duplicates
    return pd.concat(
        [pubchem_smiles_df, input_smiles_df], ignore_index=False, sort=False
    )
def pubchem_search_structure_by_cid(
    df: pd.DataFrame,
    apply_structures: bool,
    refresh_expired_entries_after: dt.timedelta = dt.timedelta(days=90),
) -> pd.DataFrame:
    """
    Look up PubChem compounds by the CID columns for all rows whose CID search is expired or missing.

    The "pubchem" column of df is reset to None first. The returned frame keeps the "pubchem" column
    so that a later name search can skip rows that were already resolved.
    :param df: metadata data frame, modified in place ("pubchem" column)
    :param apply_structures: copy the PubChem structures and label their source as "pubchem_cid"
    :param refresh_expired_entries_after: age after which a previous search is repeated
    :return: df if nothing had to be searched, otherwise df updated with the new results
    """
    logging.info("Search PubChem by pubchem_cid")
    df["pubchem"] = None

    # only work on rows that are old or were not searched before
    filtered = create_expired_entries_dataframe(
        df, MetaColumns.date_pubchem_cid_search, refresh_expired_entries_after
    )
    if len(filtered) == 0:
        # no need to update
        return df

    filtered["pubchem"] = filtered.progress_apply(pubchem_search_by_cid, axis=1)
    resolved = filtered["pubchem"].notnull()
    filtered = filtered[resolved].copy()

    # refresh date
    filtered[MetaColumns.date_pubchem_cid_search] = iso_datetime_now()

    # create the pubchem columns, structures only on request
    filtered = transform_pubchem_columns(filtered, apply_structures=apply_structures)
    if apply_structures:
        filtered = split_label_structure_sources(filtered, source_name="pubchem_cid")

    # combine new data with old rows that were not processed
    # keep pubchem to limit name search
    return update_dataframes(filtered, df)
def pubchem_search_parent(
    df: pd.DataFrame,
    apply_structures: bool,
    refresh_expired_entries_after: dt.timedelta = dt.timedelta(days=90),
) -> pd.DataFrame:
    """
    Replace the PubChem entries by their parent compounds for rows whose parent search is expired or missing.

    The input CID column is filled from the CID column where it is empty, the parent CID is retrieved
    for it and written to the CID column, and the parent compound is loaded to fill the PubChem columns.
    The "pubchem" column of df is reset to None first.
    :param df: metadata data frame, modified in place ("pubchem" column)
    :param apply_structures: copy the PubChem structures and label their source as "parent_pubchem_cid"
    :param refresh_expired_entries_after: age after which a previous search is repeated
    :return: df if nothing had to be searched, otherwise df updated with the new results
    """
    logging.info("Search PubChem parents pubchem_cid")
    df["pubchem"] = None

    # only work on rows that are old or were not searched before
    filtered = create_expired_entries_dataframe(
        df, MetaColumns.date_pubchem_parent_cid_search, refresh_expired_entries_after
    )
    if len(filtered) == 0:
        # no need to update
        return df

    # make sure both CID columns exist
    for cid_column in (MetaColumns.input_pubchem_cid, MetaColumns.pubchem_cid):
        if cid_column not in filtered:
            filtered[cid_column] = None

    # prefer the input CID and fall back to the CID found by an earlier search
    start_cids = []
    for input_cid, cid in zip(
        filtered[MetaColumns.input_pubchem_cid], filtered[MetaColumns.pubchem_cid]
    ):
        start_cids.append(input_cid if notnull_not_empty(input_cid) else cid)
    filtered[MetaColumns.input_pubchem_cid] = start_cids

    # get parent CIDs, filter out failed, map to compound, apply all columns
    parent_cids = filtered[MetaColumns.input_pubchem_cid].progress_apply(
        lambda cid: get_pubchem_parent_cid(cid)
    )
    filtered[MetaColumns.pubchem_cid] = parent_cids
    has_parent = filtered[MetaColumns.pubchem_cid].notnull()
    filtered = filtered[has_parent].copy()

    filtered["pubchem"] = filtered[MetaColumns.pubchem_cid].progress_apply(
        pubchem_by_cid
    )
    has_compound = filtered["pubchem"].notnull()
    filtered = filtered[has_compound].copy()

    # refresh date
    filtered[MetaColumns.date_pubchem_parent_cid_search] = iso_datetime_now()

    # create the pubchem columns, structures only on request
    filtered = transform_pubchem_columns(filtered, apply_structures=apply_structures)
    if apply_structures:
        filtered = split_label_structure_sources(
            filtered, source_name="parent_pubchem_cid"
        )

    # combine new data with old rows that were not processed
    # keep pubchem to limit name search
    return update_dataframes(filtered, df)
def pubchem_search_structure_by_name(
    df: pd.DataFrame,
    refresh_expired_entries_after: dt.timedelta = dt.timedelta(days=90),
) -> pd.DataFrame:
    """
    Look up PubChem compounds by name for rows whose name search is expired or missing.

    Rows that already hold a value in the "pubchem" column are skipped. Structures are always copied
    and labeled with the source "pubchem_name". The "pubchem" column is dropped from the result.
    :param df: metadata data frame; a "pubchem" column is added in place if it is missing
    :param refresh_expired_entries_after: age after which a previous search is repeated
    :return: df if nothing had to be searched, otherwise df updated with the new results
    """
    logging.info("Search PubChem by name")
    if "pubchem" not in df.columns:
        df["pubchem"] = None

    # only work on rows that are old or were not searched before
    filtered = create_expired_entries_dataframe(
        df, MetaColumns.date_pubchem_name_search, refresh_expired_entries_after
    )
    if len(filtered) == 0:
        # no need to update
        return df

    # apply search but limit to those without pubchem results from CID search
    unresolved = filtered["pubchem"].isnull()
    filtered = filtered[unresolved].copy()
    filtered["pubchem"] = filtered.progress_apply(pubchem_search_by_names, axis=1)
    resolved = filtered["pubchem"].notnull()
    filtered = filtered[resolved].copy()

    # refresh date
    filtered[MetaColumns.date_pubchem_name_search] = iso_datetime_now()

    # create the new columns including the structures
    filtered = transform_pubchem_columns(filtered, apply_structures=True)
    filtered = split_label_structure_sources(filtered, source_name="pubchem_name")

    # combine new data with old rows that were not processed
    # clear pubchem to allow CID and structure search on all of them
    merged = update_dataframes(filtered, df)
    merged = merged.drop(columns=["pubchem"], errors="ignore")
    return make_str_floor_to_int_number(merged, MetaColumns.pubchem_cid)
def pubchem_search_by_structure(
    df: pd.DataFrame,
    refresh_expired_entries_after: dt.timedelta = dt.timedelta(days=90),
) -> pd.DataFrame:
    """
    Look up PubChem compounds by structure for rows whose structure search is expired or missing.

    Rows that already hold a value in the "pubchem" column are skipped. The search uses the smiles,
    inchi and inchikey columns of each row. Structures are not copied from PubChem and the "pubchem"
    column is dropped from the result.
    :param df: metadata data frame; a "pubchem" column is added in place if it is missing
    :param refresh_expired_entries_after: age after which a previous search is repeated
    :return: df if nothing had to be searched, otherwise df updated with the new results
    """
    logging.info("Search PubChem by structure")
    if "pubchem" not in df.columns:
        df["pubchem"] = None

    # only work on rows that are old or were not searched before
    filtered = create_expired_entries_dataframe(
        df, MetaColumns.date_pubchem_structure_search, refresh_expired_entries_after
    )
    if len(filtered) == 0:
        # no need to update
        return df

    def search_row(row):
        return search_pubchem_by_structure(
            row[MetaColumns.smiles], row[MetaColumns.inchi], row[MetaColumns.inchikey]
        )

    unresolved = filtered["pubchem"].isnull()
    filtered = filtered[unresolved].copy()
    filtered["pubchem"] = filtered.progress_apply(search_row, axis=1)
    resolved = filtered["pubchem"].notnull()
    filtered = filtered[resolved].copy()

    # refresh date
    filtered[MetaColumns.date_pubchem_structure_search] = iso_datetime_now()

    # create the columns, do not copy structures as they are already cleaned by this script
    filtered = transform_pubchem_columns(filtered, apply_structures=False)

    # combine new data with old rows that were not processed
    # clear pubchem to allow CID and structure search on all of them
    merged = update_dataframes(filtered, df)
    return merged.drop(columns=["pubchem"], errors="ignore")
def pubchem_compound_score(comp: Compound):
    """
    Score a compound for ranking search results - higher is better.

    SMILES might contain . to denote salts, so every . in the canonical SMILES lowers the score by one.
    :param comp: compound with a canonical_smiles attribute
    :return: 0 if there is no canonical SMILES, otherwise 1000 minus the number of . characters
    """
    smiles = comp.canonical_smiles
    return 1000 - str(smiles).count(".") if smiles else 0
def search_pubchem_by_name(name_or_cas: str) -> Compound | None:
    """
    Search PubChem for a compound by name.

    In pubchem many entries contain the cas as an alternative name - so searching for cas in name works often.
    Answers from PubChem (including "no entry") are cached; a failed request is logged and NOT cached,
    so the search is repeated on the next call instead of returning a stale None forever.
    :param name_or_cas: input name or cas
    :return: the best scoring compound (see pubchem_compound_score), or None if the input is empty,
        nothing was found or the request failed
    """
    if isnull_or_empty(name_or_cas):
        return None
    try:
        return _search_pubchem_by_name_cached(name_or_cas)
    except Exception:
        # Exception rather than a bare except so that KeyboardInterrupt/SystemExit still stop the run
        logging.warning(f"FAILED PUBCHEM by name {name_or_cas}")
        return None


@memory.cache
def _search_pubchem_by_name_cached(name_or_cas: str) -> Compound | None:
    """
    Cached name search. Request errors are raised on purpose: the cache only stores returned values,
    so raising keeps a temporary failure out of the cache.
    :param name_or_cas: non-empty name or cas
    :return: the best scoring compound or None if PubChem has no entry with this name
    """
    compounds = get_compounds(name_or_cas, "name")
    if not compounds:
        logging.info("Pubchem has no entry named:{}".format(name_or_cas))
        return None
    # first compound with the highest score - the same pick as a stable descending sort
    return max(compounds, key=pubchem_compound_score)
def pubchem_by_cid(cid) -> Compound | None:
    """
    Load a PubChem compound by its CID, using the cache.

    Errors are logged and turned into None so that a single failing CID does not stop a batch.
    :param cid: PubChem compound id
    :return: the compound or None if the CID is empty or the retrieval failed
    """
    try:
        return _pubchem_by_cid_cached(cid)
    except Exception:
        # Exception rather than a bare except so that KeyboardInterrupt/SystemExit still stop the run
        logging.exception("Cannot retrieve PUBCHEM CID {}".format(cid))
        return None
@memory.cache
def _pubchem_by_cid_cached(cid) -> Compound | None:
    """
    Cached variant of _pubchem_by_cid. Exceptions of the lookup pass through to the caller.
    :param cid: PubChem compound id
    :return: the result of _pubchem_by_cid
    """
    compound = _pubchem_by_cid(cid)
    return compound
def _pubchem_by_cid(cid, ntry=1, max_tries=10) -> Compound | None:
    """
    Load a compound from PubChem by CID and retry on errors.
    :param cid: PubChem compound id
    :param ntry: number of the current try
    :param max_tries: maximum number of tries
    :return: the compound or None for an empty CID
    :raises Exception: the error of the last try once max_tries is reached
    """
    if isnull_or_empty(cid):
        return None

    attempt = ntry
    while True:
        try:
            return pubchempy.Compound.from_cid(cid)
        except Exception:
            if attempt >= max_tries:
                raise
            attempt += 1
def search_pubchem_by_structure(
    smiles=None, inchi=None, inchikey=None
) -> Compound | None:
    """
    Search PubChem for a compound by its structure identifiers.

    The identifiers are tried in the order inchikey, smiles, inchi; the next one is only used while no
    compounds were found. Among the found compounds the best scoring one is returned
    (see pubchem_compound_score).
    :param smiles: SMILES of the structure
    :param inchi: InChI of the structure
    :param inchikey: InChIKey of the structure
    :return: the best scoring compound or None if all inputs are empty, nothing was found or the search failed
    """
    if isnull_or_empty(smiles) and isnull_or_empty(inchi) and isnull_or_empty(inchikey):
        return None

    compounds = None
    try:
        if notnull_not_empty(inchikey):
            compounds = get_pubchem_compound(inchikey, MetaColumns.inchikey)
        if not compounds and notnull_not_empty(smiles):
            compounds = get_pubchem_compound(smiles, MetaColumns.smiles)
        if not compounds and notnull_not_empty(inchi):
            compounds = get_pubchem_compound(inchi, MetaColumns.inchi)

        if not compounds:
            logging.info(
                "NO PUBCHEM FOR: smiles:{} inchi:{} inchikey:{}".format(
                    smiles, inchi, inchikey
                )
            )
            return None
        compounds.sort(key=pubchem_compound_score, reverse=True)
        return compounds[0]
    except Exception:
        # Exception rather than a bare except so that KeyboardInterrupt/SystemExit still stop the run
        logging.warning(
            "FAILED PUBCHEM FOR: smiles:{} inchi:{} inchikey:{}".format(
                smiles, inchi, inchikey
            )
        )
        return None
def get_pubchem_compound(value, key):
    """
    Search PubChem compounds for a value in the given namespace, using the cache.

    The cached lookup raises when it has no result (so that failures are not cached); that case and
    any other error are logged and turned into None here.
    :param value: identifier to search for
    :param key: the kind of identifier, passed on to the PubChem search
    :return: the search result or None on failure
    """
    try:
        return _get_pubchem_compound_cached(value, key)
    except Exception:
        # Exception rather than a bare except so that KeyboardInterrupt/SystemExit still stop the run
        logging.info(f"Failed pubchem by {value} as {key}")
        return None
@memory.cache
def _get_pubchem_compound_cached(value, key):
    """
    Cached variant of _get_pubchem_compound that raises instead of returning None.
    :param value: identifier to search for
    :param key: the kind of identifier
    :return: the search result, never None
    :raises Exception: if the lookup returned None
    """
    compounds = _get_pubchem_compound(value, key)
    if compounds is not None:
        return compounds
    raise Exception("failed pubchem client by key value")
def _get_pubchem_compound(value, key, ntry=1, max_tries=10):
    """
    Search PubChem compounds for a value in the given namespace and retry on errors.
    :param value: identifier to search for
    :param key: the kind of identifier, passed to get_compounds as namespace
    :param ntry: number of the current try
    :param max_tries: maximum number of tries
    :return: the result of get_compounds, or None for an empty value or when all tries failed
    """
    if isnull_or_empty(value):
        return None
    try:
        return get_compounds(value, key)
    except Exception:
        # Exception rather than a bare except: a KeyboardInterrupt must stop the run and not
        # trigger another try
        if ntry < max_tries:
            return _get_pubchem_compound(value, key, ntry + 1, max_tries)
        logging.warning("FAILED PUBCHEM FOR: {} (as {})".format(value, key))
        return None
def pubchem_get_synonyms(compound):
    """
    Retrieve the synonyms of a compound, using the cache.

    Errors are logged and turned into None so that a single failing compound does not stop a batch.
    :param compound: compound object whose synonyms attribute is read
    :return: the synonyms or None if the retrieval failed
    """
    try:
        return _pubchem_get_synonyms_cached(compound)
    except Exception:
        # Exception rather than a bare except so that KeyboardInterrupt/SystemExit still stop the run.
        # getattr keeps the handler itself from failing on objects without a cid attribute.
        cid = getattr(compound, "cid", None)
        logging.exception(f"FAILED to retrieve PUBCHEM synonyms for compound {cid}")
        return None
@memory.cache
def _pubchem_get_synonyms_cached(compound):
    """
    Cached variant of _pubchem_get_synonyms. Exceptions of the retrieval pass through to the caller.
    :param compound: compound object
    :return: the result of _pubchem_get_synonyms
    """
    return _pubchem_get_synonyms(compound)
def _pubchem_get_synonyms(compound, try_n=1, max_tries=10):
    """
    Read the synonyms of a compound and retry on errors.
    :param compound: compound object whose synonyms attribute is read
    :param try_n: number of the current try
    :param max_tries: maximum number of tries
    :return: the synonyms; an empty list for a null compound or when the synonyms are None
    :raises Exception: the error of the last try once max_tries is reached
    """
    if isnull(compound):
        return []

    attempt = try_n
    while True:
        try:
            names = compound.synonyms
        except Exception:
            if attempt >= max_tries:
                raise
            attempt += 1
        else:
            return [] if names is None else names
def get_pubchem_parent_cid(cid, orphans_as_self=True) -> str | None:
    """
    Retrieve the CID of the parent compound of a PubChem CID, using the cache.

    Answers from PubChem are cached. A request that still fails after all retries is logged and
    returns None WITHOUT being cached, so that the lookup is redone on the next run.
    :param cid: PubChem compound id
    :param orphans_as_self: return cid itself (instead of None) if no parent was found
    :return: the parent CID, cid for orphans if orphans_as_self, or None on failure
    """
    try:
        return _get_pubchem_parent_cid_cached(cid, orphans_as_self)
    except Exception:
        logging.exception("Error getting parent of {}".format(cid))
        return None  # error return None to redo later


@memory.cache
def _get_pubchem_parent_cid_cached(cid, orphans_as_self=True) -> str | None:
    """
    Cached variant of _get_pubchem_parent_cid. Exceptions pass through to the caller and are
    therefore never written to the cache.
    """
    return _get_pubchem_parent_cid(cid, orphans_as_self)


def _get_pubchem_parent_cid(
    cid, orphans_as_self=True, try_n=1, max_tries=10
) -> str | None:
    """
    From a pubchem_cid, retrieve the parent compound's cid.
    If the function is unsuccessful in retrieving a single parent,
    `orphans_as_self = True` returns `cid` rather than None.

    According to PubChem:
    > A parent is conceptually the "important" part of the molecule
    > when the molecule has more than one covalent component.
    > Specifically, a parent component must have at least one carbon
    > and contain at least 70% of the heavy (non-hydrogen) atoms of
    > all the unique covalent units (ignoring stoichiometry).
    > Note that this is a very empirical definition and is subject to change.

    A parallel query can be executed using the REST PUG API:
    http://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/11477084/cids/XML?cids_type=parent

    :param cid: PubChem compound id
    :param orphans_as_self: return cid itself (instead of None) if no parent was found
    :param try_n: number of the current try
    :param max_tries: maximum number of tries
    :return: the parent CID as string, cid for orphans if orphans_as_self, or None for an empty cid
        or a bad request
    :raises Exception: the error of the last try once max_tries is reached, so that the caller can
        keep the failure out of the cache
    """
    if isnull_or_empty(cid):
        return None

    try:
        parent_cids = pubchempy.get_cids(
            identifier=cid, namespace="cid", domain="compound", cids_type="parent"
        )
    except pubchempy.BadRequestError:
        # retrying the same bad request is pointless
        logging.info("Error getting parent of {}".format(cid))
        return None
    except Exception:
        if try_n < max_tries:
            return _get_pubchem_parent_cid(cid, orphans_as_self, try_n + 1, max_tries)
        raise

    try:
        if len(parent_cids) > 0:
            return str(parent_cids[0])
        return cid if orphans_as_self else None
    except Exception:
        # e.g. a response that is not a sequence
        logging.exception(
            "Error getting parent of {}. Parents retrieved: {}".format(cid, parent_cids)
        )
        return cid if orphans_as_self else None