Skip to content

Commit

Permalink
Docking study
Browse files Browse the repository at this point in the history
  • Loading branch information
YifanDengWHU committed Jul 17, 2024
1 parent 67612bc commit 7ff2c65
Show file tree
Hide file tree
Showing 171 changed files with 578 additions and 0 deletions.
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import sys

# Script: stacked histograms of per-program docking scores for one target.
# Usage: pass the target label (e.g. "low") as the first CLI argument; it
# selects the score pickle and the names of the generated-compound sets.
targ = sys.argv[1]

# get raw scores
df1 = pd.read_pickle('scores_'+targ+'_ranking_ECR.pkl')
df2 = df1.reset_index()

# get pickle with compound subset labels
df = pd.read_pickle('docking_results_llm_high_medium_low_smi.pkl')
df = df[['molid','compound_set','smiles']]

# add the compound subset labels to raw scores
df3 = df2.merge(df, on='molid', how='left')

# remove cpds with imputed (mean) scores
# NOTE(review): both filters are assumed to apply only to the "low" target
# (the thresholds look target-specific) -- confirm against the original script.
if targ == "low":
    df3 = df3.loc[~((df3['fred'] <= -9.457) & (df3['fred'] >= -9.459))]
    df3 = df3.loc[~((df3['gnina'] <= -6.100895) & (df3['gnina'] >= -6.100897))]

# add mean_qt score; assign() returns a new frame, so we never write a column
# into a filtered slice of the merged table
df3 = df3.assign(
    mean_qt=df3[['fred_qt','gnina_qt','plants_qt','rdock_qt']].mean(axis=1)
)

# plot the docking score distributions for each program
fig, axes = plt.subplots(2, 3, figsize=(24,16), sharey=False, sharex=False)

progs = ['fred_qt','gnina_qt','plants_qt','rdock_qt','mean_qt','ECR_score']
hue_list = ['FDA', 'llm_background', 'llm_gen_ens_'+targ, 'llm_gen_ref_'+targ]

# bounds applied to every *_qt column to keep outliers out of the histograms
upper_bound = 3.75
lower_bound = -3.75

# Keep only the compound sets being compared.  The mask must be built from
# df3 itself: a mask built from the label table `df` carries a different
# index and would select the wrong rows (or fail to align at all).
df_sets = df3.loc[df3['compound_set'].isin(hue_list)]

for prog, ax in zip(progs, axes.flat):

    df_temp = df_sets

    if prog != 'ECR_score':
        # set reasonable bounds to avoid inclusion of outliers
        df_temp = df_temp.loc[(df_temp[prog] <= upper_bound) & (df_temp[prog] >= lower_bound)]
        # x-limits are passed as (upper, lower), which reverses the x-axis
        ax.set_xlim((upper_bound, lower_bound))
        ax.set_ylim((0, 90))
    else:
        ax.set_ylim((0, 100))

    sns.histplot(
        data=df_temp, x=prog, hue='compound_set',
        hue_order=hue_list, bins=100, multiple='stack',
        palette='bright', ax=ax
    )

    ax.set_title(prog + " docking scores, targ="+targ)

fig.tight_layout(pad=5.0)
fig.savefig("docking_all_ecr_qt_qtmean_scores_"+targ+".png", dpi=800)
plt.close(fig)

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import sys

# Script: layered histograms of per-program docking scores for one target,
# with background sets (FDA, LLM background) on the left y-axis and the
# generated sets (ref, ensemble) on a twin right y-axis.
# Usage: pass the target label (e.g. "low") as the first CLI argument.
targ = sys.argv[1]

# get raw scores
df1 = pd.read_pickle('scores_'+targ+'_ranking_ECR.pkl')
df2 = df1.reset_index()

# get pickle with compound subset labels
df = pd.read_pickle('docking_results_llm_high_medium_low_smi.pkl')
df = df[['molid','compound_set','smiles']]

# add the compound subset labels to raw scores
df3 = df2.merge(df, on='molid', how='left')

# remove cpds with imputed (mean) scores
# NOTE(review): both filters are assumed to apply only to the "low" target
# (the thresholds look target-specific) -- confirm against the original script.
if targ == "low":
    df3 = df3.loc[~((df3['fred'] <= -9.457) & (df3['fred'] >= -9.459))]
    df3 = df3.loc[~((df3['gnina'] <= -6.100895) & (df3['gnina'] >= -6.100897))]

# add mean_qt score; assign() returns a new frame, so we never write a column
# into a filtered slice of the merged table
df3 = df3.assign(
    mean_qt=df3[['fred_qt','gnina_qt','plants_qt','rdock_qt']].mean(axis=1)
)

# plot the docking score distributions for each program
fig, axes = plt.subplots(2, 3, figsize=(24,16), sharey=False, sharex=False)

progs = ['fred_qt','gnina_qt','plants_qt','rdock_qt','mean_qt','ECR_score']

hue_list = ['FDA', 'llm_background', 'llm_gen_ref_'+targ, 'llm_gen_ens_'+targ]
background_sets = hue_list[0:2]
active_sets = hue_list[2:4]

palette = {
    'FDA': 'tab:blue',
    'llm_background': 'tab:green',
    'llm_gen_ref_'+targ: 'tab:red',
    'llm_gen_ens_'+targ: 'tab:orange',
}

# bounds applied to every *_qt column to keep outliers out of the histograms
upper_bound = 3.75
lower_bound = -3.75

# Keep only the compound sets being compared.  The mask must be built from
# df3 itself: a mask built from the label table `df` carries a different
# index and would select the wrong rows (or fail to align at all).
df_sets = df3.loc[df3['compound_set'].isin(hue_list)]

for prog, ax1 in zip(progs, axes.flat):

    df_temp = df_sets

    if prog != 'ECR_score':
        # set reasonable bounds to avoid inclusion of outliers
        df_temp = df_temp.loc[(df_temp[prog] <= upper_bound) & (df_temp[prog] >= lower_bound)]
        # x-limits are passed as (upper, lower), which reverses the x-axis
        ax1.set_xlim((upper_bound, lower_bound))
        ax1.set_ylim((0, 90))
    else:
        ax1.set_ylim((0, 100))

    # compute bin intervals based on all scores so both layers share the same
    # edges; NaNs are dropped because np.histogram cannot derive a finite
    # range from data that contains them
    bins = np.histogram(df_temp[prog].dropna().values, bins=100)[1].tolist()

    # plot background cpds (FDA and LLM bg)
    df_temp1 = df_temp.loc[df_temp['compound_set'].isin(background_sets)]
    sns.histplot(
        data=df_temp1, x=prog, hue='compound_set',
        hue_order=background_sets, bins=bins, multiple='layer',
        alpha=0.5, palette=palette, ax=ax1
    )
    # move_legend raises when the axes has no legend (e.g. nothing was drawn)
    if ax1.get_legend() is not None:
        sns.move_legend(ax1, loc='upper left', title='compound sets background')

    ax1.set_title(prog + " docking scores, targ="+targ)

    # plot active cpds (ref and ensemble) on a secondary y-axis with its own
    # count scale
    df_temp2 = df_temp.loc[df_temp['compound_set'].isin(active_sets)]

    ax2 = ax1.twinx()

    sns.histplot(
        data=df_temp2, x=prog, hue='compound_set',
        hue_order=active_sets, bins=bins, multiple='layer',
        alpha=1.00, palette=palette, ax=ax2
    )
    if ax2.get_legend() is not None:
        sns.move_legend(ax2, loc='upper right', title='compound sets active')
    ax2.set_ylim((0, 10))

# dump multiplot image
fig.tight_layout(pad=5.0)
fig.savefig("docking_all_ecr_qt_qtmean_scores_"+targ+"_layer.png", dpi=800)
plt.close(fig)

Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@



from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd

# Script: one density histogram of ECR scores per target (low/medium/high),
# coloured by compound subset, written to a single PNG.
df = pd.read_pickle('docking_results_llm_high_medium_low_smi.pkl')

fig, axes = plt.subplots(3, 1, figsize=(10,15))

targets = ['low', 'medium', 'high']

for target, ax in zip(targets, axes):
    score_col = 'ECR_score_' + target
    hue_list = ['FDA', 'llm_background', 'llm_gen_ens_'+target, 'llm_gen_ref_'+target]

    # Drop compounds without a score for this target, then keep only the
    # compared subsets.  The second filter is applied to the dropna() result;
    # filtering the unfiltered `df` again would discard the NaN removal.
    df_temp = df.dropna(subset=[score_col])
    df_temp = df_temp.loc[df_temp['compound_set'].isin(hue_list)]

    # common_norm=False: each compound subset is normalised on its own
    sns.histplot(
        data=df_temp, x=score_col, stat='density', log_scale=False,
        common_norm=False, hue='compound_set', hue_order=hue_list,
        multiple='layer', ax=ax
    )
    ax.set_title(target+' target ECR scores by compound subset')

fig.tight_layout(pad=5.0)
fig.savefig("all_targets_ecr_score_kdes.png", dpi=800)
plt.close(fig)

Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@



from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd

# Script: one log-scaled density histogram of ECR scores per target
# (low/medium/high), coloured by compound subset, written to a single PNG.
df = pd.read_pickle('docking_results_llm_high_medium_low_smi.pkl')

fig, axes = plt.subplots(3, 1, figsize=(10,15))

targets = ['low', 'medium', 'high']

for target, ax in zip(targets, axes):
    score_col = 'ECR_score_' + target
    hue_list = ['FDA', 'llm_background', 'llm_gen_ens_'+target, 'llm_gen_ref_'+target]

    # Drop compounds without a score for this target, then keep only the
    # compared subsets.  The second filter is applied to the dropna() result;
    # filtering the unfiltered `df` again would discard the NaN removal.
    df_temp = df.dropna(subset=[score_col])
    df_temp = df_temp.loc[df_temp['compound_set'].isin(hue_list)]

    # common_norm=False: each compound subset is normalised on its own
    sns.histplot(
        data=df_temp, x=score_col, bins=100, stat='density',
        log_scale=True, common_norm=False, hue='compound_set', element='step',
        fill=True, alpha=0.5, palette='bright', hue_order=hue_list,
        multiple='layer', linewidth=0, ax=ax
    )
    sns.move_legend(ax, "upper left")
    ax.set_title(target+' target ECR scores by compound subset')

fig.tight_layout(pad=5.0)
fig.savefig("all_targets_ecr_score_pdfs_logscale.png", dpi=800)
plt.close(fig)

Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@

import sys
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import rdMolDraw2D

try:
import Image
except ImportError:
from PIL import Image
from io import BytesIO

def DrawMolsZoomed( mols, molsPerRow=3, subImgSize=(200, 200), legends=None ):
    """Draw each molecule on its own canvas and paste the tiles into a grid.

    Every molecule is rendered by a separate ``MolDraw2DCairo`` drawer of
    size ``subImgSize``, and the resulting tiles are pasted row by row into
    one RGBA image.

    Args:
        mols: sequence of RDKit molecules. ``None`` entries (e.g. from a
            SMILES string that failed to parse) leave their grid cell
            undrawn instead of raising.
        molsPerRow: number of tiles per grid row.
        subImgSize: ``(width, height)`` of one tile in pixels.
        legends: optional sequence of legend strings, indexed in parallel
            with ``mols``. When omitted, tiles are drawn without a legend.

    Returns:
        A PIL image of size
        ``(molsPerRow * width, ceil(len(mols) / molsPerRow) * height)``.
    """
    tile_w, tile_h = subImgSize[0], subImgSize[1]
    # ceiling division: a partially filled last row still needs a full row
    nRows = -(-len(mols) // molsPerRow)
    full_image = Image.new('RGBA', (molsPerRow * tile_w, nRows * tile_h))
    for ii, mol in enumerate(mols):
        if mol is None:
            # nothing to draw; keep the cell so later tiles stay in position
            continue
        if mol.GetNumConformers() == 0:
            # no coordinates yet: generate a 2D depiction before drawing
            AllChem.Compute2DCoords(mol)
        row, column = divmod(ii, molsPerRow)
        offset = (column * tile_w, row * tile_h)
        d2d = rdMolDraw2D.MolDraw2DCairo(tile_w, tile_h)
        d2d.drawOptions().legendFontSize = 20
        # the default legends=None previously failed on legends[ii]
        legend = legends[ii] if legends is not None else ''
        d2d.DrawMolecule(mol, legend=legend)
        d2d.FinishDrawing()
        # the drawer's output bytes are decoded back into a PIL image
        sub = Image.open(BytesIO(d2d.GetDrawingText()))
        full_image.paste(sub, box=offset)
    return full_image

# Script: draw one representative compound per cluster for a single target.
# Usage: <script> TARGET CLUSTER_CSV
#   e.g. CLUSTER_CSV = 'inhibitor_molecule_canonical_2024_06_13_rdkitcln_P_tidy_clusts_pop.csv'
targ = sys.argv[1]
incsv = sys.argv[2]

df1 = pd.read_csv(incsv)

# Keep the first row encountered for each cluster id of this target.
# NOTE(review): the output file is named "cluster_medoids", but the
# representative is simply the first member per cluster, not a row selected
# via a medoid flag -- confirm this is intended.
df2 = df1.loc[df1['target'] == targ]
df2 = df2.drop_duplicates(subset='cid_0.750', keep='first')
df2 = df2[['molid','rdkit_smiles_cln_protonated','cid_0.750','cpop_0.750']]
# astype() returns a new frame, so no column is written into a slice of df1
df2 = df2.astype({'cid_0.750': 'int', 'cpop_0.750': 'int'})
# largest clusters first; ties broken by ascending cluster id
df2 = df2.sort_values(by=['cpop_0.750','cid_0.750'], ascending=[False, True])

ms = []
ms_titles = []

# Cluster ids are unique after drop_duplicates, so a single pass over the
# rows yields the same values as a per-cluster lookup, without re-scanning
# the frame three times for every cluster.
rows = zip(
    df2['cid_0.750'], df2['cpop_0.750'],
    df2['molid'], df2['rdkit_smiles_cln_protonated'],
)
for clst_id, clst_pop, cid, smiles in rows:
    print( "clust_id:{}, clust_pop:{}, molid:{}, smiles:{}".format( clst_id, clst_pop, cid, smiles) )
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles returns None for SMILES it cannot parse; skip the
        # compound (molecule and legend together) so the two lists stay
        # aligned and the drawing step never receives None
        print( "WARNING: could not parse SMILES for molid {}; skipped".format(cid), file=sys.stderr )
        continue
    ms.append(mol)
    ms_titles.append( "clustID: "+str(clst_id)+" size: "+str(clst_pop)+" CID: "+str(cid) )

img = DrawMolsZoomed( ms, molsPerRow=6, subImgSize=(500,500), legends=ms_titles )
img.save( "cluster_medoids_"+targ+"_0.750_landscape.png")

1 change: 1 addition & 0 deletions docking_result/visual_cpds/0README-2024-06-26.boxnote
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"version":1970,"schema_version":1,"doc":{"type":"doc","attrs":{"table_of_contents":{"enabled":true,"allowedLevels":[1,2,3]}},"content":[{"type":"paragraph","content":[{"type":"text","marks":[{"type":"strong"},{"type":"author_id","attrs":{"authorId":"222648505"}}],"text":"NOTE"},{"type":"text","marks":[{"type":"author_id","attrs":{"authorId":"222648505"}}],"text":": landscape grid images of compounds show single representatives from each cluster. The cluster sizes (population) is also appended to the compound labels in the figures. The compounds are ordered by size of cluster--cluster sizes decreasing from left-to-right and top-to-bottom. I think these images would be more meaningful if I could include the reference compound. I added an additional image that shows the reference cpds and the original generated compound."}]},{"type":"paragraph"},{"type":"paragraph","content":[{"type":"text","marks":[{"type":"author_id","attrs":{"authorId":"222648505"}}],"text":"Clustering (HAC, average linkage, ECFP6 2024 bit with Jaccard distance matrix) was performed simultaneously on all 300 compounds (100 contributed from each target). Therefore, in some cases compound generated from different targets belong to a common cluster (cid_0.750)."}]},{"type":"paragraph"},{"type":"paragraph","content":[{"type":"text","marks":[{"type":"strong"},{"type":"author_id","attrs":{"authorId":"222648505"}}],"text":"Upon inspection of the compound structures, I am surprised by the structural diversity in the generated ensembles. I assumed the generated compounds would closely resemble the reference since the scores are favorably shifted from the mean in the ‘low’ and ‘medium’ targets. 
It is exciting that the model can produce "},{"type":"text","marks":[{"type":"em"},{"type":"strong"},{"type":"author_id","attrs":{"authorId":"222648505"}}],"text":"favorable"},{"type":"text","marks":[{"type":"strong"},{"type":"author_id","attrs":{"authorId":"222648505"}}],"text":" variation in the molecules it generates--at least based on docking scores."}]},{"type":"paragraph"}]},"savepoint_metadata":{"savepointFileId":null,"savepointsInfo":{"1039":{"timestamp":1719416855158,"type":"session","authorsSinceLastSavepoint":["222648505"]},"1200":{"timestamp":1719417541355,"type":"periodic","authorsSinceLastSavepoint":["222648505"]}},"shouldCreateSavepointBeforeApplyingNextVersion":false,"authorsSinceLastSavepoint":{"222648505":true},"allAuthorNames":{"222648505":"SPENCER ERICKSEN"}},"last_edit_timestamp":1719417839344}
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit 7ff2c65

Please sign in to comment.