Skip to content

Commit

Permalink
add GauL-HDAD and fingerprints
Browse files Browse the repository at this point in the history
  • Loading branch information
mrodobbe committed Oct 8, 2024
1 parent 4b25023 commit 0be0128
Show file tree
Hide file tree
Showing 20 changed files with 2,707 additions and 332 deletions.
2 changes: 1 addition & 1 deletion src/chemperium/__about__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""Package version."""

__version__ = "0.0.1"
__version__ = "1.0.0"
64 changes: 61 additions & 3 deletions src/chemperium/data/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
from chemperium.inp import InputArguments
from chemperium.molecule.graph import Mol3DGraph
from chemperium.molecule.batch import featurize_graphs
from chemperium.gaussian.feature_vector import Featurizer, MolFeatureVector
from chemperium.gaussian.histogram import Histograms, Gaussian
from chemperium.features.fingerprint import RDF, Morgan, MACCS
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.rdchem import Mol
Expand Down Expand Up @@ -46,9 +49,15 @@ def __init__(self, input_pars: Union[InputArguments, TestInputArguments],
self.df = df
self.rdmol_list = np.array(self.get_rdmol())
self.smiles = self.get_smiles()
self.graphs = self.get_graphs()
self.scaler = self.get_scaler()
self.x = self.get_xs()

if self.inp.fingerprint is None:
self.graphs = self.get_graphs()
self.x = self.get_xs()
else:
self.graphs = []
self.x = self.get_fingerprints()

self.y = self.get_outputs(inputs=self.inp)

def load_data(self) -> pd.DataFrame:
Expand Down Expand Up @@ -246,18 +255,67 @@ def get_outputs(self,

return outputs

def get_fingerprints(self):
    """
    Compute one fingerprint vector per molecule, as selected by ``self.inp.fingerprint``.

    Supported values are ``"hdad"``, ``"rdf"``, ``"maccs"`` and ``"morgan"``.
    For ``"hdad"`` the Gaussian-mixture dictionary is loaded from
    ``self.inp.gmm_file``; if that fails, it is re-fitted from histograms of
    the features of the current molecules.

    :return: NumPy array of dtype float32 holding the stacked fingerprints.
    :raises KeyError: If ``self.inp.fingerprint`` is not one of the supported names.
    """
    kind = self.inp.fingerprint

    # Validate up front so that a misspelled fingerprint name is reported even
    # when the molecule list is empty (the per-molecule loop would never run).
    if kind not in ("hdad", "rdf", "maccs", "morgan"):
        raise KeyError(f"Invalid fingerprint: {self.inp.fingerprint}! Choose from rdf, maccs, morgan, hdad")

    fingerprints = []
    if kind == "hdad":
        import_type = "smiles" if self.inp.ff_3d else "precalculated"
        features = Featurizer(self.rdmol_list, import_type, self.inp)
        try:
            # NOTE(security): unpickling executes arbitrary code, so gmm_file
            # must come from a trusted source.
            with open(str(self.inp.gmm_file), "rb") as f:
                gmm_dict = pickle.load(f)
        except Exception:
            # Best-effort fallback: a missing or unreadable GMM file (unpickling
            # can raise many exception types) triggers a fresh fit. Catching
            # Exception instead of a bare except lets KeyboardInterrupt and
            # SystemExit propagate.
            hist = Histograms(features.all_features, features.name_all_features, self.inp)
            geometry_dict = hist.histogram_dict
            gauss = Gaussian(self.inp)
            gauss.cluster(geometry_dict)
            gmm_dict = gauss.gmm_dict
        for molecule in features.molecules:
            fingerprints.append(MolFeatureVector(molecule, gmm_dict, self.inp).vector)
    else:
        for molecule in self.rdmol_list:
            if kind == "rdf":
                fp = RDF(molecule).make_fingerprint(add_mfd=self.inp.mfd)
            elif kind == "maccs":
                fp = MACCS(molecule).make_fingerprint()
            else:  # kind == "morgan", guaranteed by the validation above
                fp = Morgan(molecule).make_fingerprint()
            fingerprints.append(fp)

    return np.array(fingerprints).astype(np.float32)


def input_checker(save_dir: str) -> None:
def input_checker(save_dir: str, gaul: bool = False) -> None:
    """
    Make sure the output folder (and, for GauL-HDAD, its sub-folders) exists.

    :param save_dir: Folder to store all data of the training.
    :param gaul: Whether to use GauL-HDAD so that gmm and hist folders are created.
    :return: Function does not return anything
    """
    try:
        os.mkdir(save_dir)
    except FileExistsError:
        print("Folder already exists. Data in this folder will be overwritten.")

    if not gaul:
        return

    # Same create-or-report handling for both GauL-HDAD sub-folders.
    for subfolder, label in (("gmm", "GMM"), ("hist", "Hist")):
        try:
            os.mkdir(str(save_dir + "/" + subfolder))
        except FileExistsError:
            print(f"{label} folder already exists.")
        else:
            print(f"{label} folder created")


def split_dataset(num_data: int,
seed: int = 120897,
Expand Down
20 changes: 19 additions & 1 deletion src/chemperium/data/load_test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,11 @@ def __init__(self, dimension: Union[None, str] = None):
self.simple_features = False
self.num_layers = 5
self.hidden_size = 512
self.activation = "LeakyReLU"
self.hidden_activation = "LeakyReLU"
self.activation = "linear"
self.dropout = 0.0
self.batch_normalization = False
self.l2 = 0.0
self.bias = True
self.max_epochs = 700
self.patience = 50
Expand All @@ -58,11 +61,15 @@ def __init__(self, dimension: Union[None, str] = None):
self.init_lr = 1e-3
self.clipvalue = 0.1
self.decay_rate = 0.95
self.decay_steps = 10000
self.masked = False
self.cutoff = 2.1
self.hidden_message = 512
self.depth = 6
self.representation_size = 256
self.outer_folds = 1
self.gmm_file = self.dir + "/src/chemperium/pickle/gmm_dictionary.pickle"
self.fingerprint = None

# Plotting

Expand All @@ -72,6 +79,17 @@ def __init__(self, dimension: Union[None, str] = None):
self.font = "Arial"
self.font_size = 24

# GauL-HDAD
self.distances = True
self.angles = True
self.dihedrals = True
self.tol = 1e-4
self.max_iter = 100
self.plot_gmm = False
self.plot_hist = True
self.radicals = True
self.carbenium = False

if dimension is not None:

if dimension == "2d":
Expand Down
27 changes: 19 additions & 8 deletions src/chemperium/data/parse_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,17 +54,28 @@ def df_from_csv(fname: str,
raise KeyError("No column with SMILES detected in the DataFrame. Please add a column named smiles.")

if include_3d and not ff_3d:
if "xyz" not in list(df.keys()) and not ff_3d:
if "xyz" not in list(df.keys()) and not ff_3d and "molblock" not in list(df.keys()):
raise KeyError("XYZ coordinates not provided!")
if "RDMol" not in df.keys():
df["RDMol"] = ""
for i in df.index:
mol = make_3d_mol(df["xyz"][i])
if mol is None:
print(f"WARNING! Could not parse {df[smiles_key][i]}!")
df = df.drop(i)
else:
df.loc[i, "RDMol"] = mol
if "molblock" in list(df.keys()):
for i in df.index:
mol = Chem.MolFromMolBlock(df["molblock"][i], removeHs=False)
if mol is None:
print(f"WARNING! Could not parse {df[smiles_key][i]}!")
df = df.drop(i)
else:
df.loc[i, "RDMol"] = mol
elif "xyz" in list(df.keys()):
for i in df.index:
mol = make_3d_mol(df["xyz"][i])
if mol is None:
print(f"WARNING! Could not parse {df[smiles_key][i]}!")
df = df.drop(i)
else:
df.loc[i, "RDMol"] = mol
else:
raise KeyError("XYZ coordinates not provided!")
elif "xyz" not in list(df.keys()):
df["xyz"] = ""

Expand Down
33 changes: 33 additions & 0 deletions src/chemperium/features/calc_features.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.rdchem import Atom, Mol, Bond
from typing import Dict, Tuple
import numpy.typing as npt
Expand Down Expand Up @@ -375,3 +376,35 @@ def get_atomic_rdf(atom: Atom,
r = np.array(r).astype(np.float64)

return r, g_array


def dict_to_vector(representation_dict):
    """
    Flatten all values of a dictionary into one 1-D float32 vector.

    Values are taken in the dictionary's iteration order, each one is flattened,
    and the pieces are joined end to end.

    :param representation_dict: Mapping whose values are numbers or array-likes.
    :return: 1-D NumPy array of dtype float32 (empty if the dictionary is empty).
    """
    # The leading empty float64 array keeps the result float64 before the final
    # cast and makes the empty-dictionary case well defined. A single
    # concatenate avoids re-copying the growing vector on every iteration.
    parts = [np.empty(0)]
    parts.extend(np.ravel(representation_dict[part]) for part in representation_dict)
    return np.concatenate(parts).astype(np.float32)


def num_radicals(mol: Chem.rdchem.Mol):
    """
    Count the radical electrons of a molecule.

    :param mol: RDKit molecule.
    :return: Value reported by RDKit's ``Descriptors.NumRadicalElectrons``.
    """
    radical_electrons = Descriptors.NumRadicalElectrons(mol)
    return radical_electrons


def carbenium_degree(mol: Chem.rdchem.Mol):
    """
    Return the degree of the carbenium centre of a molecule.

    The degree is the number of carbon neighbours of the first carbon atom
    that carries a formal charge of +1.

    :param mol: RDKit molecule.
    :return: Number of carbon neighbours of the carbenium centre, or None when
        the molecule has no carbon atom with formal charge +1.
    """

    # TODO: Add aromatic carbocations

    for atom in mol.GetAtoms():
        if atom.GetFormalCharge() == 1 and atom.GetSymbol() == 'C':
            return sum(1 for neighbor in atom.GetNeighbors() if neighbor.GetSymbol() == 'C')

    # No carbenium centre found: give an explicit, well-defined result instead
    # of relying on a variable that was never assigned.
    return None


def remove_atom_mapping(mol: Chem.rdchem.Mol):
    """
    Clear every property stored on the atoms of a molecule, in place.

    All keys reported by ``GetPropsAsDict`` are cleared on each atom, not only
    an atom-map number.

    :param mol: RDKit molecule; it is modified in place.
    :return: The same molecule object that was passed in.
    """
    for atom in mol.GetAtoms():
        property_names = atom.GetPropsAsDict()
        for name in property_names:
            atom.ClearProp(name)

    return mol
34 changes: 21 additions & 13 deletions src/chemperium/features/featurize.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,26 @@
from typing import Union


def get_simple_atomic_features(atom: Atom) -> npt.NDArray[np.float64]:
    """
    Assemble the reduced ("simple") feature vector of a single atom.

    The vector is built, in this order, from the atomic-number features, the
    one-hot encoded degree, the hybridization vector, the aromaticity flag
    and the one-hot encoded chiral tag.

    :param atom: RDKit atom to featurize.
    :return: 1-D NumPy array with all feature pieces appended end to end.
    """
    pieces = (
        # atomic number
        atomic_feature_vector(atom.GetAtomicNum()),
        # degree of atom
        one_hot_vector(atom.GetDegree(), 7),
        # hybridization: S, SP, SP2, SP3, SP3D, SP3D2
        hybridization_vector(atom.GetHybridization()),
        # aromaticity: 0 or 1
        np.array([int(atom.GetIsAromatic())]),
        # chiral tag
        one_hot_vector(atom.GetChiralTag(), 9),
    )

    features = np.array([])
    for piece in pieces:
        features = np.append(features, piece)

    return features


def get_atomic_features(atom: Atom,
mol: Mol,
xyz: npt.NDArray[np.float64],
Expand All @@ -31,19 +51,7 @@ def get_atomic_features(atom: Atom,

if input_pars.simple_features:
# atomic number
feat_list = np.append(feat_list, atomic_feature_vector(atom.GetAtomicNum()))

# degree of atom
feat_list = np.append(feat_list, one_hot_vector(atom.GetDegree(), 7))

# hybridization: S, SP, SP2, SP3, SP3D, SP3D2
feat_list = np.append(feat_list, hybridization_vector(atom.GetHybridization()))

# aromaticity: 0 or 1
feat_list = np.append(feat_list, np.array([int(atom.GetIsAromatic())]))

# chiral tag
feat_list = np.append(feat_list, one_hot_vector(atom.GetChiralTag(), 9))
feat_list = get_simple_atomic_features(atom)

return feat_list

Expand Down
Loading

0 comments on commit 0be0128

Please sign in to comment.