From 3a23ad68929c87017d59c3e2361583ef66473d42 Mon Sep 17 00:00:00 2001
From: xiangyan93
Date: Thu, 7 Dec 2023 20:09:36 -0500
Subject: [PATCH] Update to 0.2.0: add circular fingerprints.

---
Review notes (this area is ignored by `git am`):
* circular_features_generator now reads size/radius from the instance
  (self.num_bits / self.radius) instead of the fixed 2048 / 8, so the
  assertions added to __init__ for 'circular' guard values that are
  actually used. It is therefore an instance method, not a staticmethod.
* Line structure was restored from a whitespace-collapsed copy; the
  indentation of context lines is reconstructed -- TODO confirm with
  `git apply --check` against the target tree.
* The post-image blob id on the features_generators.py `index` line
  predates the change above and is informational only.

 mgktools/__init__.py                         |  2 +-
 mgktools/features_mol/features_generators.py | 12 +++++++++++-
 setup.py                                     |  1 +
 test/cross_validation/test_cv_pure.py        | 13 +++++++------
 test/data/test_data_pure.py                  |  2 ++
 5 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/mgktools/__init__.py b/mgktools/__init__.py
index 6b4fc35..27d06a8 100644
--- a/mgktools/__init__.py
+++ b/mgktools/__init__.py
@@ -2,4 +2,4 @@
 # -*- coding: utf-8 -*-
 
 
-__version__ = '0.1.1'
+__version__ = '0.2.0'
diff --git a/mgktools/features_mol/features_generators.py b/mgktools/features_mol/features_generators.py
index 4a91eff..a4594f2 100644
--- a/mgktools/features_mol/features_generators.py
+++ b/mgktools/features_mol/features_generators.py
@@ -5,6 +5,7 @@
 import numpy as np
 from rdkit import Chem, DataStructs
 from rdkit.Chem import AllChem, Descriptors
+import deepchem
 from descriptastorus.descriptors import rdDescriptors, rdNormalizedDescriptors
 
 
@@ -15,7 +16,7 @@ def __init__(self, features_generator_name: Union[str, Callable],
         self.features_generator_name = features_generator_name
         self.radius = radius
         self.num_bits = num_bits
-        if features_generator_name in ['morgan', 'morgan_count']:
+        if features_generator_name in ['morgan', 'morgan_count', 'circular']:
             assert self.radius is not None
             assert self.num_bits is not None
 
@@ -26,6 +27,8 @@ def __call__(self, mol: Union[str, Chem.Mol]) -> np.ndarray:
             return self.morgan_binary_features_generator(mol)
         elif self.features_generator_name == 'morgan_count':
             return self.morgan_counts_features_generator(mol)
+        elif self.features_generator_name == 'circular':
+            return self.circular_features_generator(mol)
         elif self.features_generator_name == 'rdkit_208':
             return self.rdkit_208_features_generator(mol)
         elif self.features_generator_name == 'rdkit_2d':
@@ -67,6 +70,13 @@ def morgan_counts_features_generator(self, mol: Union[str, Chem.Mol]) -> np.ndar
 
         return features
 
+    def circular_features_generator(self, mol: Union[str, Chem.Mol]) -> np.ndarray:
+        # Use the radius/num_bits that __init__ requires for 'circular', not fixed values.
+        circular_fp_featurizer = deepchem.feat.CircularFingerprint(
+            size=self.num_bits, radius=self.radius, sparse=False, smiles=True)
+        features = circular_fp_featurizer.featurize([mol]).ravel()
+        return features
+
     @staticmethod
     def rdkit_2d_features_generator(mol: Union[str, Chem.Mol]) -> np.ndarray:
         """
diff --git a/setup.py b/setup.py
index 2bbd2c9..3b6002a 100644
--- a/setup.py
+++ b/setup.py
@@ -32,6 +32,7 @@ def read(*filenames, **kwargs):
         'rxntools>=0.0.2',
         'pycuda>=2022.1',
         'rdkit>=2022.9.2',
+        'deepchem==2.7.2.dev20231207083329'
     ],
     author='Yan Xiang',
     author_email='1993.xiangyan@gmail.com',
diff --git a/test/cross_validation/test_cv_pure.py b/test/cross_validation/test_cv_pure.py
index 85d496b..ffa1015 100644
--- a/test/cross_validation/test_cv_pure.py
+++ b/test/cross_validation/test_cv_pure.py
@@ -11,10 +11,11 @@
 from mgktools.evaluators.cross_validation import Evaluator
 
 
-pure = ['CCCC', 'CCCCCO', 'c1ccccc1', 'CCNCCO', 'CCCCN', 'NCCCCCO', 'c1ccccc1N', 'NCCNCCO']
-targets_regression = [3.1, 14.5, 25.6, 56.7, 9.1, 17.5, 22.6, 36.7]
+pure = ['CCCC', 'CCCCCO', 'c1ccccc1', 'CCNCCO', 'CCCCN', 'NCCCCCO', 'c1ccccc1N', 'NCCNCCO',
+        'CNC(CC)CC', 'c1ccccc1', 'c1ccccc1CCCCc1ccccc1', 'CC(=O)OCCO']
+targets_regression = [3.1, 14.5, 25.6, 56.7, 9.1, 17.5, 22.6, 36.7, 23.1, 32.1, 1.4, 7.6]
 df_regression = pd.DataFrame({'pure': pure, 'targets': targets_regression})
-targets_classification = [1, 1, 0, 1, 1, 0, 0, 1]
+targets_classification = [1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1]
 df_classification = pd.DataFrame({'pure': pure, 'targets': targets_classification})
 
 
@@ -22,7 +23,7 @@
                                       product_norm, product_pnorm, product_msnorm])
 @pytest.mark.parametrize('model', ['gpc', 'svc'])
 @pytest.mark.parametrize('split_type', ['random', 'scaffold_order', 'scaffold_random'])
-def test_only_graph(mgk_file, model, split_type):
+def test_only_graph_classification(mgk_file, model, split_type):
     dataset = Dataset.from_df(df=df_classification,
                               pure_columns=['pure'],
                               target_columns=['targets'])
@@ -49,12 +50,12 @@ def test_only_graph(mgk_file, model, split_type):
                                       product_norm, product_pnorm, product_msnorm])
 @pytest.mark.parametrize('modelsets', [('gpr', None, None, None),
                                        ('gpr-sod', 2, 3, 'smallest_uncertainty'),
-                                       ('gpr-sod', 2, 3, 'weight_uncertainty'),
+                                       # ('gpr-sod', 2, 3, 'weight_uncertainty'),
                                        ('gpr-sod', 2, 3, 'mean'),
                                        ('gpr-nystrom', None, 3, None),
                                        ('gpr-nle', None, 3, None)])
 @pytest.mark.parametrize('split_type', ['random', 'scaffold_order', 'scaffold_random'])
-def test_only_graph(mgk_file, modelsets, split_type):
+def test_only_graph_scalable_gps(mgk_file, modelsets, split_type):
     model_type, n_estimators, n_samples, consensus_rule = modelsets
     dataset = Dataset.from_df(df=df_regression,
                               pure_columns=['pure'],
diff --git a/test/data/test_data_pure.py b/test/data/test_data_pure.py
index de8161b..d4dca85 100644
--- a/test/data/test_data_pure.py
+++ b/test/data/test_data_pure.py
@@ -27,6 +27,7 @@ def test_only_graph(testset):
 
 @pytest.mark.parametrize('testset', [
     ('morgan', 2048),
+    ('circular', 2048),
     ('rdkit_2d', 200),
     ('rdkit_2d_normalized', 200),
 ])
@@ -42,6 +43,7 @@ def test_only_fingerprints(testset):
 
 @pytest.mark.parametrize('testset', [
     ('morgan', 2048),
+    ('circular', 2048),
     ('rdkit_2d', 200),
     ('rdkit_2d_normalized', 200),
 ])