#!/usr/bin/env python
"""
Usage:
    make_dataset.py [options]

Options:
    -h --help        Show this screen.
    --dataset NAME   qm9 or zinc
"""
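# Example invocation (qm9.smi / zinc.smi are expected to sit next to this script):
#   python make_dataset.py --dataset qm9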

import json
import os
import sys

import numpy as np
from docopt import docopt
from rdkit import Chem
from rdkit.Chem import QED

# make the project root importable before pulling in the shared utils module
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))

import utils

# resolve this script's directory so data files are read via absolute paths
current_dir = os.path.dirname(os.path.realpath(__file__))


def readStr_qm9():
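    """Read qm9.smi (one SMILES string per line) and return the list shuffled with a fixed seed."""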
    with open(os.path.join(current_dir, 'qm9.smi'), 'r') as f:
        L = [line.strip() for line in f]
    np.random.seed(1)
    np.random.shuffle(L)
    return L


def read_zinc():
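    """Read zinc.smi (one SMILES string per line) and return the list in file order."""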
    with open(os.path.join(current_dir, 'zinc.smi'), 'r') as f:
        L = [line.strip() for line in f]
    return L


def train_valid_split(dataset):
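    """Split the SMILES list into test (first 5000), valid (~10%, drawn at random) and train sets.

    Each kept molecule is stored with its QED score and valence histogram; molecules whose
    atom types are not recognized by make_hist() are left out. Note that make_hist() reads
    the module-level `dataset` name set in the __main__ block.
    """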
    n_mol_out = 0
    n_test = 5000
    test_idx = np.arange(0, n_test)
    valid_idx = np.random.randint(n_test, high=len(dataset), size=round(len(dataset) * 0.1))

    # sort every molecule into the train / valid / test bucket
    raw_data = {'train': [], 'valid': [], 'test': []}
    file_count = 0
    for i, smiles in enumerate(dataset):
        val = QED.qed(Chem.MolFromSmiles(smiles))
        hist = make_hist(smiles)
        if hist is not None:
            if i in valid_idx:
                raw_data['valid'].append({'smiles': smiles, 'QED': val, 'hist': hist.tolist()})
            elif i in test_idx:
                raw_data['test'].append({'smiles': smiles, 'QED': val, 'hist': hist.tolist()})
            else:
                raw_data['train'].append({'smiles': smiles, 'QED': val, 'hist': hist.tolist()})
            file_count += 1
            if file_count % 1000 == 0:
                print('Finished reading: %d' % file_count, end='\r')
        else:
            n_mol_out += 1

    print("Number of molecules left out: ", n_mol_out)
    return raw_data


def make_hist(smiles):
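    """Count, for one molecule, how many atoms have each maximum valence.

    The maximum valence of every atom type comes from utils.dataset_info(dataset);
    returns None if any atom type is not listed there. Relies on the module-level
    `dataset` name.
    """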
    mol = Chem.MolFromSmiles(smiles)
    atoms = mol.GetAtoms()
    info = utils.dataset_info(dataset)
    hist = np.zeros(info['hist_dim'])
    for atom in atoms:
        if dataset == 'qm9':
            atom_str = atom.GetSymbol()
        else:
            # zinc: encode each atom as "<atom_symbol><valence>(<charge>)"
            symbol = atom.GetSymbol()
            valence = atom.GetTotalValence()
            charge = atom.GetFormalCharge()
            atom_str = "%s%i(%i)" % (symbol, valence, charge)

        if atom_str not in info['atom_types']:
            print('Unrecognized atom type %s' % atom_str)
            return None

        ind = info['atom_types'].index(atom_str)
        val = info['maximum_valence'][ind]
        hist[val - 1] += 1  # valences start at 1, array indices at 0
    return hist


def preprocess(raw_data, dataset):
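    """Convert every SMILES string into a graph and write each split to
    molecules_<section>_<dataset>.json in the current working directory."""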
    print('Parsing smiles as graphs...')
    processed_data = {'train': [], 'valid': [], 'test': []}

    file_count = 0
    for section in ['train', 'valid', 'test']:
        all_smiles = []  # record all smiles of this section (currently not written out)
        for mol in raw_data[section]:
            smiles, qed, hist = mol['smiles'], mol['QED'], mol['hist']
            nodes, edges = utils.to_graph(smiles, dataset)
            if len(edges) <= 0:
                print('Error. Molecule with len(edges) <= 0')
                continue
            processed_data[section].append({
                'targets': [[qed]],
                'graph': edges,
                'node_features': nodes,
                'smiles': smiles,
                'hist': hist
            })
            all_smiles.append(smiles)
            if file_count % 1000 == 0:
                print('Finished processing: %d' % file_count, end='\r')
            file_count += 1
        print('%s: 100 %%' % section)
        with open('molecules_%s_%s.json' % (section, dataset), 'w') as f:
            json.dump(processed_data[section], f)

    print("Train molecules = " + str(len(processed_data['train'])))
    print("Valid molecules = " + str(len(processed_data['valid'])))
    print("Test molecules = " + str(len(processed_data['test'])))


if __name__ == "__main__":
    args = docopt(__doc__)
    dataset = args.get('--dataset')

    print('Reading dataset: ' + str(dataset))
    data = []
    if dataset == 'qm9':
        data = readStr_qm9()
    elif dataset == 'zinc':
        data = read_zinc()
    else:
        print('Error. Unknown dataset "%s"; expected qm9 or zinc' % dataset)
        sys.exit(1)

    raw_data = train_valid_split(data)
    preprocess(raw_data, dataset)