diff options
author | David Luevano Alvarado <55825613+luevano@users.noreply.github.com> | 2020-02-28 06:48:42 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-02-28 06:48:42 -0700 |
commit | 86faabeac5aed75c1dfc883fe757c21eba94b645 (patch) | |
tree | 463515502ff326afa8b0dfeeaa15e3ba00ee08b2 | |
parent | 7920c4b8471741cb9b5a9aa06da436c1b678f62a (diff) | |
parent | 6ad25bb4cd5822fa84d1c4cec54952c6fd8df2f6 (diff) |
Merge pull request #2 from luevano/rewrite
Rewrite
-rw-r--r-- | ml_exp/__init__.py | 40 | ||||
-rw-r--r-- | ml_exp/__main__.py | 49 | ||||
-rw-r--r-- | ml_exp/adj_matrix.py | 131 | ||||
-rw-r--r-- | ml_exp/bob.py | 117 | ||||
-rw-r--r-- | ml_exp/bostuff.py | 99 | ||||
-rw-r--r-- | ml_exp/c_matrix.py | 179 | ||||
-rw-r--r-- | ml_exp/compound.py | 153 | ||||
-rw-r--r-- | ml_exp/data.py | 141 | ||||
-rw-r--r-- | ml_exp/do_ml.py | 302 | ||||
-rw-r--r-- | ml_exp/kernels.py (renamed from ml_exp/gauss_kernel.py) | 22 | ||||
-rw-r--r-- | ml_exp/lj_matrix.py | 222 | ||||
-rw-r--r-- | ml_exp/math.py (renamed from ml_exp/cholesky_solve.py) | 26 | ||||
-rw-r--r-- | ml_exp/misc.py | 38 | ||||
-rw-r--r-- | ml_exp/parallel_create_matrices.py | 85 | ||||
-rw-r--r-- | ml_exp/qm7db.py (renamed from ml_exp/frob_norm.py) | 46 | ||||
-rw-r--r-- | ml_exp/read_qm7_data.py | 162 | ||||
-rw-r--r-- | ml_exp/representations.py | 384 | ||||
-rw-r--r-- | requirements.txt | 2 |
18 files changed, 909 insertions, 1289 deletions
diff --git a/ml_exp/__init__.py b/ml_exp/__init__.py index 06425f6a2..48ece56c8 100644 --- a/ml_exp/__init__.py +++ b/ml_exp/__init__.py @@ -20,29 +20,21 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -from ml_exp.read_qm7_data import read_nc_data, read_db_data, read_qm7_data -from ml_exp.c_matrix import c_matrix, c_matrix_multiple -from ml_exp.lj_matrix import lj_matrix, lj_matrix_multiple -from ml_exp.frob_norm import frob_norm -from ml_exp.gauss_kernel import gauss_kernel -from ml_exp.cholesky_solve import cholesky_solve -from ml_exp.do_ml import do_ml -from ml_exp.parallel_create_matrices import parallel_create_matrices -from ml_exp.misc import plot_benchmarks +from ml_exp.compound import Compound +from ml_exp.representations import coulomb_matrix, lennard_jones_matrix,\ + first_neighbor_matrix, adjacency_matrix, check_bond, bag_of_stuff +from ml_exp.math import cholesky_solve +from ml_exp.qm7db import qm7db +from ml_exp.do_ml import simple_ml, do_ml - -# If somebody does "from package import *", this is what they will -# be able to access: -__all__ = ['read_nc_data', - 'read_db_data', - 'read_qm7_data', - 'c_matrix', - 'c_matrix_multiple', - 'lj_matrix', - 'lj_matrix_multiple', - 'frob_norm', - 'gauss_kernel', +__all__ = ['Compound', + 'coulomb_matrix', + 'lennard_jones_matrix', + 'first_neighbor_matrix', + 'adjacency_matrix', + 'check_bond', + 'bag_of_stuff', 'cholesky_solve', - 'do_ml', - 'parallel_create_matrices', - 'plot_benchmarks'] + 'qm7db', + 'simple_ml', + 'do_ml'] diff --git a/ml_exp/__main__.py b/ml_exp/__main__.py index aaeb18714..0d9b87bad 100644 --- a/ml_exp/__main__.py +++ b/ml_exp/__main__.py @@ -20,52 +20,5 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -# from ml_exp.do_ml import do_ml -# from ml_exp.misc import plot_benchmarks -from ml_exp.read_qm7_data import read_qm7_data -from ml_exp.adj_matrix import fneig_matrix, adj_matrix -from ml_exp.c_matrix import c_matrix -from ml_exp.bob import bob -from ml_exp.bostuff import bok_cx - if __name__ == '__main__': - """ - do_ml(min_training_size=1500, - max_training_size=2000, - training_increment_size=500, - test_size=None, - ljm_diag_value=None, - ljm_sigma=1.0, - ljm_epsilon=1.0, - r_seed=111, - save_benchmarks=False, - show_msgs=True) - """ - # plot_benchmarks() - """ - xyz, nc, pbe0, delta, atoms = read_qm7_data(return_atoms=True) - for i in range(1): - fnm, bonds, forces = fneig_matrix(atoms[i], xyz[i], nc[i], True) - am = adj_matrix(bonds, forces) - - print(f'{i} first neighbor matrix\n{fnm}') - print(f'{i} bond list\n{bonds}') - print(f'{i} force list\n{forces}') - print(f'{i} adjacency matrix\n{am}') - print('-'*30) - print('OK!') - """ - xyz, nc, pbe0, delta, atoms = read_qm7_data(return_atoms=True) - for i in range(1): - cm = c_matrix(xyz[i], nc[i], as_eig=False) - bob_i = bob(cm, atoms[i]) - - print(f'{i} coulomb matrix\n{cm}') - print(f'{i} bag of bonds\n{bob}') - - fnm, bonds = fneig_matrix(atoms[i], xyz[i], nc[i], False) - am = adj_matrix(fnm, bonds, None) - bok_cx_i = bok_cx(am, atoms[i]) - - print(f'{i} adjacency matrix\n{am}') - print(f'{i} bag of k_cx\n{bok_cx_i}') + print('Everything OK!\n\tThis is a placeholder.') diff --git a/ml_exp/adj_matrix.py b/ml_exp/adj_matrix.py deleted file mode 100644 index de484b5be..000000000 --- a/ml_exp/adj_matrix.py +++ /dev/null @@ -1,131 +0,0 @@ -"""MIT License - -Copyright (c) 2019 David Luevano Alvarado - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -from numpy import array, zeros, dot -from numpy.linalg import norm -from ml_exp.misc import printc - - -def fneig_matrix(atoms, - xyz, - nc=None, - use_forces=False): - """ - Creates the first neighbor matrix of the given molecule data. - atoms: list of atoms. - xyz: matrix of atomic coords. - nc: nuclear charge info. - use_forces: if the use of forces instead of k_cx should be used. - NOTE: Bond distance of carbon to other elements - are (for atoms present in the qm7 dataset): - C: 1.20 - 1.53 A - H: 1.06 - 1.12 A - O: 1.43 - 2.15 A - N: 1.47 - 2.10 A - S: 1.81 - 2.55 A - """ - if use_forces and nc is None: - printc(f'Error. use_forces={use_forces} but nc={nc}', 'RED') - return None - # Possible bonds. - cc_bond = sorted(['C', 'C']) - ch_bond = sorted(['C', 'H']) - co_bond = sorted(['C', 'O']) - cn_bond = sorted(['C', 'N']) - cs_bond = sorted(['C', 'S']) - - # Number of atoms, empty matrix and bond list. - n = len(atoms) - fnm = array(zeros((n, n)), dtype=float) - bonds = [] - forces = [] - for i, xyz_i in enumerate(xyz): - for j, xyz_j in enumerate(xyz): - # Ignore the diagonal. - if i != j: - bond = sorted([atoms[i], atoms[j]]) - rv = xyz_i - xyz_j - r = norm(rv) - # Check for each type of bond. - if (cc_bond == bond) and (r >= 1.20 and r <= 1.53): - fnm[i, j] = 1.0 - if j > i: - bonds.append((i, j)) - if use_forces: - forces.append(rv*nc[i]*nc[j]/r**3) - elif (ch_bond == bond) and (r >= 1.06 and r <= 1.12): - fnm[i, j] = 1.0 - if j > i: - bonds.append((i, j)) - if use_forces: - forces.append(rv*nc[i]*nc[j]/r**3) - elif (co_bond == bond) and (r >= 1.43 and r <= 2.15): - fnm[i, j] = 0.8 - if j > i: - bonds.append((i, j)) - if use_forces: - forces.append(rv*nc[i]*nc[j]/r**3) - elif (cn_bond == bond) and (r >= 1.47 and r <= 2.10): - fnm[i, j] = 1.0 - if j > i: - bonds.append((i, j)) - if use_forces: - forces.append(rv*nc[i]*nc[j]/r**3) - elif (cs_bond == bond) and (r >= 1.81 and r <= 2.55): - fnm[i, j] = 0.7 - if j > i: - bonds.append((i, j)) - if use_forces: - forces.append(rv*nc[i]*nc[j]/r**3) - if use_forces: - return fnm, bonds, forces - return fnm, bonds - - -def adj_matrix(fneig_matrix, - bonds, - forces=None, - max_len=25): - """ - Calculates the adjacency matrix given the bond list. - bonds: list of bonds (tuple of indexes). - max_len: maximum amount of atoms in molecule. - forces: list of forces. - """ - n = len(bonds) - - if max_len < n: - print(''.join(['Error. Molecule matrix dimension (mol_n) is ', - 'greater than max_len. Using mol_n.'])) - max_len = n - - am = array(zeros((max_len, max_len))) - for i, bond_i in enumerate(bonds): - for j, bond_j in enumerate(bonds): - # Ignore the diagonal. - if i != j: - if (bond_i[0] in bond_j) or (bond_i[1] in bond_j): - if forces: - am[i, j] = dot(forces[i], forces[j]) - else: - am[i, j] = fneig_matrix[bond_i[0], bond_i[1]] - return am diff --git a/ml_exp/bob.py b/ml_exp/bob.py deleted file mode 100644 index 86efecdb4..000000000 --- a/ml_exp/bob.py +++ /dev/null @@ -1,117 +0,0 @@ -"""MIT License - -Copyright (c) 2019 David Luevano Alvarado - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -from numpy import array, zeros -from collections import Counter - - -def check_bond(bags, - bond): - """ - Checks if a bond is in a bag. - bags: list of bags, containing a bag per entry, which in turn - contains a list of bond-values. - bond: bond to check. - """ - if bags == []: - return False, None - - for i, bag in enumerate(bags): - if bag[0] == bond: - return True, i - - return False, None - - -def bob(c_matrix, - atoms, - max_n=25, - max_bond_len=325): - """ - Creates the bag of bond using the coulomb matrix data. - c_matrix: coulomb matrix. - atoms: list of atoms. - max_n: maximum amount of atoms. - max_bond_len: maximum amount of bonds in molecule. - """ - n = len(atoms) - bond_n = (n * n - n) / 2 + n - n_r = range(n) - - if max_n < n: - print(''.join(['Error. Molecule matrix dimension (mol_n) is ', - 'greater than max_len. Using mol_n.'])) - max_n = n - - if max_bond_len < bond_n: - print(''.join(['Error. Molecule bond lenght (bond_n) is ', - 'greater than max_bond_len. Using bond_n.'])) - max_bond_len = bond_n - - # List where each bag data is stored. - bags = [] - for i in n_r: - for j in n_r: - # Work only in the upper triangle of the coulomb matrix. - if j >= i: - # Get the string of the current bond. - if i == j: - current_bond = atoms[i] - else: - current_bond = ''.join(sorted([atoms[i], atoms[j]])) - - # Check if that bond is already in a bag. - checker = check_bond(bags, current_bond) - # Either create a new bag or add values to an existing one. - if not checker[0]: - bags.append([current_bond, c_matrix[i, j]]) - else: - bags[checker[1]].append(c_matrix[i, j]) - - # Create the actual bond list ordered. - atom_counter = Counter(atoms) - atom_list = sorted(list(set(atoms))) - bonds = [] - for i, a_i in enumerate(atom_list): - if atom_counter[a_i] > 1: - for a_j in atom_list[i:]: - bonds.append(''.join(sorted([a_i, a_j]))) - bonds = atom_list + bonds - - # Create the final vector for the bob. - bob = array(zeros(max_bond_len), dtype=float) - c_i = 0 - for i, bond in enumerate(bonds): - checker = check_bond(bags, bond) - if checker[0]: - for j, num in enumerate(sorted(bags[checker[1]][1:])[::-1]): - # Use c_i as the index for bob if the zero padding should - # be at the end of the vector instead of between each bond. - bob[i*max_n + j] = num - c_i += 1 - # This is set to false because this was a debugging measure. - else: - print(''.join([f'Error. Bond {bond} from bond list coudn\'t', - ' be found in the bags list. This could be', - ' a case where the atom is only present once', - ' in the molecule.'])) - return bob diff --git a/ml_exp/bostuff.py b/ml_exp/bostuff.py deleted file mode 100644 index 1741e26cd..000000000 --- a/ml_exp/bostuff.py +++ /dev/null @@ -1,99 +0,0 @@ -"""MIT License - -Copyright (c) 2019 David Luevano Alvarado - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -from numpy import array, zeros -from collections import Counter -from ml_exp.bob import check_bond - - -def bok_cx(adj_matrix, - atoms, - max_n=25, - max_bond_len=325): - """ - Creates the bag of k_cx using the adjacency matrix data. - adj_matrix: adjacency matrix. - max_n: maximum amount of atoms. - max_bond_len: maximum amount of bonds in molecule. - """ - n = len(atoms) - bond_n = (n * n - n) / 2 + n - n_r = range(n) - - if max_n < n: - print(''.join(['Error. Molecule matrix dimension (mol_n) is ', - 'greater than max_len. Using mol_n.'])) - max_n = n - - if max_bond_len < bond_n: - print(''.join(['Error. Molecule bond length (bond_n) is ', - 'greater than max_bond_len. Using bond_n.'])) - max_bond_len = bond_n - - # List where each bag data is stored. - bags = [] - for i in n_r: - for j in n_r: - # Work only in the upper triangle of the coulomb matrix. - if j >= i: - # Get the string of the current bond. - if i == j: - current_bond = atoms[i] - else: - current_bond = ''.join(sorted([atoms[i], atoms[j]])) - - # Check if that bond is already in a bag. - checker = check_bond(bags, current_bond) - # Either create a new bag or add values to an existing one. - if not checker[0]: - bags.append([current_bond, adj_matrix[i, j]]) - else: - bags[checker[1]].append(adj_matrix[i, j]) - - # Create the actual bond list ordered. - atom_counter = Counter(atoms) - atom_list = sorted(list(set(atoms))) - bonds = [] - for i, a_i in enumerate(atom_list): - if atom_counter[a_i] > 1: - for a_j in atom_list[i:]: - bonds.append(''.join(sorted([a_i, a_j]))) - bonds = atom_list + bonds - - # Create the final vector for the bok_cx. - bok_cx = array(zeros(max_bond_len), dtype=float) - c_i = 0 - for i, bond in enumerate(bonds): - checker = check_bond(bags, bond) - if checker[0]: - for j, num in enumerate(sorted(bags[checker[1]][1:])[::-1]): - # Use c_i as the index for bok_cx if the zero padding should - # be at the end of the vector instead of between each bond. - bok_cx[i*max_n + j] = num - c_i += 1 - # This is set to false because this was a debugging measure. - else: - print(''.join([f'Error. Bond {bond} from bond list coudn\'t', - ' be found in the bags list. This could be', - ' a case where the atom is only present once', - ' in the molecule.'])) - return bok_cx diff --git a/ml_exp/c_matrix.py b/ml_exp/c_matrix.py deleted file mode 100644 index 00034a660..000000000 --- a/ml_exp/c_matrix.py +++ /dev/null @@ -1,179 +0,0 @@ -"""MIT License - -Copyright (c) 2019 David Luevano Alvarado - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -import time -import math -import numpy as np -from numpy.linalg import eig -from ml_exp.misc import printc - - -def c_matrix(mol_data, - nc_data, - max_len=25, - as_eig=True, - bohr_radius_units=False): - """ - Creates the Coulomb Matrix from the molecule data given. - mol_data: molecule data, matrix of atom coordinates. - nc_data: nuclear charge data, array of atom data. - max_len: maximum amount of atoms in molecule. - as_eig: if data should be returned as matrix or array of eigenvalues. - bohr_radius_units: if units should be in bohr's radius units. - """ - if bohr_radius_units: - conversion_rate = 0.52917721067 - else: - conversion_rate = 1 - - mol_n = len(mol_data) - mol_nr = range(mol_n) - - if not mol_n == len(nc_data): - print(''.join(['Error. Molecule matrix dimension is different ', - 'than the nuclear charge array dimension.'])) - else: - if max_len < mol_n: - print(''.join(['Error. Molecule matrix dimension (mol_n) is ', - 'greater than max_len. Using mol_n.'])) - max_len = None - - if max_len: - cm = np.zeros((max_len, max_len)) - ml_r = range(max_len) - - # Actual calculation of the coulomb matrix. - for i in ml_r: - if i < mol_n: - x_i = mol_data[i, 0] - y_i = mol_data[i, 1] - z_i = mol_data[i, 2] - Z_i = nc_data[i] - else: - break - - for j in ml_r: - if j < mol_n: - x_j = mol_data[j, 0] - y_j = mol_data[j, 1] - z_j = mol_data[j, 2] - Z_j = nc_data[j] - - x = (x_i-x_j)**2 - y = (y_i-y_j)**2 - z = (z_i-z_j)**2 - - if i == j: - cm[i, j] = (0.5*Z_i**2.4) - else: - cm[i, j] = (conversion_rate*Z_i*Z_j/math.sqrt(x - + y - + z)) - else: - break - - # Now the value will be returned. - if as_eig: - cm_sorted = np.sort(eig(cm)[0])[::-1] - # Thanks to SO for the following lines of code. - # https://stackoverflow.com/a/43011036 - - # Keep zeros at the end. - mask = cm_sorted != 0. - f_mask = mask.sum(0, keepdims=1) >\ - np.arange(cm_sorted.shape[0]-1, -1, -1) - - f_mask = f_mask[::-1] - cm_sorted[f_mask] = cm_sorted[mask] - cm_sorted[~f_mask] = 0. - - return cm_sorted - - else: - return cm - - else: - cm_temp = [] - # Actual calculation of the coulomb matrix. - for i in mol_nr: - x_i = mol_data[i, 0] - y_i = mol_data[i, 1] - z_i = mol_data[i, 2] - Z_i = nc_data[i] - - cm_row = [] - for j in mol_nr: - x_j = mol_data[j, 0] - y_j = mol_data[j, 1] - z_j = mol_data[j, 2] - Z_j = nc_data[j] - - x = (x_i-x_j)**2 - y = (y_i-y_j)**2 - z = (z_i-z_j)**2 - - if i == j: - cm_row.append(0.5*Z_i**2.4) - else: - cm_row.append(conversion_rate*Z_i*Z_j/math.sqrt(x - + y - + z)) - - cm_temp.append(np.array(cm_row)) - - cm = np.array(cm_temp) - # Now the value will be returned. - if as_eig: - return np.sort(eig(cm)[0])[::-1] - else: - return cm - - -def c_matrix_multiple(mol_data, - nc_data, - pipe=None, - max_len=25, - as_eig=True, - bohr_radius_units=False): - """ - Calculates the Coulomb Matrix of multiple molecules. - mol_data: molecule data, matrix of atom coordinates. - nc_data: nuclear charge data, array of atom data. - pipe: for multiprocessing purposes. Sends the data calculated - through a pipe. - max_len: maximum amount of atoms in molecule. - as_eig: if data should be returned as matrix or array of eigenvalues. - bohr_radius_units: if units should be in bohr's radius units. - """ - printc('Coulomb Matrices calculation started.', 'CYAN') - tic = time.perf_counter() - - cm_data = np.array([c_matrix(mol, nc, max_len, as_eig, bohr_radius_units) - for mol, nc in zip(mol_data, nc_data)]) - - toc = time.perf_counter() - printc('\tCM calculation took {:.4f} seconds.'.format(toc - tic), 'GREEN') - - if pipe: - pipe.send(cm_data) - - return cm_data diff --git a/ml_exp/compound.py b/ml_exp/compound.py new file mode 100644 index 000000000..1598fa482 --- /dev/null +++ b/ml_exp/compound.py @@ -0,0 +1,153 @@ +"""MIT License + +Copyright (c) 2019 David Luevano Alvarado + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +import numpy as np +from ml_exp.data import NUCLEAR_CHARGE +from ml_exp.representations import coulomb_matrix, lennard_jones_matrix,\ + first_neighbor_matrix, adjacency_matrix, bag_of_stuff + + +class Compound: + def __init__(self, + xyz=None): + """ + Initialization of the Compound. + xyz: (path to) the xyz file. + """ + self.name = None + + self.n = None + self.extra = None # In case a comment is in the compound file. + self.atoms = None + self.atoms_nc = None + self.coordinates = None + self.pbe0 = None + self.delta = None + + self.cm = None + self.ljm = None + self.am = None + self.bos = None + + if xyz is not None: + self.read_xyz(xyz) + + def gen_cm(self, + size=23, + as_eig=True, + bohr_ru=False): + """ + Generate the Coulomb Matrix for the compund. + size: compound size. + as_eig: if the representation should be as the eigenvalues. + bohr_ru: if radius units should be in bohr's radius units. + """ + self.cm = coulomb_matrix(self.coordinates, + self.atoms_nc, + size=size, + as_eig=as_eig, + bohr_ru=bohr_ru) + + def gen_ljm(self, + diag_value=None, + sigma=1.0, + epsilon=1.0, + size=23, + as_eig=True, + bohr_ru=False): + """ + Generate the Lennard-Jones Matrix for the compund. + diag_value: if special diagonal value is to be used. + sigma: sigma value. + epsilon: epsilon value. + size: compound size. + as_eig: if the representation should be as the eigenvalues. + bohr_ru: if radius units should be in bohr's radius units. + """ + self.ljm = lennard_jones_matrix(self.coordinates, + self.atoms_nc, + diag_value=diag_value, + sigma=sigma, + epsilon=epsilon, + size=size, + as_eig=as_eig, + bohr_ru=bohr_ru) + + def gen_am(self, + use_forces=False, + size=23, + bohr_ru=False): + """ + Generate the Adjacency Matrix for the compund. + use_forces: if the use of forces instead of k_cx should be used. + size: compound size. + bohr_ru: if radius units should be in bohr's radius units. + """ + # First, generate the first neighor matrix. + fnm, bonds, forces = first_neighbor_matrix(self.coordinates, + self.atoms_nc, + self.atoms, + size=size, + use_forces=use_forces, + bohr_ru=bohr_ru) + + # Now, generate the adjacency matrix. + self.am = adjacency_matrix(fnm, + bonds, + forces, + size=size) + + def gen_bos(self, + size=23, + stuff='bonds'): + """ + Generate the Bag of Stuff for the compound. + size: compound size. + stuff: elements of the bag, by default the known bag of bonds. + """ + self.bos = bag_of_stuff(self.cm, + self.atoms, + size=size, + stuff=stuff) + + def read_xyz(self, + filename): + """ + Reads an xyz file and adds the corresponding data to the Compound. + filename: (path to) the xyz file. + """ + with open(filename, 'r') as f: + lines = f.readlines() + + self.name = filename.split('/')[-1] + self.n = int(lines[0]) + self.extra = lines[1] + self.atoms = [] + self.atoms_nc = np.empty(self.n, dtype=int) + self.coordinates = np.empty((self.n, 3), dtype=float) + + for i, atom in enumerate(lines[2:self.n + 2]): + atom_d = atom.split() + + self.atoms.append(atom_d[0]) + self.atoms_nc[i] = NUCLEAR_CHARGE[atom_d[0]] + self.coordinates[i] = np.asarray(atom_d[1:4], dtype=float) diff --git a/ml_exp/data.py b/ml_exp/data.py new file mode 100644 index 000000000..243aff168 --- /dev/null +++ b/ml_exp/data.py @@ -0,0 +1,141 @@ +"""MIT License + +Copyright (c) 2019 David Luevano Alvarado + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +NUCLEAR_CHARGE = { + 'H': 1, + 'He': 2, + 'Li': 3, + 'Be': 4, + 'B': 5, + 'C': 6, + 'N': 7, + 'O': 8, + 'F': 9, + 'Ne': 10, + 'Na': 11, + 'Mg': 12, + 'Al': 13, + 'Si': 14, + 'P': 15, + 'S': 16, + 'Cl': 17, + 'Ar': 18, + 'K': 19, + 'Ca': 20, + 'Sc': 21, + 'Ti': 22, + 'V': 23, + 'Cr': 24, + 'Mn': 25, + 'Fe': 26, + 'Co': 27, + 'Ni': 28, + 'Cu': 29, + 'Zn': 30, + 'Ga': 31, + 'Ge': 32, + 'As': 33, + 'Se': 34, + 'Br': 35, + 'Kr': 36, + 'Rb': 37, + 'Sr': 38, + 'Y': 39, + 'Zr': 40, + 'Nb': 41, + 'Mo': 42, + 'Tc': 43, + 'Ru': 44, + 'Rh': 45, + 'Pd': 46, + 'Ag': 47, + 'Cd': 48, + 'In': 49, + 'Sn': 50, + 'Sb': 51, + 'Te': 52, + 'I': 53, + 'Xe': 54, + 'Cs': 55, + 'Ba': 56, + 'La': 57, + 'Ce': 58, + 'Pr': 59, + 'Nd': 60, + 'Pm': 61, + 'Sm': 62, + 'Eu': 63, + 'Gd': 64, + 'Tb': 65, + 'Dy': 66, + 'Ho': 67, + 'Er': 68, + 'Tm': 69, + 'Yb': 70, + 'Lu': 71, + 'Hf': 72, + 'Ta': 73, + 'W': 74, + 'Re': 75, + 'Os': 76, + 'Ir': 77, + 'Pt': 78, + 'Au': 79, + 'Hg': 80, + 'Tl': 81, + 'Pb': 82, + 'Bi': 83, + 'Po': 84, + 'At': 85, + 'Rn': 86, + 'Fr': 87, + 'Ra': 88, + 'Ac': 89, + 'Th': 90, + 'Pa': 91, + 'U': 92, + 'Np': 93, + 'Pu': 94, + 'Am': 95, + 'Cm': 96, + 'Bk': 97, + 'Cf': 98, + 'Es': 99, + 'Fm': 100, + 'Md': 101, + 'No': 102, + 'Lr': 103, + 'Rf': 104, + 'Db': 105, + 'Sg': 106, + 'Bh': 107, + 'Hs': 108, + 'Mt': 109, + 'Ds ': 110, + 'Rg ': 111, + 'Cn ': 112, + 'Nh': 113, + 'Fl': 114, + 'Mc': 115, + 'Lv': 116, + 'Ts': 117, + 'Og': 118} diff --git a/ml_exp/do_ml.py b/ml_exp/do_ml.py index 110242a1d..d8ee415bf 100644 --- a/ml_exp/do_ml.py +++ b/ml_exp/do_ml.py @@ -22,206 +22,206 @@ SOFTWARE. """ import time import numpy as np -from multiprocessing import Process, Pipe from ml_exp.misc import printc -from ml_exp.gauss_kernel import gauss_kernel -from ml_exp.cholesky_solve import cholesky_solve -from ml_exp.read_qm7_data import read_qm7_data -from ml_exp.parallel_create_matrices import parallel_create_matrices - - -def ml(desc_data, - energy_data, - training_size, - desc_type=None, - pipe=None, - test_size=None, - sigma=1000.0, - show_msgs=True): +from ml_exp.kernels import gaussian_kernel +from ml_exp.math import cholesky_solve +from ml_exp.qm7db import qm7db + + +def simple_ml(descriptors, + energies, + training_size, + test_size=None, + sigma=1000.0, + identifier=None, + show_msgs=True): """ - Does the ML methodology. - desc_data: descriptor (or representation) data. - energy_data: energy data associated with desc_data. + Basic ML methodology for a single descriptor type. + descriptors: array of descriptors. + energies: array of energies. training_size: size of the training set to use. - desc_type: string with the name of the descriptor used. - pipe: for multiprocessing purposes. Sends the data calculated - through a pipe. test_size: size of the test set to use. If no size is given, the last remaining molecules are used. sigma: depth of the kernel. - show_msgs: Show debug messages or not. - NOTE: desc_type is just a string and is only for identification purposes. + identifier: string with the name of the descriptor used. + show_msgs: if debug messages should be shown. + NOTE: identifier is just a string and is only for identification purposes. Also, training is done with the first part of the data and - testing with the ending part of the data. + testing with the ending part of the data. """ tic = time.perf_counter() # Initial calculations for later use. - d_len = len(desc_data) - e_len = len(energy_data) + data_size = descriptors.shape[0] - if not desc_type: - desc_type = 'NOT SPECIFIED' + if not identifier: + identifier = 'NOT SPECIFIED' - if d_len != e_len: - printc(''.join(['ERROR. Descriptor data size different ', - 'than energy data size.']), 'RED') - return None + if not data_size == energies.shape[0]: + raise ValueError('Energies size is different than descriptors size.') - if training_size >= d_len: - printc('ERROR. Training size greater or equal than data size.', 'RED') - return None + if training_size >= data_size: + raise ValueError('Training size is greater or equal to the data size.') + # If test_size is not set, it is set to a maximum size of 1500. if not test_size: - test_size = d_len - training_size + test_size = data_size - training_size if test_size > 1500: test_size = 1500 if show_msgs: - printc('{} ML started.'.format(desc_type), 'GREEN') - printc('\tTraining size: {}'.format(training_size), 'CYAN') - printc('\tTest size: {}'.format(test_size), 'CYAN') - printc('\tSigma: {}'.format(sigma), 'CYAN') + printc(f'{identifier} ML started.', 'GREEN') + printc(f'\tTraining size: {training_size}', 'CYAN') + printc(f'\tTest size: {test_size}', 'CYAN') + printc(f'\tSigma: {test_size}', 'CYAN') - X_training = desc_data[:training_size] - Y_training = energy_data[:training_size] - K_training = gauss_kernel(X_training, X_training, sigma) - alpha_ = cholesky_solve(K_training, Y_training) + X_training = descriptors[:training_size] + Y_training = energies[:training_size] + K_training = gaussian_kernel(X_training, X_training, sigma) + alpha = cholesky_solve(K_training, Y_training) - X_test = desc_data[-test_size:] - Y_test = energy_data[-test_size:] - K_test = gauss_kernel(X_test, X_training, sigma) - Y_predicted = np.dot(K_test, alpha_) + X_test = descriptors[-test_size:] + Y_test = energies[-test_size:] + K_test = gaussian_kernel(X_test, X_training, sigma) + Y_predicted = np.dot(K_test, alpha) mae = np.mean(np.abs(Y_predicted - Y_test)) if show_msgs: - printc('\tMAE for {}: {:.4f}'.format(desc_type, mae), 'GREEN') + printc(f'\tMAE for {identifier}: {mae:.4f}', 'GREEN') toc = time.perf_counter() tictoc = toc - tic if show_msgs: - printc('\t{} ML took {:.4f} seconds.'.format(desc_type, tictoc), - 'GREEN') - printc('\t\tTraining size: {}'.format(training_size), 'CYAN') - printc('\t\tTest size: {}'.format(test_size), 'CYAN') - printc('\t\tSigma: {}'.format(sigma), 'CYAN') - - if pipe: - pipe.send([desc_type, training_size, test_size, sigma, mae, tictoc]) + printc(f'\t{identifier} ML took {tictoc:.4f} seconds.', 'GREEN') + printc(f'\t\tTraining size: {training_size}', 'CYAN') + printc(f'\t\tTest size: {test_size}', 'CYAN') + printc(f'\t\tSigma: {sigma}', 'CYAN') return mae, tictoc -def do_ml(min_training_size, - max_training_size=None, - training_increment_size=500, - test_size=None, - ljm_diag_value=None, - ljm_sigma=1.0, - ljm_epsilon=1.0, +def do_ml(db_path='data', + is_shuffled=True, r_seed=111, - save_benchmarks=False, - max_len=25, + diag_value=None, + lj_sigma=1.0, + lj_epsilon=1.0, + use_forces=False, + stuff='bonds', + size=23, as_eig=True, - bohr_radius_units=False, + bohr_ru=False, + training_size=1500, + test_size=None, sigma=1000.0, + identifiers=["CM"], show_msgs=True): """ Main function that does the whole ML process. - min_training_size: minimum training size. - max_training_size: maximum training size. - training_increment_size: training increment size. + db_path: path to the database directory. + is_shuffled: if the resulting list of compounds should be shuffled. + r_seed: random seed to use for the shuffling. + diag_value: if special diagonal value is to be used. + lj_sigma: sigma value. + lj_epsilon: epsilon value. + use_forces: if the use of forces instead of k_cx should be used. + stuff: elements of the bag, by default the known bag of bonds. + size: compound size. + as_eig: if the representation should be as the eigenvalues. + bohr_ru: if radius units should be in bohr's radius units. + training_size: size of the training set to use. test_size: size of the test set to use. If no size is given, the last remaining molecules are used. - ljm_diag_value: if a special diagonal value should be used in lj matrix. - ljm_sigma: sigma value for lj matrix. - ljm_epsilon: epsilon value for lj matrix. - r_seed: random seed to use for the shuffling. - save_benchmarks: if benchmarks should be saved. - max_len: maximum amount of atoms in molecule. - as_eig: if data should be returned as matrix or array of eigenvalues. - bohr_radius_units: if units should be in bohr's radius units. sigma: depth of the kernel. - show_msgs: Show debug messages or not. + identifiers: list of names (strings) of descriptors to use. + show_msgs: if debug messages should be shown. """ - # Initialization time. + if type(identifiers) != list: + raise TypeError('\'identifiers\' is not a list.') + init_time = time.perf_counter() - if not max_training_size: - max_training_size = min_training_size + training_increment_size # Data reading. - molecules, nuclear_charge, energy_pbe0, energy_delta =\ - read_qm7_data(r_seed=r_seed) + tic = time.perf_counter() + compounds, energy_pbe0, energy_delta = qm7db(db_path=db_path, + is_shuffled=is_shuffled, + r_seed=r_seed) + toc = time.perf_counter() + tictoc = toc - tic + if show_msgs: + printc(f'Data reading took {tictoc:.4f} seconds.', 'CYAN') # Matrices calculation. - cm_data, ljm_data = parallel_create_matrices(molecules, - nuclear_charge, - ljm_diag_value, - ljm_sigma, - ljm_epsilon, - max_len, - as_eig, - bohr_radius_units) + tic = time.perf_counter() + for compound in compounds: + if 'CM' in identifiers: + compound.gen_cm(size=size, + as_eig=as_eig, + bohr_ru=bohr_ru) + if 'LJM' in identifiers: + compound.gen_ljm(diag_value=diag_value, + sigma=lj_sigma, + epsilon=lj_epsilon, + size=size, + as_eig=as_eig, + bohr_ru=bohr_ru) + if 'AM' in identifiers: + compound.gen_am(use_forces=use_forces, + size=size, + bohr_ru=bohr_ru) + if 'BOS' in identifiers: + compound.gen_bos(size=size, + stuff=stuff) + + # Create a numpy array for the descriptors. + if 'CM' in identifiers: + cm_data = np.array([comp.cm for comp in compounds], dtype=float) + if 'LJM' in identifiers: + ljm_data = np.array([comp.ljm for comp in compounds], dtype=float) + if 'AM' in identifiers: + am_data = np.array([comp.cm for comp in compounds], dtype=float) + if 'BOS' in identifiers: + bos_data = np.array([comp.bos for comp in compounds], dtype=float) + + toc = time.perf_counter() + tictoc = toc - tic + if show_msgs: + printc(f'Matrices calculation took {tictoc:.4f} seconds.', 'CYAN') # ML calculation. - procs = [] - cm_pipes = [] - ljm_pipes = [] - for i in range(min_training_size, - max_training_size + 1, - training_increment_size): - cm_recv, cm_send = Pipe(False) - p1 = Process(target=ml, - args=(cm_data, - energy_pbe0, - i, - 'CM', - cm_send, - test_size, - sigma, - show_msgs)) - procs.append(p1) - cm_pipes.append(cm_recv) - p1.start() - - ljm_recv, ljm_send = Pipe(False) - p2 = Process(target=ml, - args=(ljm_data, - energy_pbe0, - i, - 'L-JM', - ljm_send, - test_size, - sigma, - show_msgs)) - procs.append(p2) - ljm_pipes.append(ljm_recv) - p2.start() - - cm_bench_results = [] - ljm_bench_results = [] - for cd_pipe, ljd_pipe in zip(cm_pipes, ljm_pipes): - cm_bench_results.append(cd_pipe.recv()) - ljm_bench_results.append(ljd_pipe.recv()) - - for proc in procs: - proc.join() - - if save_benchmarks: - with open('data\\benchmarks.csv', 'a') as save_file: - # save_file.write(''.join(['ml_type,tr_size,te_size,kernel_s,', - # 'mae,time,lj_s,lj_e,date_ran\n'])) - ltime = time.localtime()[:3][::-1] - ljm_se = ',' + str(ljm_sigma) + ',' + str(ljm_epsilon) + ',' - date = '/'.join([str(field) for field in ltime]) - for cm, ljm, in zip(cm_bench_results, ljm_bench_results): - cm_text = ','.join([str(field) for field in cm])\ - + ',' + date + '\n' - ljm_text = ','.join([str(field) for field in ljm])\ - + ljm_se + date + '\n' - save_file.write(cm_text) - save_file.write(ljm_text) + if 'CM' in identifiers: + cm_mae, cm_tictoc = simple_ml(cm_data, + energy_pbe0, + training_size=training_size, + test_size=test_size, + sigma=sigma, + identifier='CM', + show_msgs=show_msgs) + if 'LJM' in identifiers: + ljm_mae, ljm_tictoc = simple_ml(ljm_data, + energy_pbe0, + training_size=training_size, + test_size=test_size, + sigma=sigma, + identifier='LJM', + show_msgs=show_msgs) + if 'AM' in identifiers: + am_mae, am_tictoc = simple_ml(am_data, + energy_pbe0, + training_size=training_size, + test_size=test_size, + sigma=sigma, + identifier='CM', + show_msgs=show_msgs) + if 'BOS' in identifiers: + bos_mae, bos_tictoc = simple_ml(bos_data, + energy_pbe0, + training_size=training_size, + test_size=test_size, + sigma=sigma, + identifier='CM', + show_msgs=show_msgs) # End of program end_time = time.perf_counter() - printc('Program took {:.4f} seconds.'.format(end_time - init_time), - 'CYAN') + totaltime = end_time - init_time + printc(f'Program took {totaltime:.4f} seconds.', 'CYAN') diff --git a/ml_exp/gauss_kernel.py b/ml_exp/kernels.py index 834d62408..3914ffc20 100644 --- a/ml_exp/gauss_kernel.py +++ b/ml_exp/kernels.py @@ -22,27 +22,23 @@ SOFTWARE. """ import math import numpy as np -from ml_exp.frob_norm import frob_norm -def gauss_kernel(X_1, X_2, sigma): +def gaussian_kernel(X1, + X2, + sigma): """ Calculates the Gaussian Kernel. - X_1: first representations. - X_2: second representations. + X1: first representations. + X2: second representations. sigma: kernel width. """ - x1_l = len(X_1) - x1_range = range(x1_l) - x2_l = len(X_2) - x2_range = range(x2_l) - inv_sigma = -0.5 / (sigma*sigma) - K = np.zeros((x1_l, x2_l)) - for i in x1_range: - for j in x2_range: - f_norm = frob_norm(X_1[i] - X_2[j]) + K = np.zeros((X1.shape[0], X2.shape[0]), dtype=float) + for i, x1 in enumerate(X1): + for j, x2 in enumerate(X2): + f_norm = np.linalg.norm(x1 - x2) # print(f_norm) K[i, j] = math.exp(inv_sigma * f_norm) diff --git a/ml_exp/lj_matrix.py b/ml_exp/lj_matrix.py deleted file mode 100644 index 549790755..000000000 --- a/ml_exp/lj_matrix.py +++ /dev/null @@ -1,222 +0,0 @@ -"""MIT License - -Copyright (c) 2019 David Luevano Alvarado - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -import time -import math -import numpy as np -from numpy.linalg import eig -from ml_exp.misc import printc - - -def lj_matrix(mol_data, - nc_data, - diag_value=None, - sigma=1.0, - epsilon=1.0, - max_len=25, - as_eig=True, - bohr_radius_units=False): - """ - Creates the Lennard-Jones Matrix from the molecule data given. - mol_data: molecule data, matrix of atom coordinates. - nc_data: nuclear charge data, array of atom data. - diag_value: if special diagonal value is to be used. - sigma: sigma value. - epsilon: epsilon value. - max_len: maximum amount of atoms in molecule. - as_eig: if data should be returned as matrix or array of eigenvalues. - bohr_radius_units: if units should be in bohr's radius units. - """ - if bohr_radius_units: - conversion_rate = 0.52917721067 - else: - conversion_rate = 1 - - mol_n = len(mol_data) - mol_nr = range(mol_n) - - if not mol_n == len(nc_data): - print(''.join(['Error. Molecule matrix dimension is different ', - 'than the nuclear charge array dimension.'])) - else: - if max_len < mol_n: - print(''.join(['Error. Molecule matrix dimension (mol_n) is ', - 'greater than max_len. Using mol_n.'])) - max_len = None - - if max_len: - lj = np.zeros((max_len, max_len)) - ml_r = range(max_len) - - # Actual calculation of the coulomb matrix. - for i in ml_r: - if i < mol_n: - x_i = mol_data[i, 0] - y_i = mol_data[i, 1] - z_i = mol_data[i, 2] - Z_i = nc_data[i] - else: - break - - for j in ml_r: - if j < mol_n: - x_j = mol_data[j, 0] - y_j = mol_data[j, 1] - z_j = mol_data[j, 2] - - x = (x_i-x_j)**2 - y = (y_i-y_j)**2 - z = (z_i-z_j)**2 - - if i == j: - if diag_value is None: - lj[i, j] = (0.5*Z_i**2.4) - else: - lj[i, j] = diag_value - else: - # Calculations are done after i==j is checked - # so no division by zero is done. - - # A little play with r exponents - # so no square root is calculated. - # Conversion factor is included in r^2. - - # 1/r^2 - r_2 = sigma**2/(conversion_rate**2*(x + y + z)) - - r_6 = math.pow(r_2, 3) - r_12 = math.pow(r_6, 2) - lj[i, j] = (4*epsilon*(r_12 - r_6)) - else: - break - - # Now the value will be returned. - if as_eig: - lj_sorted = np.sort(eig(lj)[0])[::-1] - # Thanks to SO for the following lines of code. - # https://stackoverflow.com/a/43011036 - - # Keep zeros at the end. - mask = lj_sorted != 0. - f_mask = mask.sum(0, keepdims=1) >\ - np.arange(lj_sorted.shape[0]-1, -1, -1) - - f_mask = f_mask[::-1] - lj_sorted[f_mask] = lj_sorted[mask] - lj_sorted[~f_mask] = 0. - - return lj_sorted - - else: - return lj - - else: - lj_temp = [] - # Actual calculation of the coulomb matrix. - for i in mol_nr: - x_i = mol_data[i, 0] - y_i = mol_data[i, 1] - z_i = mol_data[i, 2] - Z_i = nc_data[i] - - lj_row = [] - for j in mol_nr: - x_j = mol_data[j, 0] - y_j = mol_data[j, 1] - z_j = mol_data[j, 2] - - x = (x_i-x_j)**2 - y = (y_i-y_j)**2 - z = (z_i-z_j)**2 - - if i == j: - if not diag_value: - lj_row.append(0.5*Z_i**2.4) - else: - lj_row.append(diag_value) - else: - # Calculations are done after i==j is checked - # so no division by zero is done. - - # A little play with r exponents - # so no square root is calculated. - # Conversion factor is included in r^2. - - # 1/r^2 - r_2 = sigma**2/(conversion_rate**2*(x + y + z)) - - r_6 = math.pow(r_2, 3) - r_12 = math.pow(r_6, 2) - lj_row.append(4*epsilon*(r_12 - r_6)) - - lj_temp.append(np.array(lj_row)) - - lj = np.array(lj_temp) - # Now the value will be returned. - if as_eig: - return np.sort(eig(lj)[0])[::-1] - else: - return lj - - -def lj_matrix_multiple(mol_data, - nc_data, - pipe=None, - diag_value=None, - sigma=1.0, - epsilon=1.0, - max_len=25, - as_eig=True, - bohr_radius_units=False): - """ - Calculates the Lennard-Jones Matrix of multiple molecules. - mol_data: molecule data, matrix of atom coordinates. - nc_data: nuclear charge data, array of atom data. - pipe: for multiprocessing purposes. Sends the data calculated - through a pipe. - diag_value: if special diagonal value is to be used. - sigma: sigma value. - epsilon: epsilon value. - max_len: maximum amount of atoms in molecule. - as_eig: if data should be returned as matrix or array of eigenvalues. - bohr_radius_units: if units should be in bohr's radius units. - """ - printc('L-J Matrices calculation started.', 'CYAN') - tic = time.perf_counter() - - ljm_data = np.array([lj_matrix(mol, - nc, - diag_value, - sigma, - epsilon, - max_len, - as_eig, - bohr_radius_units) - for mol, nc in zip(mol_data, nc_data)]) - - toc = time.perf_counter() - printc('\tL-JM calculation took {:.4f} seconds.'.format(toc-tic), 'GREEN') - - if pipe: - pipe.send(ljm_data) - - return ljm_data diff --git a/ml_exp/cholesky_solve.py b/ml_exp/math.py index bc6a572a3..da1fdf08f 100644 --- a/ml_exp/cholesky_solve.py +++ b/ml_exp/math.py @@ -21,26 +21,25 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import numpy as np -from numpy.linalg import cholesky -def cholesky_solve(K, y): +def cholesky_solve(K, + y): """ - Applies Cholesky decomposition to obtain the 'alpha coeficients'. + Applies Cholesky decomposition to solve Ka=y. Where a are the alpha + coeficients. K: kernel. y: known parameters. """ - # The initial mathematical problem is to solve Ka=y. - - # First, add a small lambda value. + # Add a small lambda value. K[np.diag_indices_from(K)] += 1e-8 # Get the Cholesky decomposition of the kernel. - L = cholesky(K) - size = len(L) + L = np.linalg.cholesky(K) + size = K.shape[0] # Solve Lx=y for x. - x = np.zeros(size) + x = np.zeros(size, dtype=float) x[0] = y[0] / L[0, 0] for i in range(1, size): temp_sum = 0.0 @@ -50,12 +49,9 @@ def cholesky_solve(K, y): # Now, solve LTa=x for a. L2 = L.T - a = np.zeros(size) - a_ms = size - 1 - a[a_ms] = x[a_ms] / L2[a_ms, a_ms] - # Because of the form of L2 (upper triangular matriz), an inversion of - # range() needs to be done. - for i in range(0, a_ms)[::-1]: + a = np.zeros(size, dtype=float) + a[size - 1] = x[size - 1] / L2[size - 1, size - 1] + for i in range(0, size - 1)[::-1]: temp_sum = 0.0 for j in range(i, size)[::-1]: temp_sum += L2[i, j] * a[j] diff --git a/ml_exp/misc.py b/ml_exp/misc.py index e9142b05f..4bd68eefc 100644 --- a/ml_exp/misc.py +++ b/ml_exp/misc.py @@ -29,29 +29,25 @@ init() def printc(text, color): """ Prints texts normaly, but in color. Using colorama. - text: string with the text to print. + text: text to print. color: color to be used, same as available in colorama. """ - color_dic = {'BLACK': Fore.BLACK, - 'RED': Fore.RED, - 'GREEN': Fore.GREEN, - 'YELLOW': Fore.YELLOW, - 'BLUE': Fore.BLUE, - 'MAGENTA': Fore.MAGENTA, - 'CYAN': Fore.CYAN, - 'WHITE': Fore.WHITE, - 'RESET': Fore.RESET} - - color_dic_keys = color_dic.keys() - if color not in color_dic_keys: - print(Fore.RED - + '\'{}\' not found, using default color.'.format(color) - + Style.RESET_ALL) - actual_color = Fore.RESET - else: - actual_color = color_dic[color] - - print(actual_color + text + Style.RESET_ALL) + color = color.upper() + colors = {'BLACK': Fore.BLACK, + 'RED': Fore.RED, + 'GREEN': Fore.GREEN, + 'YELLOW': Fore.YELLOW, + 'BLUE': Fore.BLUE, + 'MAGENTA': Fore.MAGENTA, + 'CYAN': Fore.CYAN, + 'WHITE': Fore.WHITE, + 'RESET': Fore.RESET} + + av_colors = colors.keys() + if color not in av_colors: + raise KeyError(f'{color} not found.') + + print(colors[color] + text + Style.RESET_ALL) def plot_benchmarks(): diff --git a/ml_exp/parallel_create_matrices.py b/ml_exp/parallel_create_matrices.py deleted file mode 100644 index a68122d48..000000000 --- a/ml_exp/parallel_create_matrices.py +++ /dev/null @@ -1,85 +0,0 @@ -"""MIT License - -Copyright (c) 2019 David Luevano Alvarado - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -from multiprocessing import Process, Pipe -from ml_exp.c_matrix import c_matrix_multiple -from ml_exp.lj_matrix import lj_matrix_multiple - - -def parallel_create_matrices(mol_data, - nc_data, - ljm_diag_value=None, - ljm_sigma=1.0, - ljm_epsilon=1.0, - max_len=25, - as_eig=True, - bohr_radius_units=False): - """ - Creates the Coulomb and L-J matrices in parallel. - mol_data: molecule data, matrix of atom coordinates. - nc_data: nuclear charge data, array of atom data. - ljm_diag_value: if special diagonal value is to be used for lj matrix. - ljm_sigma: sigma value for lj matrix. - ljm_epsilon: psilon value for lj matrix. - max_len: maximum amount of atoms in molecule. - as_eig: if data should be returned as matrix or array of eigenvalues. - bohr_radius_units: if units should be in bohr's radius units. - """ - - # Matrices calculation. - procs = [] - pipes = [] - - cm_recv, cm_send = Pipe(False) - p1 = Process(target=c_matrix_multiple, - args=(mol_data, - nc_data, - cm_send, - max_len, - as_eig, - bohr_radius_units)) - procs.append(p1) - pipes.append(cm_recv) - p1.start() - - ljm_recv, ljm_send = Pipe(False) - p2 = Process(target=lj_matrix_multiple, - args=(mol_data, - nc_data, - ljm_send, - ljm_diag_value, - ljm_sigma, - ljm_epsilon, - max_len, - as_eig, - bohr_radius_units)) - procs.append(p2) - pipes.append(ljm_recv) - p2.start() - - cm_data = pipes[0].recv() - ljm_data = pipes[1].recv() - - for proc in procs: - proc.join() - - return cm_data, ljm_data diff --git a/ml_exp/frob_norm.py b/ml_exp/qm7db.py index 4c3a2945d..3fd46b6bc 100644 --- a/ml_exp/frob_norm.py +++ b/ml_exp/qm7db.py @@ -20,32 +20,36 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import math +from ml_exp.compound import Compound +import numpy as np +import random -def frob_norm(array): +def qm7db(db_path='data', + is_shuffled=True, + r_seed=111): """ - Calculates the frobenius norm of a given array or matrix. - array: array of data. + Creates a list of compounds with the qm7 database. + db_path: path to the database directory. + is_shuffled: if the resulting list of compounds should be shuffled. + r_seed: random seed to use for the shuffling. """ + fname = f'{db_path}/hof_qm7.txt' + with open(fname, 'r') as f: + lines = f.readlines() - arr_sh_len = len(array.shape) - arr_range = range(len(array)) - fn = 0.0 + compounds = [] + for i, line in enumerate(lines): + line = line.split() + compounds.append(Compound(f'{db_path}/{line[0]}')) + compounds[i].pbe0 = float(line[1]) + compounds[i].delta = float(line[1]) - float(line[2]) - # If it is a 'vector'. - if arr_sh_len == 1: - for i in arr_range: - fn += array[i]*array[i] + if is_shuffled: + random.seed(r_seed) + random.shuffle(compounds) - return math.sqrt(fn) + e_pbe0 = np.array([compound.pbe0 for compound in compounds], dtype=float) + e_delta = np.array([compound.delta for compound in compounds], dtype=float) - # If it is a matrix. - elif arr_sh_len == 2: - for i in arr_range: - for j in arr_range: - fn += array[i, j]*array[i, j] - - return math.sqrt(fn) - else: - print('Error. Array size greater than 2 ({}).'.format(arr_sh_len)) + return compounds, e_pbe0, e_delta diff --git a/ml_exp/read_qm7_data.py b/ml_exp/read_qm7_data.py deleted file mode 100644 index 81be05a17..000000000 --- a/ml_exp/read_qm7_data.py +++ /dev/null @@ -1,162 +0,0 @@ -"""MIT License - -Copyright (c) 2019 David Luevano Alvarado - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -import os -import time -import numpy as np -import random -from ml_exp.misc import printc - - -# 'periodic_table_of_elements.txt' retrieved from -# https://gist.github.com/GoodmanSciences/c2dd862cd38f21b0ad36b8f96b4bf1ee -def read_nc_data(data_path): - """ - Reads nuclear charge data from file and returns a dictionary. - data_path: path to the data directory. - """ - fname = 'periodic_table_of_elements.txt' - with open(''.join([data_path, '/', fname]), 'r') as infile: - temp_lines = infile.readlines() - - del temp_lines[0] - - lines = [] - for temp_line in temp_lines: - new_line = temp_line.split(sep=',') - lines.append(new_line) - - # Dictionary of nuclear charge. - return {line[2]: int(line[0]) for line in lines} - - -# 'hof_qm7.txt.txt' retrieved from -# https://github.com/qmlcode/tutorial -def read_db_data(zi_data, - data_path, - r_seed=111, - return_atoms=False): - """ - Reads molecule database and extracts - its contents as usable variables. - zi_data: dictionary containing nuclear charge data. - data_path: path to the data directory. - r_seed: random seed to use for the shuffling. - return_atoms: if atom list should be returned. - """ - os.chdir(data_path) - - fname = 'hof_qm7.txt' - with open(fname, 'r') as infile: - lines = infile.readlines() - - # Temporary energy dictionary. - energy_temp = dict() - - for line in lines: - xyz_data = line.split() - - xyz_name = xyz_data[0] - hof = float(xyz_data[1]) - dftb = float(xyz_data[2]) - # print(xyz_name, hof, dftb) - - energy_temp[xyz_name] = np.array([hof, hof - dftb]) - - # Use a random seed. - random.seed(r_seed) - - et_keys = list(energy_temp.keys()) - random.shuffle(et_keys) - - # Temporary energy dictionary, shuffled. - energy_temp_shuffled = dict() - for key in et_keys: - energy_temp_shuffled.update({key: energy_temp[key]}) - - mol_data = [] - mol_nc_data = [] - atoms = [] - # Actual reading of the xyz files. - for i, k in enumerate(energy_temp_shuffled.keys()): - with open(k, 'r') as xyz_file: - lines = xyz_file.readlines() - - len_lines = len(lines) - mol_temp_data = [] - mol_nc_temp_data = np.array(np.zeros(len_lines-2)) - atoms_temp = [] - for j, line in enumerate(lines[2:len_lines]): - line_list = line.split() - - atoms_temp.append(line_list[0]) - mol_nc_temp_data[j] = float(zi_data[line_list[0]]) - line_data = np.array(np.asarray(line_list[1:4], dtype=float)) - mol_temp_data.append(line_data) - - mol_data.append(mol_temp_data) - mol_nc_data.append(mol_nc_temp_data) - atoms.append(atoms_temp) - - # Convert everything to a numpy array. - molecules = np.array([np.array(mol) for mol in mol_data]) - nuclear_charge = np.array([nc_d for nc_d in mol_nc_data]) - energy_pbe0 = np.array([energy_temp_shuffled[k][0] - for k in energy_temp_shuffled.keys()]) - energy_delta = np.array([energy_temp_shuffled[k][1] - for k in energy_temp_shuffled.keys()]) - - if return_atoms: - return molecules, nuclear_charge, energy_pbe0, energy_delta, atoms - return molecules, nuclear_charge, energy_pbe0, energy_delta - - -def read_qm7_data(data_path='data', - r_seed=111, - return_atoms=False): - """ - Reads all the qm7 data. - data_path: path to the data directory. - r_seed: random seed to use for the shuffling. - return_atoms: if atom list should be returned. - """ - tic = time.perf_counter() - printc('Data reading started.', 'CYAN') - - init_path = os.getcwd() - os.chdir(data_path) - data_path = os.getcwd() - - zi_data = read_nc_data(data_path) - if return_atoms: - molecules, nuclear_charge, energy_pbe0, energy_delta, atoms = \ - read_db_data(zi_data, data_path, r_seed, return_atoms) - else: - molecules, nuclear_charge, energy_pbe0, energy_delta = \ - read_db_data(zi_data, data_path, r_seed) - - os.chdir(init_path) - toc = time.perf_counter() - printc('\tData reading took {:.4f} seconds.'.format(toc-tic), 'GREEN') - if return_atoms: - return molecules, nuclear_charge, energy_pbe0, energy_delta, atoms - return molecules, nuclear_charge, energy_pbe0, energy_delta diff --git a/ml_exp/representations.py b/ml_exp/representations.py new file mode 100644 index 000000000..ea079595e --- /dev/null +++ b/ml_exp/representations.py @@ -0,0 +1,384 @@ +"""MIT License + +Copyright (c) 2019 David Luevano Alvarado + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +import numpy as np +from collections import Counter + + +def coulomb_matrix(coords, + nc, + size=23, + as_eig=True, + bohr_ru=False): + """ + Creates the Coulomb Matrix from the molecule data given. + coords: compound coordinates. + nc: nuclear charge data. + size: compound size. + as_eig: if the representation should be as the eigenvalues. + bohr_ru: if radius units should be in bohr's radius units. + """ + if bohr_ru: + cr = 0.52917721067 + else: + cr = 1.0 + + n = coords.shape[0] + + if not n == nc.shape[0]: + raise ValueError('Compound size is different than the nuclear charge \ +size. Arrays are not of the right shape.') + + if size < n: + print('Error. Compound size (n) is greater han (size). Using (n)', + 'instead of (size).') + size = n + + cm = np.zeros((size, size), dtype=float) + + # Actual calculation of the coulomb matrix. + for i, xyz_i in enumerate(coords): + for j, xyz_j in enumerate(coords): + rv = xyz_i - xyz_j + r = np.linalg.norm(rv)/cr + if i == j: + cm[i, j] = (0.5*nc[i]**2.4) + else: + cm[i, j] = nc[i]*nc[j]/r + + # Now the value will be returned. + if as_eig: + cm_sorted = np.sort(np.linalg.eig(cm)[0])[::-1] + # Thanks to SO for the following lines of code. + # https://stackoverflow.com/a/43011036 + + # Keep zeros at the end. + mask = cm_sorted != 0. + f_mask = mask.sum(0, keepdims=1) >\ + np.arange(cm_sorted.shape[0]-1, -1, -1) + + f_mask = f_mask[::-1] + cm_sorted[f_mask] = cm_sorted[mask] + cm_sorted[~f_mask] = 0. + + return cm_sorted + else: + return cm + + +def lennard_jones_matrix(coords, + nc, + diag_value=None, + sigma=1.0, + epsilon=1.0, + size=23, + as_eig=True, + bohr_ru=False): + """ + Creates the Lennard-Jones Matrix from the molecule data given. + coords: compound coordinates. + nc: nuclear charge data. + diag_value: if special diagonal value is to be used. + sigma: sigma value. + epsilon: epsilon value. + size: compound size. + as_eig: if the representation should be as the eigenvalues. + bohr_ru: if radius units should be in bohr's radius units. + """ + if bohr_ru: + cr = 0.52917721067 + else: + cr = 1.0 + + n = coords.shape[0] + + if not n == nc.shape[0]: + raise ValueError('Compound size is different than the nuclear charge \ +size. Arrays are not of the right shape.') + + if size < n: + print('Error. Compound size (n) is greater han (size). Using (n)', + 'instead of (size).') + size = n + + lj = np.zeros((size, size), dtype=float) + + # Actual calculation of the lennard-jones matrix. + for i, xyz_i in enumerate(coords): + for j, xyz_j in enumerate(coords): + if i == j: + if diag_value is None: + lj[i, j] = (0.5*nc[i]**2.4) + else: + lj[i, j] = diag_value + else: + # Calculations are done after i==j is checked + # so no division by zero is done. + + # A little play with r exponents + # so no square root is calculated. + # Conversion factor is included in r^2. + rv = xyz_i - xyz_j + r = np.linalg.norm(rv)/cr + + # 1/r^n + r_2 = sigma**2/r**2 + r_6 = r_2**3 + r_12 = r_6**2 + lj[i, j] = (4*epsilon*(r_12 - r_6)) + + # Now the value will be returned. + if as_eig: + lj_sorted = np.sort(np.linalg.eig(lj)[0])[::-1] + # Thanks to SO for the following lines of code. + # https://stackoverflow.com/a/43011036 + + # Keep zeros at the end. + mask = lj_sorted != 0. + f_mask = mask.sum(0, keepdims=1) >\ + np.arange(lj_sorted.shape[0]-1, -1, -1) + + f_mask = f_mask[::-1] + lj_sorted[f_mask] = lj_sorted[mask] + lj_sorted[~f_mask] = 0. + + return lj_sorted + else: + return lj + + +def first_neighbor_matrix(coords, + nc, + atoms, + size=23, + use_forces=False, + bohr_ru=False): + """ + Creates the First Neighbor Matrix from the molecule data given. + coords: compound coordinates. + nc: nuclear charge data. + atoms: list of atoms. + use_forces: if the use of forces instead of k_cx should be used. + bohr_ru: if radius units should be in bohr's radius units. + NOTE: Bond distance of carbon to other elements + are (for atoms present in the qm7 dataset): + C: 1.20 - 1.54 A (Edited to 1.19 - 1.54 A) + H: 1.06 - 1.12 A + O: 1.43 - 2.15 A + N: 1.47 - 2.10 A + S: 1.81 - 2.55 A + """ + if bohr_ru: + cr = 0.52917721067 + else: + cr = 1.0 + + n = coords.shape[0] + + if not n == nc.shape[0]: + raise ValueError('Compound size is different than the nuclear charge \ +size. Arrays are not of the right shape.') + + if size < n: + print('Error. Compound size (n) is greater han (size). Using (n)', + 'instead of (size).') + size = n + + # Possible bonds. + cc_bond = sorted(['C', 'C']) + ch_bond = sorted(['C', 'H']) + co_bond = sorted(['C', 'O']) + cn_bond = sorted(['C', 'N']) + cs_bond = sorted(['C', 'S']) + + fnm = np.zeros((size, size), dtype=float) + + bonds = [] + forces = [] + for i, xyz_i in enumerate(coords): + for j, xyz_j in enumerate(coords): + # Ignore the diagonal. + if i != j: + bond = sorted([atoms[i], atoms[j]]) + rv = xyz_i - xyz_j + r = np.linalg.norm(rv)/cr + + # Check for each type of bond. + if (cc_bond == bond) and (r >= 1.19 and r <= 1.54): + fnm[i, j] = 1.0 + if j > i: + bonds.append((i, j)) + if use_forces: + forces.append(rv*nc[i]*nc[j]/r**3) + elif (ch_bond == bond) and (r >= 1.06 and r <= 1.12): + fnm[i, j] = 1.0 + if j > i: + bonds.append((i, j)) + if use_forces: + forces.append(rv*nc[i]*nc[j]/r**3) + elif (co_bond == bond) and (r >= 1.43 and r <= 2.15): + fnm[i, j] = 0.8 + if j > i: + bonds.append((i, j)) + if use_forces: + forces.append(rv*nc[i]*nc[j]/r**3) + elif (cn_bond == bond) and (r >= 1.47 and r <= 2.10): + fnm[i, j] = 1.0 + if j > i: + bonds.append((i, j)) + if use_forces: + forces.append(rv*nc[i]*nc[j]/r**3) + elif (cs_bond == bond) and (r >= 1.81 and r <= 2.55): + fnm[i, j] = 0.7 + if j > i: + bonds.append((i, j)) + if use_forces: + forces.append(rv*nc[i]*nc[j]/r**3) + + return fnm, bonds, forces + + +def adjacency_matrix(fnm, + bonds, + forces, + size=22): + """ + Calculates the adjacency matrix given the bond list. + fnm: first neighbour matrix. + bonds: list of bonds (tuple of indexes). + forces: list of forces. + size: compund size. + """ + n = len(bonds) + + if size < n: + print('Error. Compound size (n) is greater han (size). Using (n)\ + instead of (size).') + size = n + + am = np.zeros((size, size), dtype=float) + + for i, bond_i in enumerate(bonds): + for j, bond_j in enumerate(bonds): + # Ignore the diagonal. + if i != j: + if (bond_i[0] in bond_j) or (bond_i[1] in bond_j): + if forces: + am[i, j] = np.dot(forces[i], forces[j]) + else: + am[i, j] = fnm[bond_i[0], bond_i[1]] + + return am + + +def check_bond(bags, + bond): + """ + Checks if a bond is in a bag. + bags: list of bags, containing a bag per entry, which in turn + contains a list of bond-values. + bond: bond to check. + """ + if bags == []: + return False, None + + for i, bag in enumerate(bags): + if bag[0] == bond: + return True, i + + return False, None + + +def bag_of_stuff(cm, + atoms, + size=23, + stuff='bonds'): + """ + Creates the Bag of Bonds using the Coulomb Matrix. + cm: coulomb matrix. + atoms: list of atoms. + size: compound size. + """ + if cm is None: + raise ValueError('Coulomb Matrix hasn\'t been initialized for the \ +current compound.') + + if cm.ndim == 1: + raise ValueError('Coulomb Matrix (CM) dimension is 1. Maybe it was \ +generated as the vector of eigenvalues, try (re-)generating the CM.') + + n = len(atoms) + + if size < n: + print('Error. Compound size (n) is greater han (size). Using (n)', + 'instead of (size).') + size = n + + # Bond max length, calculated using only the upper triangular matrix. + bond_size = int((size * size - size)/2 + size) + + # List where each bag data is stored. + bags = [] + for i, atom_i in enumerate(atoms): + for j, atom_j in enumerate(atoms): + # Work only in the upper triangle of the coulomb matrix. + if j >= i: + # Get the string of the current bond. + if i == j: + current_bond = atom_i + else: + current_bond = ''.join(sorted([atom_i, atom_j])) + + # Check if that bond is already in a bag. + checker = check_bond(bags, current_bond) + # Either create a new bag or add values to an existing one. + if not checker[0]: + bags.append([current_bond, cm[i, j]]) + else: + bags[checker[1]].append(cm[i, j]) + + # Create the actual bond list ordered. + atom_counter = Counter(atoms) + atom_list = sorted(list(set(atoms))) + bonds = [] + for i, a_i in enumerate(atom_list): + if atom_counter[a_i] > 1: + for a_j in atom_list[i:]: + bonds.append(''.join(sorted([a_i, a_j]))) + bonds = atom_list + bonds + + # Create the final vector for the bos. + bos = np.zeros(bond_size, dtype=float) + c_i = 0 + for i, bond in enumerate(bonds): + checker = check_bond(bags, bond) + if checker[0]: + for j, num in enumerate(sorted(bags[checker[1]][1:])[::-1]): + # Use c_i as the index for bos if the zero padding should + # be at the end of the vector instead of between each bond. + bos[i*size + j] = num + c_i += 1 + else: + print(f'Error. Bond {bond} from bond list coudn\'t be found', + 'in the bags list. This could be a case where the atom', + 'is only present oncce in the molecule.') + return bos diff --git a/requirements.txt b/requirements.txt index 437e0428a..4b242770c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ colorama==0.4.3 numpy==1.18.1 -pandas==0.25.3
\ No newline at end of file +pandas==1.0.1 |