From fe4e5afebaf7320ba5bed7f159c84bc5d1e5faec Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Sat, 29 Feb 2020 06:23:02 -0700 Subject: Rename bos to bob --- ml_exp/compound.py | 9 +++------ ml_exp/representations.py | 15 +++++++-------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/ml_exp/compound.py b/ml_exp/compound.py index 1598fa482..4dbd8c95c 100644 --- a/ml_exp/compound.py +++ b/ml_exp/compound.py @@ -116,18 +116,15 @@ class Compound: forces, size=size) - def gen_bos(self, - size=23, - stuff='bonds'): + def gen_bob(self, + size=23): """ Generate the Bag of Stuff for the compound. size: compound size. - stuff: elements of the bag, by default the known bag of bonds. """ self.bos = bag_of_stuff(self.cm, self.atoms, - size=size, - stuff=stuff) + size=size) def read_xyz(self, filename): diff --git a/ml_exp/representations.py b/ml_exp/representations.py index ea079595e..d4e79c7d9 100644 --- a/ml_exp/representations.py +++ b/ml_exp/representations.py @@ -308,10 +308,9 @@ def check_bond(bags, return False, None -def bag_of_stuff(cm, +def bag_of_bonds(cm, atoms, - size=23, - stuff='bonds'): + size=23): """ Creates the Bag of Bonds using the Coulomb Matrix. cm: coulomb matrix. @@ -366,19 +365,19 @@ generated as the vector of eigenvalues, try (re-)generating the CM.') bonds.append(''.join(sorted([a_i, a_j]))) bonds = atom_list + bonds - # Create the final vector for the bos. - bos = np.zeros(bond_size, dtype=float) + # Create the final vector for the bob. + bob = np.zeros(bond_size, dtype=float) c_i = 0 for i, bond in enumerate(bonds): checker = check_bond(bags, bond) if checker[0]: for j, num in enumerate(sorted(bags[checker[1]][1:])[::-1]): - # Use c_i as the index for bos if the zero padding should + # Use c_i as the index for bob if the zero padding should # be at the end of the vector instead of between each bond. - bos[i*size + j] = num + bob[i*size + j] = num c_i += 1 else: print(f'Error. Bond {bond} from bond list coudn\'t be found', 'in the bags list. This could be a case where the atom', 'is only present oncce in the molecule.') - return bos + return bob -- cgit v1.2.3-70-g09d2 From d71927fc791e51bf383b88bd3d943d8e3311ee8a Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Sat, 29 Feb 2020 06:23:28 -0700 Subject: Rename bos to bob --- ml_exp/compound.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ml_exp/compound.py b/ml_exp/compound.py index 4dbd8c95c..98e31532f 100644 --- a/ml_exp/compound.py +++ b/ml_exp/compound.py @@ -46,7 +46,7 @@ class Compound: self.cm = None self.ljm = None self.am = None - self.bos = None + self.bob = None if xyz is not None: self.read_xyz(xyz) @@ -122,7 +122,7 @@ class Compound: Generate the Bag of Stuff for the compound. size: compound size. """ - self.bos = bag_of_stuff(self.cm, + self.bob = bag_of_stuff(self.cm, self.atoms, size=size) -- cgit v1.2.3-70-g09d2 From 6dd33013265ce38122a623b285e0b39f2496aaf8 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Sat, 29 Feb 2020 07:19:46 -0700 Subject: Rewrite fnm --- ml_exp/compound.py | 33 +++++++++++++++++---------- ml_exp/representations.py | 58 ++++++++++++++++------------------------------- 2 files changed, 40 insertions(+), 51 deletions(-) diff --git a/ml_exp/compound.py b/ml_exp/compound.py index 98e31532f..ceace6984 100644 --- a/ml_exp/compound.py +++ b/ml_exp/compound.py @@ -33,20 +33,29 @@ class Compound: Initialization of the Compound. xyz: (path to) the xyz file. """ + # General compound data. self.name = None - self.n = None - self.extra = None # In case a comment is in the compound file. + self.extra = None self.atoms = None self.atoms_nc = None self.coordinates = None self.pbe0 = None self.delta = None + # Computed data. self.cm = None self.ljm = None + self.fnm = None self.am = None self.bob = None + self.bo_atoms = None + self.bok_cx = None + self.bof = None + + # Helping data. + self.bonds = None + self.forces = None if xyz is not None: self.read_xyz(xyz) @@ -103,17 +112,17 @@ class Compound: bohr_ru: if radius units should be in bohr's radius units. """ # First, generate the first neighor matrix. - fnm, bonds, forces = first_neighbor_matrix(self.coordinates, - self.atoms_nc, - self.atoms, - size=size, - use_forces=use_forces, - bohr_ru=bohr_ru) - + fnm_data = first_neighbor_matrix(self.coordinates, + self.atoms_nc, + self.atoms, + size=size, + use_forces=use_forces, + bohr_ru=bohr_ru) + self.fnm, self.bonds, self.forces = fnm_data # Now, generate the adjacency matrix. - self.am = adjacency_matrix(fnm, - bonds, - forces, + self.am = adjacency_matrix(self.fnm, + self.bonds, + self.forces, size=size) def gen_bob(self, diff --git a/ml_exp/representations.py b/ml_exp/representations.py index d4e79c7d9..515821290 100644 --- a/ml_exp/representations.py +++ b/ml_exp/representations.py @@ -181,11 +181,11 @@ def first_neighbor_matrix(coords, bohr_ru: if radius units should be in bohr's radius units. NOTE: Bond distance of carbon to other elements are (for atoms present in the qm7 dataset): - C: 1.20 - 1.54 A (Edited to 1.19 - 1.54 A) - H: 1.06 - 1.12 A - O: 1.43 - 2.15 A - N: 1.47 - 2.10 A - S: 1.81 - 2.55 A + C: 1.19 - 1.54 A, 1.0 + H: 1.06 - 1.12 A, 1.0 + O: 1.43 - 2.15 A, 0.8 + N: 1.47 - 2.10 A, 1.0 + S: 1.81 - 2.55 A, 0.7 """ if bohr_ru: cr = 0.52917721067 @@ -210,7 +210,11 @@ size. Arrays are not of the right shape.') cn_bond = sorted(['C', 'N']) cs_bond = sorted(['C', 'S']) - fnm = np.zeros((size, size), dtype=float) + pos_bonds = {cc_bond: (1.19, 1.54), ch_bond: (1.06, 1.12), + co_bond: (1.43, 2.15), cn_bond: (1.47, 2.19), + cs_bond: (1.81, 2.55)} + + fnm = np.zeros((size, size), dtype=bool) bonds = [] forces = [] @@ -219,39 +223,15 @@ size. Arrays are not of the right shape.') # Ignore the diagonal. if i != j: bond = sorted([atoms[i], atoms[j]]) - rv = xyz_i - xyz_j - r = np.linalg.norm(rv)/cr - - # Check for each type of bond. - if (cc_bond == bond) and (r >= 1.19 and r <= 1.54): - fnm[i, j] = 1.0 - if j > i: - bonds.append((i, j)) - if use_forces: - forces.append(rv*nc[i]*nc[j]/r**3) - elif (ch_bond == bond) and (r >= 1.06 and r <= 1.12): - fnm[i, j] = 1.0 - if j > i: - bonds.append((i, j)) - if use_forces: - forces.append(rv*nc[i]*nc[j]/r**3) - elif (co_bond == bond) and (r >= 1.43 and r <= 2.15): - fnm[i, j] = 0.8 - if j > i: - bonds.append((i, j)) - if use_forces: - forces.append(rv*nc[i]*nc[j]/r**3) - elif (cn_bond == bond) and (r >= 1.47 and r <= 2.10): - fnm[i, j] = 1.0 - if j > i: - bonds.append((i, j)) - if use_forces: - forces.append(rv*nc[i]*nc[j]/r**3) - elif (cs_bond == bond) and (r >= 1.81 and r <= 2.55): - fnm[i, j] = 0.7 - if j > i: - bonds.append((i, j)) - if use_forces: + if bond in pos_bonds.keys(): + r_min = pos_bonds[bond][0] + r_max = pos_bonds[bond][1] + rv = xyz_i - xyz_j + r = np.linalg.norm(rv)/cr + if r >= r_min and r <= r_max: + fnm[i, j] = True + if j > i: + bonds.append((i, j)) forces.append(rv*nc[i]*nc[j]/r**3) return fnm, bonds, forces -- cgit v1.2.3-70-g09d2 From 5bf923679707fd1603a6f73ca4d0ae8ec39e7858 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Sat, 29 Feb 2020 07:40:33 -0700 Subject: Almost completely rewrite fnm to hd --- ml_exp/__init__.py | 4 ++-- ml_exp/compound.py | 38 +++++++++++++++++++++++--------------- ml_exp/representations.py | 36 +++++++++++++++++++----------------- 3 files changed, 44 insertions(+), 34 deletions(-) diff --git a/ml_exp/__init__.py b/ml_exp/__init__.py index 48ece56c8..9e64d7754 100644 --- a/ml_exp/__init__.py +++ b/ml_exp/__init__.py @@ -22,7 +22,7 @@ SOFTWARE. """ from ml_exp.compound import Compound from ml_exp.representations import coulomb_matrix, lennard_jones_matrix,\ - first_neighbor_matrix, adjacency_matrix, check_bond, bag_of_stuff + get_helping_data, adjacency_matrix, check_bond, bag_of_stuff from ml_exp.math import cholesky_solve from ml_exp.qm7db import qm7db from ml_exp.do_ml import simple_ml, do_ml @@ -30,7 +30,7 @@ from ml_exp.do_ml import simple_ml, do_ml __all__ = ['Compound', 'coulomb_matrix', 'lennard_jones_matrix', - 'first_neighbor_matrix', + 'get_helping_data', 'adjacency_matrix', 'check_bond', 'bag_of_stuff', diff --git a/ml_exp/compound.py b/ml_exp/compound.py index ceace6984..26e393510 100644 --- a/ml_exp/compound.py +++ b/ml_exp/compound.py @@ -23,7 +23,7 @@ SOFTWARE. import numpy as np from ml_exp.data import NUCLEAR_CHARGE from ml_exp.representations import coulomb_matrix, lennard_jones_matrix,\ - first_neighbor_matrix, adjacency_matrix, bag_of_stuff + get_helping_data, adjacency_matrix, bag_of_stuff class Compound: @@ -46,7 +46,6 @@ class Compound: # Computed data. self.cm = None self.ljm = None - self.fnm = None self.am = None self.bob = None self.bo_atoms = None @@ -54,8 +53,11 @@ class Compound: self.bof = None # Helping data. + self.fnm = None self.bonds = None - self.forces = None + self.bonds_i = None + self.bonds_k = None + self.bonds_f = None if xyz is not None: self.read_xyz(xyz) @@ -101,28 +103,34 @@ class Compound: as_eig=as_eig, bohr_ru=bohr_ru) - def gen_am(self, - use_forces=False, + def gen_hd(self, size=23, bohr_ru=False): """ + Generate the helping data for use in Adjacency Matrix, for example. + size: compund size. + bohr_ru: if radius units should be in bohr's radius units. + """ + hp = get_helping_data(self.coordinates, + self.nc, + self.atoms, + size=size, + bohr_ru=bohr_ru) + + self.fnm, self.bonds, self.bonds_i, self.bonds_k, self.bonds_f = hp + + def gen_am(self, + use_forces=False, + size=23): + """ Generate the Adjacency Matrix for the compund. use_forces: if the use of forces instead of k_cx should be used. size: compound size. bohr_ru: if radius units should be in bohr's radius units. """ - # First, generate the first neighor matrix. - fnm_data = first_neighbor_matrix(self.coordinates, - self.atoms_nc, - self.atoms, - size=size, - use_forces=use_forces, - bohr_ru=bohr_ru) - self.fnm, self.bonds, self.forces = fnm_data - # Now, generate the adjacency matrix. self.am = adjacency_matrix(self.fnm, self.bonds, - self.forces, + self.bonds_f, size=size) def gen_bob(self, diff --git a/ml_exp/representations.py b/ml_exp/representations.py index 515821290..5e684314b 100644 --- a/ml_exp/representations.py +++ b/ml_exp/representations.py @@ -166,18 +166,17 @@ size. Arrays are not of the right shape.') return lj -def first_neighbor_matrix(coords, - nc, - atoms, - size=23, - use_forces=False, - bohr_ru=False): +def get_helping_data(coords, + nc, + atoms, + size=23, + bohr_ru=False): """ - Creates the First Neighbor Matrix from the molecule data given. + Creates helping data such as the First Neighbor Matrix for the compound. coords: compound coordinates. nc: nuclear charge data. atoms: list of atoms. - use_forces: if the use of forces instead of k_cx should be used. + size: compund size. bohr_ru: if radius units should be in bohr's radius units. NOTE: Bond distance of carbon to other elements are (for atoms present in the qm7 dataset): @@ -209,15 +208,15 @@ size. Arrays are not of the right shape.') co_bond = sorted(['C', 'O']) cn_bond = sorted(['C', 'N']) cs_bond = sorted(['C', 'S']) - - pos_bonds = {cc_bond: (1.19, 1.54), ch_bond: (1.06, 1.12), - co_bond: (1.43, 2.15), cn_bond: (1.47, 2.19), - cs_bond: (1.81, 2.55)} + pos_bonds = {cc_bond: (1.19, 1.54, 1.0), ch_bond: (1.06, 1.12, 1.0), + co_bond: (1.43, 2.15, 0.8), cn_bond: (1.47, 2.19, 1.0), + cs_bond: (1.81, 2.55, 0.7)} fnm = np.zeros((size, size), dtype=bool) - bonds = [] - forces = [] + bonds_i = [] + bonds_k = [] + bonds_f = [] for i, xyz_i in enumerate(coords): for j, xyz_j in enumerate(coords): # Ignore the diagonal. @@ -230,11 +229,14 @@ size. Arrays are not of the right shape.') r = np.linalg.norm(rv)/cr if r >= r_min and r <= r_max: fnm[i, j] = True + # Only add to the list if in the upper triangle. if j > i: - bonds.append((i, j)) - forces.append(rv*nc[i]*nc[j]/r**3) + bonds.append(bond) + bonds_i.append((i, j)) + bonds_k.append(pos_bonds[bond][2]) + bonds_f.append(rv*nc[i]*nc[j]/r**3) - return fnm, bonds, forces + return fnm, bonds, bonds_i, bonds_k, bonds_f def adjacency_matrix(fnm, -- cgit v1.2.3-70-g09d2 From 96df3d1b5e1de08d543991172f1fb1845b073018 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Sat, 29 Feb 2020 07:40:56 -0700 Subject: Fix typo --- ml_exp/compound.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ml_exp/compound.py b/ml_exp/compound.py index 26e393510..23c66ec27 100644 --- a/ml_exp/compound.py +++ b/ml_exp/compound.py @@ -111,13 +111,13 @@ class Compound: size: compund size. bohr_ru: if radius units should be in bohr's radius units. """ - hp = get_helping_data(self.coordinates, + hd = get_helping_data(self.coordinates, self.nc, self.atoms, size=size, bohr_ru=bohr_ru) - self.fnm, self.bonds, self.bonds_i, self.bonds_k, self.bonds_f = hp + self.fnm, self.bonds, self.bonds_i, self.bonds_k, self.bonds_f = hd def gen_am(self, use_forces=False, -- cgit v1.2.3-70-g09d2 From e289a2de7a461a1194a47350e5339dce082972e7 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Sat, 29 Feb 2020 07:42:25 -0700 Subject: Fix typo --- ml_exp/__init__.py | 4 ++-- ml_exp/compound.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ml_exp/__init__.py b/ml_exp/__init__.py index 9e64d7754..4d672efd7 100644 --- a/ml_exp/__init__.py +++ b/ml_exp/__init__.py @@ -22,7 +22,7 @@ SOFTWARE. """ from ml_exp.compound import Compound from ml_exp.representations import coulomb_matrix, lennard_jones_matrix,\ - get_helping_data, adjacency_matrix, check_bond, bag_of_stuff + get_helping_data, adjacency_matrix, check_bond, bag_of_bonds from ml_exp.math import cholesky_solve from ml_exp.qm7db import qm7db from ml_exp.do_ml import simple_ml, do_ml @@ -33,7 +33,7 @@ __all__ = ['Compound', 'get_helping_data', 'adjacency_matrix', 'check_bond', - 'bag_of_stuff', + 'bag_of_bonds', 'cholesky_solve', 'qm7db', 'simple_ml', diff --git a/ml_exp/compound.py b/ml_exp/compound.py index 23c66ec27..ec5085797 100644 --- a/ml_exp/compound.py +++ b/ml_exp/compound.py @@ -23,7 +23,7 @@ SOFTWARE. import numpy as np from ml_exp.data import NUCLEAR_CHARGE from ml_exp.representations import coulomb_matrix, lennard_jones_matrix,\ - get_helping_data, adjacency_matrix, bag_of_stuff + get_helping_data, adjacency_matrix, bag_of_bonds class Compound: @@ -139,7 +139,7 @@ class Compound: Generate the Bag of Stuff for the compound. size: compound size. """ - self.bob = bag_of_stuff(self.cm, + self.bob = bag_of_bonds(self.cm, self.atoms, size=size) -- cgit v1.2.3-70-g09d2 From 759b48a965470ac18910951b2cb0599d371d1808 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Sat, 29 Feb 2020 07:46:27 -0700 Subject: bos to bob --- ml_exp/do_ml.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/ml_exp/do_ml.py b/ml_exp/do_ml.py index d8ee415bf..3cb67675f 100644 --- a/ml_exp/do_ml.py +++ b/ml_exp/do_ml.py @@ -168,9 +168,8 @@ def do_ml(db_path='data', compound.gen_am(use_forces=use_forces, size=size, bohr_ru=bohr_ru) - if 'BOS' in identifiers: - compound.gen_bos(size=size, - stuff=stuff) + if 'BOB' in identifiers: + compound.gen_bob(size=size) # Create a numpy array for the descriptors. if 'CM' in identifiers: @@ -179,8 +178,8 @@ def do_ml(db_path='data', ljm_data = np.array([comp.ljm for comp in compounds], dtype=float) if 'AM' in identifiers: am_data = np.array([comp.cm for comp in compounds], dtype=float) - if 'BOS' in identifiers: - bos_data = np.array([comp.bos for comp in compounds], dtype=float) + if 'BOB' in identifiers: + bob_data = np.array([comp.bob for comp in compounds], dtype=float) toc = time.perf_counter() tictoc = toc - tic @@ -212,8 +211,8 @@ def do_ml(db_path='data', sigma=sigma, identifier='CM', show_msgs=show_msgs) - if 'BOS' in identifiers: - bos_mae, bos_tictoc = simple_ml(bos_data, + if 'BOB' in identifiers: + bob_mae, bob_tictoc = simple_ml(bob_data, energy_pbe0, training_size=training_size, test_size=test_size, -- cgit v1.2.3-70-g09d2 From da6b1428cda40534b820642d8e9be398d7fee8be Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Sat, 29 Feb 2020 09:06:54 -0700 Subject: Rewrite kernel, gotta fix adj mat --- ml_exp/compound.py | 1 - ml_exp/do_ml.py | 29 +++++++++++++++++++++++------ ml_exp/kernels.py | 25 ++++++++++++++++++------- 3 files changed, 41 insertions(+), 14 deletions(-) diff --git a/ml_exp/compound.py b/ml_exp/compound.py index ec5085797..f603ddd4e 100644 --- a/ml_exp/compound.py +++ b/ml_exp/compound.py @@ -126,7 +126,6 @@ class Compound: Generate the Adjacency Matrix for the compund. use_forces: if the use of forces instead of k_cx should be used. size: compound size. - bohr_ru: if radius units should be in bohr's radius units. """ self.am = adjacency_matrix(self.fnm, self.bonds, diff --git a/ml_exp/do_ml.py b/ml_exp/do_ml.py index 3cb67675f..758bf01ae 100644 --- a/ml_exp/do_ml.py +++ b/ml_exp/do_ml.py @@ -33,6 +33,7 @@ def simple_ml(descriptors, training_size, test_size=None, sigma=1000.0, + opt=True, identifier=None, show_msgs=True): """ @@ -43,6 +44,7 @@ def simple_ml(descriptors, test_size: size of the test set to use. If no size is given, the last remaining molecules are used. sigma: depth of the kernel. + opt: if the optimized algorithm should be used. For benchmarking purposes. identifier: string with the name of the descriptor used. show_msgs: if debug messages should be shown. NOTE: identifier is just a string and is only for identification purposes. @@ -76,13 +78,21 @@ def simple_ml(descriptors, X_training = descriptors[:training_size] Y_training = energies[:training_size] - K_training = gaussian_kernel(X_training, X_training, sigma) - alpha = cholesky_solve(K_training, Y_training) + K_training = gaussian_kernel(X_training, + X_training, + sigma, + opt=opt) + alpha = cholesky_solve(K_training, + Y_training) X_test = descriptors[-test_size:] Y_test = energies[-test_size:] - K_test = gaussian_kernel(X_test, X_training, sigma) - Y_predicted = np.dot(K_test, alpha) + K_test = gaussian_kernel(X_test, + X_training, + sigma, + opt=opt) + Y_predicted = np.dot(K_test, + alpha) mae = np.mean(np.abs(Y_predicted - Y_test)) if show_msgs: @@ -113,6 +123,7 @@ def do_ml(db_path='data', training_size=1500, test_size=None, sigma=1000.0, + opt=True, identifiers=["CM"], show_msgs=True): """ @@ -132,6 +143,7 @@ def do_ml(db_path='data', test_size: size of the test set to use. If no size is given, the last remaining molecules are used. sigma: depth of the kernel. + opt: if the optimized algorithm should be used. For benchmarking purposes. identifiers: list of names (strings) of descriptors to use. show_msgs: if debug messages should be shown. """ @@ -165,9 +177,10 @@ def do_ml(db_path='data', as_eig=as_eig, bohr_ru=bohr_ru) if 'AM' in identifiers: - compound.gen_am(use_forces=use_forces, - size=size, + compound.gen_hd(size=size, bohr_ru=bohr_ru) + compound.gen_am(use_forces=use_forces, + size=size) if 'BOB' in identifiers: compound.gen_bob(size=size) @@ -193,6 +206,7 @@ def do_ml(db_path='data', training_size=training_size, test_size=test_size, sigma=sigma, + opt=opt, identifier='CM', show_msgs=show_msgs) if 'LJM' in identifiers: @@ -201,6 +215,7 @@ def do_ml(db_path='data', training_size=training_size, test_size=test_size, sigma=sigma, + opt=opt, identifier='LJM', show_msgs=show_msgs) if 'AM' in identifiers: @@ -209,6 +224,7 @@ def do_ml(db_path='data', training_size=training_size, test_size=test_size, sigma=sigma, + opt=opt, identifier='CM', show_msgs=show_msgs) if 'BOB' in identifiers: @@ -217,6 +233,7 @@ def do_ml(db_path='data', training_size=training_size, test_size=test_size, sigma=sigma, + opt=opt, identifier='CM', show_msgs=show_msgs) diff --git a/ml_exp/kernels.py b/ml_exp/kernels.py index 3914ffc20..e6855f97e 100644 --- a/ml_exp/kernels.py +++ b/ml_exp/kernels.py @@ -26,20 +26,31 @@ import numpy as np def gaussian_kernel(X1, X2, - sigma): + sigma, + opt=True): """ Calculates the Gaussian Kernel. X1: first representations. X2: second representations. sigma: kernel width. + opt: if the optimized algorithm should be used. For benchmarking purposes. """ - inv_sigma = -0.5 / (sigma*sigma) + i_sigma = -0.5 / (sigma*sigma) K = np.zeros((X1.shape[0], X2.shape[0]), dtype=float) - for i, x1 in enumerate(X1): - for j, x2 in enumerate(X2): - f_norm = np.linalg.norm(x1 - x2) - # print(f_norm) - K[i, j] = math.exp(inv_sigma * f_norm) + if opt: + # Faster way of calculating the kernel. + for i, x1 in enumerate(X1): + if X2.ndim == 3: + norm = np.linalg.norm(X2 - x1, axis=(1, 2)) + else: + norm = np.linalg.norm(X2 - x1, axis=-1) + K[i, :] = np.exp(i_sigma * np.square(norm)) + else: + for i, x1 in enumerate(X1): + for j, x2 in enumerate(X2): + f_norm = np.linalg.norm(x2 - x1) + # print(f_norm) + K[i, j] = math.exp(i_sigma * f_norm**2) return K -- cgit v1.2.3-70-g09d2 From 2db295947a75ab0347ba0e11b98564da9d228c6a Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Sat, 29 Feb 2020 09:08:22 -0700 Subject: Temp fix for adj mat --- ml_exp/representations.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ml_exp/representations.py b/ml_exp/representations.py index 5e684314b..5e99627df 100644 --- a/ml_exp/representations.py +++ b/ml_exp/representations.py @@ -242,12 +242,14 @@ size. Arrays are not of the right shape.') def adjacency_matrix(fnm, bonds, forces, + use_forces=False, size=22): """ Calculates the adjacency matrix given the bond list. fnm: first neighbour matrix. bonds: list of bonds (tuple of indexes). forces: list of forces. + use_forces: if the use of forces instead of k_cx should be used. size: compund size. """ n = len(bonds) @@ -264,7 +266,7 @@ def adjacency_matrix(fnm, # Ignore the diagonal. if i != j: if (bond_i[0] in bond_j) or (bond_i[1] in bond_j): - if forces: + if use_forces: am[i, j] = np.dot(forces[i], forces[j]) else: am[i, j] = fnm[bond_i[0], bond_i[1]] -- cgit v1.2.3-70-g09d2 From cf6c2d1bdb4bba9d4e3db035d26efd874b5ed230 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Sat, 29 Feb 2020 09:13:10 -0700 Subject: Forgot one save --- ml_exp/compound.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ml_exp/compound.py b/ml_exp/compound.py index f603ddd4e..e2b7ab7fd 100644 --- a/ml_exp/compound.py +++ b/ml_exp/compound.py @@ -130,6 +130,7 @@ class Compound: self.am = adjacency_matrix(self.fnm, self.bonds, self.bonds_f, + use_forces=use_forces, size=size) def gen_bob(self, -- cgit v1.2.3-70-g09d2 From 1384a40fb90f9118410d5473634ddbff8b628841 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Sat, 29 Feb 2020 09:22:57 -0700 Subject: Change " to \ --- ml_exp/do_ml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ml_exp/do_ml.py b/ml_exp/do_ml.py index 758bf01ae..db50cc8fa 100644 --- a/ml_exp/do_ml.py +++ b/ml_exp/do_ml.py @@ -124,7 +124,7 @@ def do_ml(db_path='data', test_size=None, sigma=1000.0, opt=True, - identifiers=["CM"], + identifiers=['CM'], show_msgs=True): """ Main function that does the whole ML process. -- cgit v1.2.3-70-g09d2 From 9e90b73721edf370f7be9c77aba8431bbe762bdb Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Sat, 29 Feb 2020 11:31:40 -0700 Subject: float to np float) --- ml_exp/kernels.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ml_exp/kernels.py b/ml_exp/kernels.py index e6855f97e..feaf9a990 100644 --- a/ml_exp/kernels.py +++ b/ml_exp/kernels.py @@ -37,9 +37,9 @@ def gaussian_kernel(X1, """ i_sigma = -0.5 / (sigma*sigma) - K = np.zeros((X1.shape[0], X2.shape[0]), dtype=float) + K = np.zeros((X1.shape[0], X2.shape[0]), dtype=np.float64) if opt: - # Faster way of calculating the kernel. + # Faster way of calculating the kernel (no numba support). for i, x1 in enumerate(X1): if X2.ndim == 3: norm = np.linalg.norm(X2 - x1, axis=(1, 2)) @@ -50,7 +50,6 @@ def gaussian_kernel(X1, for i, x1 in enumerate(X1): for j, x2 in enumerate(X2): f_norm = np.linalg.norm(x2 - x1) - # print(f_norm) K[i, j] = math.exp(i_sigma * f_norm**2) return K -- cgit v1.2.3-70-g09d2 From 53c84552df5c7a7e1f084b0aec8eac63f7813e52 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Sat, 29 Feb 2020 11:45:17 -0700 Subject: Change dtypes to np dtypes --- ml_exp/compound.py | 8 ++++---- ml_exp/do_ml.py | 8 ++++---- ml_exp/math.py | 4 ++-- ml_exp/qm7db.py | 8 ++++---- ml_exp/representations.py | 10 +++++----- 5 files changed, 19 insertions(+), 19 deletions(-) diff --git a/ml_exp/compound.py b/ml_exp/compound.py index e2b7ab7fd..4428e9164 100644 --- a/ml_exp/compound.py +++ b/ml_exp/compound.py @@ -153,15 +153,15 @@ class Compound: lines = f.readlines() self.name = filename.split('/')[-1] - self.n = int(lines[0]) + self.n = np.int32(lines[0]) self.extra = lines[1] self.atoms = [] - self.atoms_nc = np.empty(self.n, dtype=int) - self.coordinates = np.empty((self.n, 3), dtype=float) + self.atoms_nc = np.empty(self.n, dtype=np.int64) + self.coordinates = np.empty((self.n, 3), dtype=np.float64) for i, atom in enumerate(lines[2:self.n + 2]): atom_d = atom.split() self.atoms.append(atom_d[0]) self.atoms_nc[i] = NUCLEAR_CHARGE[atom_d[0]] - self.coordinates[i] = np.asarray(atom_d[1:4], dtype=float) + self.coordinates[i] = np.asarray(atom_d[1:4], dtype=np.float64) diff --git a/ml_exp/do_ml.py b/ml_exp/do_ml.py index db50cc8fa..04f8abe87 100644 --- a/ml_exp/do_ml.py +++ b/ml_exp/do_ml.py @@ -186,13 +186,13 @@ def do_ml(db_path='data', # Create a numpy array for the descriptors. if 'CM' in identifiers: - cm_data = np.array([comp.cm for comp in compounds], dtype=float) + cm_data = np.array([comp.cm for comp in compounds], dtype=np.float64) if 'LJM' in identifiers: - ljm_data = np.array([comp.ljm for comp in compounds], dtype=float) + ljm_data = np.array([comp.ljm for comp in compounds], dtype=np.float64) if 'AM' in identifiers: - am_data = np.array([comp.cm for comp in compounds], dtype=float) + am_data = np.array([comp.cm for comp in compounds], dtype=np.float64) if 'BOB' in identifiers: - bob_data = np.array([comp.bob for comp in compounds], dtype=float) + bob_data = np.array([comp.bob for comp in compounds], dtype=np.float64) toc = time.perf_counter() tictoc = toc - tic diff --git a/ml_exp/math.py b/ml_exp/math.py index da1fdf08f..253abe5d2 100644 --- a/ml_exp/math.py +++ b/ml_exp/math.py @@ -39,7 +39,7 @@ def cholesky_solve(K, size = K.shape[0] # Solve Lx=y for x. - x = np.zeros(size, dtype=float) + x = np.zeros(size, dtype=np.float64) x[0] = y[0] / L[0, 0] for i in range(1, size): temp_sum = 0.0 @@ -49,7 +49,7 @@ def cholesky_solve(K, # Now, solve LTa=x for a. L2 = L.T - a = np.zeros(size, dtype=float) + a = np.zeros(size, dtype=np.float64) a[size - 1] = x[size - 1] / L2[size - 1, size - 1] for i in range(0, size - 1)[::-1]: temp_sum = 0.0 diff --git a/ml_exp/qm7db.py b/ml_exp/qm7db.py index 3fd46b6bc..3ba2c5814 100644 --- a/ml_exp/qm7db.py +++ b/ml_exp/qm7db.py @@ -42,14 +42,14 @@ def qm7db(db_path='data', for i, line in enumerate(lines): line = line.split() compounds.append(Compound(f'{db_path}/{line[0]}')) - compounds[i].pbe0 = float(line[1]) - compounds[i].delta = float(line[1]) - float(line[2]) + compounds[i].pbe0 = np.float64(line[1]) + compounds[i].delta = np.float64(line[1]) - np.float64(line[2]) if is_shuffled: random.seed(r_seed) random.shuffle(compounds) - e_pbe0 = np.array([compound.pbe0 for compound in compounds], dtype=float) - e_delta = np.array([compound.delta for compound in compounds], dtype=float) + e_pbe0 = np.array([comp.pbe0 for comp in compounds], dtype=np.float64) + e_delta = np.array([comp.delta for comp in compounds], dtype=np.float64) return compounds, e_pbe0, e_delta diff --git a/ml_exp/representations.py b/ml_exp/representations.py index 5e99627df..31259c79c 100644 --- a/ml_exp/representations.py +++ b/ml_exp/representations.py @@ -53,7 +53,7 @@ size. Arrays are not of the right shape.') 'instead of (size).') size = n - cm = np.zeros((size, size), dtype=float) + cm = np.zeros((size, size), dtype=np.float64) # Actual calculation of the coulomb matrix. for i, xyz_i in enumerate(coords): @@ -120,7 +120,7 @@ size. Arrays are not of the right shape.') 'instead of (size).') size = n - lj = np.zeros((size, size), dtype=float) + lj = np.zeros((size, size), dtype=np.float64) # Actual calculation of the lennard-jones matrix. for i, xyz_i in enumerate(coords): @@ -259,7 +259,7 @@ def adjacency_matrix(fnm, instead of (size).') size = n - am = np.zeros((size, size), dtype=float) + am = np.zeros((size, size), dtype=np.float64) for i, bond_i in enumerate(bonds): for j, bond_j in enumerate(bonds): @@ -317,7 +317,7 @@ generated as the vector of eigenvalues, try (re-)generating the CM.') size = n # Bond max length, calculated using only the upper triangular matrix. - bond_size = int((size * size - size)/2 + size) + bond_size = np.int32((size * size - size)/2 + size) # List where each bag data is stored. bags = [] @@ -350,7 +350,7 @@ generated as the vector of eigenvalues, try (re-)generating the CM.') bonds = atom_list + bonds # Create the final vector for the bob. - bob = np.zeros(bond_size, dtype=float) + bob = np.zeros(bond_size, dtype=np.float64) c_i = 0 for i, bond in enumerate(bonds): checker = check_bond(bags, bond) -- cgit v1.2.3-70-g09d2 From 4a3f602aab73bb0304a237ddca5d9f3fa8867001 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Sat, 29 Feb 2020 14:07:24 -0700 Subject: Fix bugs --- ml_exp/do_ml.py | 4 ++-- ml_exp/representations.py | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/ml_exp/do_ml.py b/ml_exp/do_ml.py index 04f8abe87..5054b2b06 100644 --- a/ml_exp/do_ml.py +++ b/ml_exp/do_ml.py @@ -225,7 +225,7 @@ def do_ml(db_path='data', test_size=test_size, sigma=sigma, opt=opt, - identifier='CM', + identifier='AM', show_msgs=show_msgs) if 'BOB' in identifiers: bob_mae, bob_tictoc = simple_ml(bob_data, @@ -234,7 +234,7 @@ def do_ml(db_path='data', test_size=test_size, sigma=sigma, opt=opt, - identifier='CM', + identifier='BOB', show_msgs=show_msgs) # End of program diff --git a/ml_exp/representations.py b/ml_exp/representations.py index 31259c79c..6b5d26bcb 100644 --- a/ml_exp/representations.py +++ b/ml_exp/representations.py @@ -358,10 +358,12 @@ generated as the vector of eigenvalues, try (re-)generating the CM.') for j, num in enumerate(sorted(bags[checker[1]][1:])[::-1]): # Use c_i as the index for bob if the zero padding should # be at the end of the vector instead of between each bond. - bob[i*size + j] = num + # bob[i*size + j] = num + bob[c_i] = num c_i += 1 else: print(f'Error. Bond {bond} from bond list coudn\'t be found', 'in the bags list. This could be a case where the atom', 'is only present oncce in the molecule.') + return bob -- cgit v1.2.3-70-g09d2 From 62cef5f1782caf4ff379cd021744b98be4be6c8b Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Mon, 2 Mar 2020 12:01:58 -0700 Subject: Fix bug --- ml_exp/compound.py | 16 ++++++++-------- ml_exp/do_ml.py | 6 ++++++ ml_exp/representations.py | 44 ++++++++++++++++++++++++-------------------- 3 files changed, 38 insertions(+), 28 deletions(-) diff --git a/ml_exp/compound.py b/ml_exp/compound.py index 4428e9164..6c8525195 100644 --- a/ml_exp/compound.py +++ b/ml_exp/compound.py @@ -38,7 +38,7 @@ class Compound: self.n = None self.extra = None self.atoms = None - self.atoms_nc = None + self.nc = None self.coordinates = None self.pbe0 = None self.delta = None @@ -73,7 +73,7 @@ class Compound: bohr_ru: if radius units should be in bohr's radius units. """ self.cm = coulomb_matrix(self.coordinates, - self.atoms_nc, + self.nc, size=size, as_eig=as_eig, bohr_ru=bohr_ru) @@ -95,7 +95,7 @@ class Compound: bohr_ru: if radius units should be in bohr's radius units. """ self.ljm = lennard_jones_matrix(self.coordinates, - self.atoms_nc, + self.nc, diag_value=diag_value, sigma=sigma, epsilon=epsilon, @@ -112,8 +112,8 @@ class Compound: bohr_ru: if radius units should be in bohr's radius units. """ hd = get_helping_data(self.coordinates, - self.nc, self.atoms, + self.nc, size=size, bohr_ru=bohr_ru) @@ -127,8 +127,8 @@ class Compound: use_forces: if the use of forces instead of k_cx should be used. size: compound size. """ - self.am = adjacency_matrix(self.fnm, - self.bonds, + self.am = adjacency_matrix(self.bonds_i, + self.bonds_k, self.bonds_f, use_forces=use_forces, size=size) @@ -156,12 +156,12 @@ class Compound: self.n = np.int32(lines[0]) self.extra = lines[1] self.atoms = [] - self.atoms_nc = np.empty(self.n, dtype=np.int64) + self.nc = np.empty(self.n, dtype=np.int64) self.coordinates = np.empty((self.n, 3), dtype=np.float64) for i, atom in enumerate(lines[2:self.n + 2]): atom_d = atom.split() self.atoms.append(atom_d[0]) - self.atoms_nc[i] = NUCLEAR_CHARGE[atom_d[0]] + self.nc[i] = NUCLEAR_CHARGE[atom_d[0]] self.coordinates[i] = np.asarray(atom_d[1:4], dtype=np.float64) diff --git a/ml_exp/do_ml.py b/ml_exp/do_ml.py index 5054b2b06..5efd13690 100644 --- a/ml_exp/do_ml.py +++ b/ml_exp/do_ml.py @@ -176,11 +176,13 @@ def do_ml(db_path='data', size=size, as_eig=as_eig, bohr_ru=bohr_ru) + """ if 'AM' in identifiers: compound.gen_hd(size=size, bohr_ru=bohr_ru) compound.gen_am(use_forces=use_forces, size=size) + """ if 'BOB' in identifiers: compound.gen_bob(size=size) @@ -189,8 +191,10 @@ def do_ml(db_path='data', cm_data = np.array([comp.cm for comp in compounds], dtype=np.float64) if 'LJM' in identifiers: ljm_data = np.array([comp.ljm for comp in compounds], dtype=np.float64) + """ if 'AM' in identifiers: am_data = np.array([comp.cm for comp in compounds], dtype=np.float64) + """ if 'BOB' in identifiers: bob_data = np.array([comp.bob for comp in compounds], dtype=np.float64) @@ -218,6 +222,7 @@ def do_ml(db_path='data', opt=opt, identifier='LJM', show_msgs=show_msgs) + """ if 'AM' in identifiers: am_mae, am_tictoc = simple_ml(am_data, energy_pbe0, @@ -227,6 +232,7 @@ def do_ml(db_path='data', opt=opt, identifier='AM', show_msgs=show_msgs) + """ if 'BOB' in identifiers: bob_mae, bob_tictoc = simple_ml(bob_data, energy_pbe0, diff --git a/ml_exp/representations.py b/ml_exp/representations.py index 6b5d26bcb..be567d67a 100644 --- a/ml_exp/representations.py +++ b/ml_exp/representations.py @@ -167,15 +167,15 @@ size. Arrays are not of the right shape.') def get_helping_data(coords, - nc, atoms, + nc, size=23, bohr_ru=False): """ Creates helping data such as the First Neighbor Matrix for the compound. coords: compound coordinates. - nc: nuclear charge data. atoms: list of atoms. + nc: nuclear charge data. size: compund size. bohr_ru: if radius units should be in bohr's radius units. NOTE: Bond distance of carbon to other elements @@ -203,11 +203,11 @@ size. Arrays are not of the right shape.') size = n # Possible bonds. - cc_bond = sorted(['C', 'C']) - ch_bond = sorted(['C', 'H']) - co_bond = sorted(['C', 'O']) - cn_bond = sorted(['C', 'N']) - cs_bond = sorted(['C', 'S']) + cc_bond = ''.join(sorted(['C', 'C'])) + ch_bond = ''.join(sorted(['C', 'H'])) + co_bond = ''.join(sorted(['C', 'O'])) + cn_bond = ''.join(sorted(['C', 'N'])) + cs_bond = ''.join(sorted(['C', 'S'])) pos_bonds = {cc_bond: (1.19, 1.54, 1.0), ch_bond: (1.06, 1.12, 1.0), co_bond: (1.43, 2.15, 0.8), cn_bond: (1.47, 2.19, 1.0), cs_bond: (1.81, 2.55, 0.7)} @@ -221,7 +221,7 @@ size. Arrays are not of the right shape.') for j, xyz_j in enumerate(coords): # Ignore the diagonal. if i != j: - bond = sorted([atoms[i], atoms[j]]) + bond = ''.join(sorted([atoms[i], atoms[j]])) if bond in pos_bonds.keys(): r_min = pos_bonds[bond][0] r_max = pos_bonds[bond][1] @@ -239,21 +239,25 @@ size. Arrays are not of the right shape.') return fnm, bonds, bonds_i, bonds_k, bonds_f -def adjacency_matrix(fnm, - bonds, - forces, +def adjacency_matrix(bonds_i, + bonds_k, + bonds_f, use_forces=False, - size=22): + size=23): """ Calculates the adjacency matrix given the bond list. - fnm: first neighbour matrix. - bonds: list of bonds (tuple of indexes). - forces: list of forces. + bonds: list of bond names. + bonds_i: list of bond indexes (tuple of indexes). + bonds_k: list of k_cx values. + bonds_f: list of force values. use_forces: if the use of forces instead of k_cx should be used. size: compund size. """ - n = len(bonds) + if bonds_i is None: + raise ValueError('The helping data hasn\'t been initialized for\ +the current compound.') + n = len(bonds_i) if size < n: print('Error. Compound size (n) is greater han (size). Using (n)\ instead of (size).') @@ -261,15 +265,15 @@ def adjacency_matrix(fnm, am = np.zeros((size, size), dtype=np.float64) - for i, bond_i in enumerate(bonds): - for j, bond_j in enumerate(bonds): + for i, bond_i in enumerate(bonds_i): + for j, bond_j in enumerate(bonds_i): # Ignore the diagonal. if i != j: if (bond_i[0] in bond_j) or (bond_i[1] in bond_j): if use_forces: - am[i, j] = np.dot(forces[i], forces[j]) + am[i, j] = np.dot(bonds_f[i], bonds_f[j]) else: - am[i, j] = fnm[bond_i[0], bond_i[1]] + am[i, j] = bonds_k[i] return am -- cgit v1.2.3-70-g09d2