From 0c91ee8b6c968457cff1d968cf69add73902bd11 Mon Sep 17 00:00:00 2001 From: David Luevano <55825613+luevano@users.noreply.github.com> Date: Thu, 20 Feb 2020 19:02:23 -0700 Subject: Clean main --- ml_exp/__main__.py | 51 ++++++--------------------------------------------- 1 file changed, 6 insertions(+), 45 deletions(-) diff --git a/ml_exp/__main__.py b/ml_exp/__main__.py index aaeb18714..83abb918e 100644 --- a/ml_exp/__main__.py +++ b/ml_exp/__main__.py @@ -22,50 +22,11 @@ SOFTWARE. """ # from ml_exp.do_ml import do_ml # from ml_exp.misc import plot_benchmarks -from ml_exp.read_qm7_data import read_qm7_data -from ml_exp.adj_matrix import fneig_matrix, adj_matrix -from ml_exp.c_matrix import c_matrix -from ml_exp.bob import bob -from ml_exp.bostuff import bok_cx +# from ml_exp.read_qm7_data import read_qm7_data +# from ml_exp.adj_matrix import fneig_matrix, adj_matrix +# from ml_exp.c_matrix import c_matrix +# from ml_exp.bob import bob +# from ml_exp.bostuff import bok_cx if __name__ == '__main__': - """ - do_ml(min_training_size=1500, - max_training_size=2000, - training_increment_size=500, - test_size=None, - ljm_diag_value=None, - ljm_sigma=1.0, - ljm_epsilon=1.0, - r_seed=111, - save_benchmarks=False, - show_msgs=True) - """ - # plot_benchmarks() - """ - xyz, nc, pbe0, delta, atoms = read_qm7_data(return_atoms=True) - for i in range(1): - fnm, bonds, forces = fneig_matrix(atoms[i], xyz[i], nc[i], True) - am = adj_matrix(bonds, forces) - - print(f'{i} first neighbor matrix\n{fnm}') - print(f'{i} bond list\n{bonds}') - print(f'{i} force list\n{forces}') - print(f'{i} adjacency matrix\n{am}') - print('-'*30) - print('OK!') - """ - xyz, nc, pbe0, delta, atoms = read_qm7_data(return_atoms=True) - for i in range(1): - cm = c_matrix(xyz[i], nc[i], as_eig=False) - bob_i = bob(cm, atoms[i]) - - print(f'{i} coulomb matrix\n{cm}') - print(f'{i} bag of bonds\n{bob}') - - fnm, bonds = fneig_matrix(atoms[i], xyz[i], nc[i], False) - am = adj_matrix(fnm, bonds, None) - bok_cx_i = bok_cx(am, atoms[i]) - - print(f'{i} adjacency matrix\n{am}') - print(f'{i} bag of k_cx\n{bok_cx_i}') + print('Ok!') -- cgit v1.2.3-70-g09d2 From 7a83a6e48a7503848bf1a4abd448f7858dca71e0 Mon Sep 17 00:00:00 2001 From: David Luevano <55825613+luevano@users.noreply.github.com> Date: Thu, 20 Feb 2020 19:54:46 -0700 Subject: Add basic compound object --- ml_exp/__main__.py | 8 -------- ml_exp/compound.py | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+), 8 deletions(-) create mode 100644 ml_exp/compound.py diff --git a/ml_exp/__main__.py b/ml_exp/__main__.py index 83abb918e..3cb515d85 100644 --- a/ml_exp/__main__.py +++ b/ml_exp/__main__.py @@ -20,13 +20,5 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -# from ml_exp.do_ml import do_ml -# from ml_exp.misc import plot_benchmarks -# from ml_exp.read_qm7_data import read_qm7_data -# from ml_exp.adj_matrix import fneig_matrix, adj_matrix -# from ml_exp.c_matrix import c_matrix -# from ml_exp.bob import bob -# from ml_exp.bostuff import bok_cx - if __name__ == '__main__': print('Ok!') diff --git a/ml_exp/compound.py b/ml_exp/compound.py new file mode 100644 index 000000000..6fd46fc3a --- /dev/null +++ b/ml_exp/compound.py @@ -0,0 +1,51 @@ +"""MIT License + +Copyright (c) 2019 David Luevano Alvarado + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +import numpy as np + + +class Compound: + def __init__(self, xyz=None): + self.n = None + self.atoms = None + self.coordinates = None + + if xyz is not None: + self.read_xyz(xyz) + + def read_xyz(self, filename): + """ + Reads an xyz file and adds the corresponding data to the Compound. + filename: (path to) the xyz file. + """ + with open(filename, 'r') as f: + lines = f.readlines() + + self.n = int(lines[0]) + self.atoms = [] + self.coordinates = np.empty((self.n, 3), dtype=float) + + for i, atom in enumerate(lines[2:self.n + 2]): + atom_d = atom.split() + + self.atoms.append(atom_d[0]) + self.coordinates[i] = np.asarray(atom_d[1:4], dtype=float) -- cgit v1.2.3-70-g09d2 From 96eade3b9f72c5d1fa5bae27ba82a2cd23e1e200 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Fri, 21 Feb 2020 17:16:22 -0700 Subject: Add nuclear charge --- ml_exp/compound.py | 11 ++++ ml_exp/data.py | 141 ++++++++++++++++++++++++++++++++++++++++++++++++ ml_exp/read_qm7_data.py | 11 ++++ 3 files changed, 163 insertions(+) create mode 100644 ml_exp/data.py diff --git a/ml_exp/compound.py b/ml_exp/compound.py index 6fd46fc3a..cfa3e0322 100644 --- a/ml_exp/compound.py +++ b/ml_exp/compound.py @@ -21,13 +21,22 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import numpy as np +from ml_exp.data import NUCLEAR_CHARGE class Compound: def __init__(self, xyz=None): + # empty_array = np.asarray([], dtype=float) + self.n = None self.atoms = None + self.atoms_nc = None self.coordinates = None + self.energy = None + + self.coulomb_matrix = None + self.lennard_jones_matrix = None + self.bob = None if xyz is not None: self.read_xyz(xyz) @@ -42,10 +51,12 @@ class Compound: self.n = int(lines[0]) self.atoms = [] + self.atoms_nc = np.empty(self.n, dtype=int) self.coordinates = np.empty((self.n, 3), dtype=float) for i, atom in enumerate(lines[2:self.n + 2]): atom_d = atom.split() self.atoms.append(atom_d[0]) + self.atoms_nc[i] = NUCLEAR_CHARGE[atom_d[0]] self.coordinates[i] = np.asarray(atom_d[1:4], dtype=float) diff --git a/ml_exp/data.py b/ml_exp/data.py new file mode 100644 index 000000000..243aff168 --- /dev/null +++ b/ml_exp/data.py @@ -0,0 +1,141 @@ +"""MIT License + +Copyright (c) 2019 David Luevano Alvarado + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +NUCLEAR_CHARGE = { + 'H': 1, + 'He': 2, + 'Li': 3, + 'Be': 4, + 'B': 5, + 'C': 6, + 'N': 7, + 'O': 8, + 'F': 9, + 'Ne': 10, + 'Na': 11, + 'Mg': 12, + 'Al': 13, + 'Si': 14, + 'P': 15, + 'S': 16, + 'Cl': 17, + 'Ar': 18, + 'K': 19, + 'Ca': 20, + 'Sc': 21, + 'Ti': 22, + 'V': 23, + 'Cr': 24, + 'Mn': 25, + 'Fe': 26, + 'Co': 27, + 'Ni': 28, + 'Cu': 29, + 'Zn': 30, + 'Ga': 31, + 'Ge': 32, + 'As': 33, + 'Se': 34, + 'Br': 35, + 'Kr': 36, + 'Rb': 37, + 'Sr': 38, + 'Y': 39, + 'Zr': 40, + 'Nb': 41, + 'Mo': 42, + 'Tc': 43, + 'Ru': 44, + 'Rh': 45, + 'Pd': 46, + 'Ag': 47, + 'Cd': 48, + 'In': 49, + 'Sn': 50, + 'Sb': 51, + 'Te': 52, + 'I': 53, + 'Xe': 54, + 'Cs': 55, + 'Ba': 56, + 'La': 57, + 'Ce': 58, + 'Pr': 59, + 'Nd': 60, + 'Pm': 61, + 'Sm': 62, + 'Eu': 63, + 'Gd': 64, + 'Tb': 65, + 'Dy': 66, + 'Ho': 67, + 'Er': 68, + 'Tm': 69, + 'Yb': 70, + 'Lu': 71, + 'Hf': 72, + 'Ta': 73, + 'W': 74, + 'Re': 75, + 'Os': 76, + 'Ir': 77, + 'Pt': 78, + 'Au': 79, + 'Hg': 80, + 'Tl': 81, + 'Pb': 82, + 'Bi': 83, + 'Po': 84, + 'At': 85, + 'Rn': 86, + 'Fr': 87, + 'Ra': 88, + 'Ac': 89, + 'Th': 90, + 'Pa': 91, + 'U': 92, + 'Np': 93, + 'Pu': 94, + 'Am': 95, + 'Cm': 96, + 'Bk': 97, + 'Cf': 98, + 'Es': 99, + 'Fm': 100, + 'Md': 101, + 'No': 102, + 'Lr': 103, + 'Rf': 104, + 'Db': 105, + 'Sg': 106, + 'Bh': 107, + 'Hs': 108, + 'Mt': 109, + 'Ds ': 110, + 'Rg ': 111, + 'Cn ': 112, + 'Nh': 113, + 'Fl': 114, + 'Mc': 115, + 'Lv': 116, + 'Ts': 117, + 'Og': 118} diff --git a/ml_exp/read_qm7_data.py b/ml_exp/read_qm7_data.py index 81be05a17..63c48195c 100644 --- a/ml_exp/read_qm7_data.py +++ b/ml_exp/read_qm7_data.py @@ -49,6 +49,17 @@ def read_nc_data(data_path): return {line[2]: int(line[0]) for line in lines} +# For use in the rewriting of the code (from functional programming to object +# oriented programming-ish) +def print_nc(nc_data): + """ + Prints the nuclear charge data to the terminal. + nc_data: dict of nc. + """ + for key in nc_data.keys(): + print(f'\'{key}\': {nc_data[key]},') + + # 'hof_qm7.txt.txt' retrieved from # https://github.com/qmlcode/tutorial def read_db_data(zi_data, -- cgit v1.2.3-70-g09d2 From 4ce7823359a144973c667f488e5fa39e5d3f8acc Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Fri, 21 Feb 2020 17:21:21 -0700 Subject: Remove unused function --- ml_exp/compound.py | 3 +++ ml_exp/read_qm7_data.py | 34 ---------------------------------- 2 files changed, 3 insertions(+), 34 deletions(-) diff --git a/ml_exp/compound.py b/ml_exp/compound.py index cfa3e0322..9536767a7 100644 --- a/ml_exp/compound.py +++ b/ml_exp/compound.py @@ -41,6 +41,9 @@ class Compound: if xyz is not None: self.read_xyz(xyz) + def gen_cm(self): + pass + def read_xyz(self, filename): """ Reads an xyz file and adds the corresponding data to the Compound. diff --git a/ml_exp/read_qm7_data.py b/ml_exp/read_qm7_data.py index 63c48195c..666f9c3fc 100644 --- a/ml_exp/read_qm7_data.py +++ b/ml_exp/read_qm7_data.py @@ -21,10 +21,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import os -import time import numpy as np import random -from ml_exp.misc import printc # 'periodic_table_of_elements.txt' retrieved from @@ -139,35 +137,3 @@ def read_db_data(zi_data, if return_atoms: return molecules, nuclear_charge, energy_pbe0, energy_delta, atoms return molecules, nuclear_charge, energy_pbe0, energy_delta - - -def read_qm7_data(data_path='data', - r_seed=111, - return_atoms=False): - """ - Reads all the qm7 data. - data_path: path to the data directory. - r_seed: random seed to use for the shuffling. - return_atoms: if atom list should be returned. - """ - tic = time.perf_counter() - printc('Data reading started.', 'CYAN') - - init_path = os.getcwd() - os.chdir(data_path) - data_path = os.getcwd() - - zi_data = read_nc_data(data_path) - if return_atoms: - molecules, nuclear_charge, energy_pbe0, energy_delta, atoms = \ - read_db_data(zi_data, data_path, r_seed, return_atoms) - else: - molecules, nuclear_charge, energy_pbe0, energy_delta = \ - read_db_data(zi_data, data_path, r_seed) - - os.chdir(init_path) - toc = time.perf_counter() - printc('\tData reading took {:.4f} seconds.'.format(toc-tic), 'GREEN') - if return_atoms: - return molecules, nuclear_charge, energy_pbe0, energy_delta, atoms - return molecules, nuclear_charge, energy_pbe0, energy_delta -- cgit v1.2.3-70-g09d2 From 2dd24b5f7dfa5c43acd8e4281dab22f178a5019e Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Fri, 21 Feb 2020 17:25:25 -0700 Subject: Remove unused function --- ml_exp/c_matrix.py | 33 --------------------------------- 1 file changed, 33 deletions(-) diff --git a/ml_exp/c_matrix.py b/ml_exp/c_matrix.py index 00034a660..ac942d473 100644 --- a/ml_exp/c_matrix.py +++ b/ml_exp/c_matrix.py @@ -20,11 +20,9 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import time import math import numpy as np from numpy.linalg import eig -from ml_exp.misc import printc def c_matrix(mol_data, @@ -146,34 +144,3 @@ def c_matrix(mol_data, return np.sort(eig(cm)[0])[::-1] else: return cm - - -def c_matrix_multiple(mol_data, - nc_data, - pipe=None, - max_len=25, - as_eig=True, - bohr_radius_units=False): - """ - Calculates the Coulomb Matrix of multiple molecules. - mol_data: molecule data, matrix of atom coordinates. - nc_data: nuclear charge data, array of atom data. - pipe: for multiprocessing purposes. Sends the data calculated - through a pipe. - max_len: maximum amount of atoms in molecule. - as_eig: if data should be returned as matrix or array of eigenvalues. - bohr_radius_units: if units should be in bohr's radius units. - """ - printc('Coulomb Matrices calculation started.', 'CYAN') - tic = time.perf_counter() - - cm_data = np.array([c_matrix(mol, nc, max_len, as_eig, bohr_radius_units) - for mol, nc in zip(mol_data, nc_data)]) - - toc = time.perf_counter() - printc('\tCM calculation took {:.4f} seconds.'.format(toc - tic), 'GREEN') - - if pipe: - pipe.send(cm_data) - - return cm_data -- cgit v1.2.3-70-g09d2 From f6fd349a822d6dd2f7172a90c6c35d1eb36f5c95 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Fri, 21 Feb 2020 17:26:26 -0700 Subject: Move cmatrix to representations --- ml_exp/c_matrix.py | 146 ----------------------------------------------------- 1 file changed, 146 deletions(-) delete mode 100644 ml_exp/c_matrix.py diff --git a/ml_exp/c_matrix.py b/ml_exp/c_matrix.py deleted file mode 100644 index ac942d473..000000000 --- a/ml_exp/c_matrix.py +++ /dev/null @@ -1,146 +0,0 @@ -"""MIT License - -Copyright (c) 2019 David Luevano Alvarado - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -import math -import numpy as np -from numpy.linalg import eig - - -def c_matrix(mol_data, - nc_data, - max_len=25, - as_eig=True, - bohr_radius_units=False): - """ - Creates the Coulomb Matrix from the molecule data given. - mol_data: molecule data, matrix of atom coordinates. - nc_data: nuclear charge data, array of atom data. - max_len: maximum amount of atoms in molecule. - as_eig: if data should be returned as matrix or array of eigenvalues. - bohr_radius_units: if units should be in bohr's radius units. - """ - if bohr_radius_units: - conversion_rate = 0.52917721067 - else: - conversion_rate = 1 - - mol_n = len(mol_data) - mol_nr = range(mol_n) - - if not mol_n == len(nc_data): - print(''.join(['Error. Molecule matrix dimension is different ', - 'than the nuclear charge array dimension.'])) - else: - if max_len < mol_n: - print(''.join(['Error. Molecule matrix dimension (mol_n) is ', - 'greater than max_len. Using mol_n.'])) - max_len = None - - if max_len: - cm = np.zeros((max_len, max_len)) - ml_r = range(max_len) - - # Actual calculation of the coulomb matrix. - for i in ml_r: - if i < mol_n: - x_i = mol_data[i, 0] - y_i = mol_data[i, 1] - z_i = mol_data[i, 2] - Z_i = nc_data[i] - else: - break - - for j in ml_r: - if j < mol_n: - x_j = mol_data[j, 0] - y_j = mol_data[j, 1] - z_j = mol_data[j, 2] - Z_j = nc_data[j] - - x = (x_i-x_j)**2 - y = (y_i-y_j)**2 - z = (z_i-z_j)**2 - - if i == j: - cm[i, j] = (0.5*Z_i**2.4) - else: - cm[i, j] = (conversion_rate*Z_i*Z_j/math.sqrt(x - + y - + z)) - else: - break - - # Now the value will be returned. - if as_eig: - cm_sorted = np.sort(eig(cm)[0])[::-1] - # Thanks to SO for the following lines of code. - # https://stackoverflow.com/a/43011036 - - # Keep zeros at the end. - mask = cm_sorted != 0. - f_mask = mask.sum(0, keepdims=1) >\ - np.arange(cm_sorted.shape[0]-1, -1, -1) - - f_mask = f_mask[::-1] - cm_sorted[f_mask] = cm_sorted[mask] - cm_sorted[~f_mask] = 0. - - return cm_sorted - - else: - return cm - - else: - cm_temp = [] - # Actual calculation of the coulomb matrix. - for i in mol_nr: - x_i = mol_data[i, 0] - y_i = mol_data[i, 1] - z_i = mol_data[i, 2] - Z_i = nc_data[i] - - cm_row = [] - for j in mol_nr: - x_j = mol_data[j, 0] - y_j = mol_data[j, 1] - z_j = mol_data[j, 2] - Z_j = nc_data[j] - - x = (x_i-x_j)**2 - y = (y_i-y_j)**2 - z = (z_i-z_j)**2 - - if i == j: - cm_row.append(0.5*Z_i**2.4) - else: - cm_row.append(conversion_rate*Z_i*Z_j/math.sqrt(x - + y - + z)) - - cm_temp.append(np.array(cm_row)) - - cm = np.array(cm_temp) - # Now the value will be returned. - if as_eig: - return np.sort(eig(cm)[0])[::-1] - else: - return cm -- cgit v1.2.3-70-g09d2 From d6b381e1ea629879b9855989b0f753632ea9df2a Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Fri, 21 Feb 2020 17:33:41 -0700 Subject: Refactor code, size is 23 not 25 --- ml_exp/representations.py | 145 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 ml_exp/representations.py diff --git a/ml_exp/representations.py b/ml_exp/representations.py new file mode 100644 index 000000000..830e51707 --- /dev/null +++ b/ml_exp/representations.py @@ -0,0 +1,145 @@ +"""MIT License + +Copyright (c) 2019 David Luevano Alvarado + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +import math +import numpy as np + + +def coulomb_matrix(coords, + nc, + size=23, + as_eig=True, + bohr_radius_units=False): + """ + Creates the Coulomb Matrix from the molecule data given. + coords: compound coordinates. + nc: nuclear charge data. + size: compound size. + as_eig: if data should be returned as matrix or array of eigenvalues. + bohr_radius_units: if units should be in bohr's radius units. + """ + if bohr_radius_units: + conversion_rate = 0.52917721067 + else: + conversion_rate = 1 + + mol_n = len(coords) + mol_nr = range(mol_n) + + if not mol_n == len(nc): + print(''.join(['Error. Molecule matrix dimension is different ', + 'than the nuclear charge array dimension.'])) + else: + if size < mol_n: + print(''.join(['Error. Molecule matrix dimension (mol_n) is ', + 'greater than size. Using mol_n.'])) + size = None + + if size: + cm = np.zeros((size, size)) + ml_r = range(size) + + # Actual calculation of the coulomb matrix. + for i in ml_r: + if i < mol_n: + x_i = coords[i, 0] + y_i = coords[i, 1] + z_i = coords[i, 2] + Z_i = nc[i] + else: + break + + for j in ml_r: + if j < mol_n: + x_j = coords[j, 0] + y_j = coords[j, 1] + z_j = coords[j, 2] + Z_j = nc[j] + + x = (x_i-x_j)**2 + y = (y_i-y_j)**2 + z = (z_i-z_j)**2 + + if i == j: + cm[i, j] = (0.5*Z_i**2.4) + else: + cm[i, j] = (conversion_rate*Z_i*Z_j/math.sqrt(x + + y + + z)) + else: + break + + # Now the value will be returned. + if as_eig: + cm_sorted = np.sort(np.linalg.eig(cm)[0])[::-1] + # Thanks to SO for the following lines of code. + # https://stackoverflow.com/a/43011036 + + # Keep zeros at the end. + mask = cm_sorted != 0. + f_mask = mask.sum(0, keepdims=1) >\ + np.arange(cm_sorted.shape[0]-1, -1, -1) + + f_mask = f_mask[::-1] + cm_sorted[f_mask] = cm_sorted[mask] + cm_sorted[~f_mask] = 0. + + return cm_sorted + + else: + return cm + + else: + cm_temp = [] + # Actual calculation of the coulomb matrix. + for i in mol_nr: + x_i = coords[i, 0] + y_i = coords[i, 1] + z_i = coords[i, 2] + Z_i = nc[i] + + cm_row = [] + for j in mol_nr: + x_j = coords[j, 0] + y_j = coords[j, 1] + z_j = coords[j, 2] + Z_j = nc[j] + + x = (x_i-x_j)**2 + y = (y_i-y_j)**2 + z = (z_i-z_j)**2 + + if i == j: + cm_row.append(0.5*Z_i**2.4) + else: + cm_row.append(conversion_rate*Z_i*Z_j/math.sqrt(x + + y + + z)) + + cm_temp.append(np.array(cm_row)) + + cm = np.array(cm_temp) + # Now the value will be returned. + if as_eig: + return np.sort(np.linalg.eig(cm)[0])[::-1] + else: + return cm -- cgit v1.2.3-70-g09d2 From 02cb17d411e9a1bc2fe5c6bcf75695f23f340648 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Fri, 21 Feb 2020 17:44:34 -0700 Subject: Add exception --- ml_exp/representations.py | 158 +++++++++++++++++++++++----------------------- 1 file changed, 79 insertions(+), 79 deletions(-) diff --git a/ml_exp/representations.py b/ml_exp/representations.py index 830e51707..50b78e548 100644 --- a/ml_exp/representations.py +++ b/ml_exp/representations.py @@ -42,83 +42,34 @@ def coulomb_matrix(coords, else: conversion_rate = 1 - mol_n = len(coords) - mol_nr = range(mol_n) + n = coords.shape[0] + nr = range(n) - if not mol_n == len(nc): - print(''.join(['Error. Molecule matrix dimension is different ', - 'than the nuclear charge array dimension.'])) - else: - if size < mol_n: - print(''.join(['Error. Molecule matrix dimension (mol_n) is ', - 'greater than size. Using mol_n.'])) - size = None - - if size: - cm = np.zeros((size, size)) - ml_r = range(size) - - # Actual calculation of the coulomb matrix. - for i in ml_r: - if i < mol_n: - x_i = coords[i, 0] - y_i = coords[i, 1] - z_i = coords[i, 2] - Z_i = nc[i] - else: - break - - for j in ml_r: - if j < mol_n: - x_j = coords[j, 0] - y_j = coords[j, 1] - z_j = coords[j, 2] - Z_j = nc[j] - - x = (x_i-x_j)**2 - y = (y_i-y_j)**2 - z = (z_i-z_j)**2 - - if i == j: - cm[i, j] = (0.5*Z_i**2.4) - else: - cm[i, j] = (conversion_rate*Z_i*Z_j/math.sqrt(x - + y - + z)) - else: - break - - # Now the value will be returned. - if as_eig: - cm_sorted = np.sort(np.linalg.eig(cm)[0])[::-1] - # Thanks to SO for the following lines of code. - # https://stackoverflow.com/a/43011036 + if not n == nc.shape[0]: + raise ValueError('Compound size is different than the nuclear charge\ + size. Arrays are not of the right shape.') - # Keep zeros at the end. - mask = cm_sorted != 0. - f_mask = mask.sum(0, keepdims=1) >\ - np.arange(cm_sorted.shape[0]-1, -1, -1) + if size < n: + print(''.join(['Error. Molecule matrix dimension (n) is ', + 'greater than size. Using n.'])) + size = None - f_mask = f_mask[::-1] - cm_sorted[f_mask] = cm_sorted[mask] - cm_sorted[~f_mask] = 0. + if size: + cm = np.zeros((size, size)) + ml_r = range(size) - return cm_sorted - - else: - return cm - - else: - cm_temp = [] - # Actual calculation of the coulomb matrix. - for i in mol_nr: + # Actual calculation of the coulomb matrix. + for i in ml_r: + if i < n: x_i = coords[i, 0] y_i = coords[i, 1] z_i = coords[i, 2] Z_i = nc[i] + else: + break - cm_row = [] - for j in mol_nr: + for j in ml_r: + if j < n: x_j = coords[j, 0] y_j = coords[j, 1] z_j = coords[j, 2] @@ -129,17 +80,66 @@ def coulomb_matrix(coords, z = (z_i-z_j)**2 if i == j: - cm_row.append(0.5*Z_i**2.4) + cm[i, j] = (0.5*Z_i**2.4) else: - cm_row.append(conversion_rate*Z_i*Z_j/math.sqrt(x - + y - + z)) + cm[i, j] = (conversion_rate*Z_i*Z_j/math.sqrt(x + + y + + z)) + else: + break - cm_temp.append(np.array(cm_row)) + # Now the value will be returned. + if as_eig: + cm_sorted = np.sort(np.linalg.eig(cm)[0])[::-1] + # Thanks to SO for the following lines of code. + # https://stackoverflow.com/a/43011036 - cm = np.array(cm_temp) - # Now the value will be returned. - if as_eig: - return np.sort(np.linalg.eig(cm)[0])[::-1] - else: - return cm + # Keep zeros at the end. + mask = cm_sorted != 0. + f_mask = mask.sum(0, keepdims=1) >\ + np.arange(cm_sorted.shape[0]-1, -1, -1) + + f_mask = f_mask[::-1] + cm_sorted[f_mask] = cm_sorted[mask] + cm_sorted[~f_mask] = 0. + + return cm_sorted + + else: + return cm + + else: + cm_temp = [] + # Actual calculation of the coulomb matrix. + for i in nr: + x_i = coords[i, 0] + y_i = coords[i, 1] + z_i = coords[i, 2] + Z_i = nc[i] + + cm_row = [] + for j in nr: + x_j = coords[j, 0] + y_j = coords[j, 1] + z_j = coords[j, 2] + Z_j = nc[j] + + x = (x_i-x_j)**2 + y = (y_i-y_j)**2 + z = (z_i-z_j)**2 + + if i == j: + cm_row.append(0.5*Z_i**2.4) + else: + cm_row.append(conversion_rate*Z_i*Z_j/math.sqrt(x + + y + + z)) + + cm_temp.append(np.array(cm_row)) + + cm = np.array(cm_temp) + # Now the value will be returned. + if as_eig: + return np.sort(np.linalg.eig(cm)[0])[::-1] + else: + return cm -- cgit v1.2.3-70-g09d2 From bb5e608f0ca6491e1abfd7fd25d2f28ff6c96e6b Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Fri, 21 Feb 2020 17:58:13 -0700 Subject: Rewrite cmatrix --- ml_exp/representations.py | 111 +++++++++++++++------------------------------- 1 file changed, 35 insertions(+), 76 deletions(-) diff --git a/ml_exp/representations.py b/ml_exp/representations.py index 50b78e548..bbc538ae6 100644 --- a/ml_exp/representations.py +++ b/ml_exp/representations.py @@ -38,87 +38,36 @@ def coulomb_matrix(coords, bohr_radius_units: if units should be in bohr's radius units. """ if bohr_radius_units: - conversion_rate = 0.52917721067 + cr = 0.52917721067 else: - conversion_rate = 1 + cr = 1 n = coords.shape[0] - nr = range(n) if not n == nc.shape[0]: raise ValueError('Compound size is different than the nuclear charge\ size. Arrays are not of the right shape.') if size < n: - print(''.join(['Error. Molecule matrix dimension (n) is ', - 'greater than size. Using n.'])) - size = None - - if size: - cm = np.zeros((size, size)) - ml_r = range(size) - - # Actual calculation of the coulomb matrix. - for i in ml_r: - if i < n: - x_i = coords[i, 0] - y_i = coords[i, 1] - z_i = coords[i, 2] - Z_i = nc[i] - else: - break - - for j in ml_r: - if j < n: - x_j = coords[j, 0] - y_j = coords[j, 1] - z_j = coords[j, 2] - Z_j = nc[j] - - x = (x_i-x_j)**2 - y = (y_i-y_j)**2 - z = (z_i-z_j)**2 - - if i == j: - cm[i, j] = (0.5*Z_i**2.4) - else: - cm[i, j] = (conversion_rate*Z_i*Z_j/math.sqrt(x - + y - + z)) - else: - break - - # Now the value will be returned. - if as_eig: - cm_sorted = np.sort(np.linalg.eig(cm)[0])[::-1] - # Thanks to SO for the following lines of code. - # https://stackoverflow.com/a/43011036 + print('Error. Compound size (n) is greater han (size). Using (n)\ + instead of (size).') + size = n - # Keep zeros at the end. - mask = cm_sorted != 0. - f_mask = mask.sum(0, keepdims=1) >\ - np.arange(cm_sorted.shape[0]-1, -1, -1) - - f_mask = f_mask[::-1] - cm_sorted[f_mask] = cm_sorted[mask] - cm_sorted[~f_mask] = 0. - - return cm_sorted - - else: - return cm + nr = range(size) + cm = np.empty((size, size), dtype=float) - else: - cm_temp = [] - # Actual calculation of the coulomb matrix. - for i in nr: + # Actual calculation of the coulomb matrix. + for i in nr: + if i < n: x_i = coords[i, 0] y_i = coords[i, 1] z_i = coords[i, 2] Z_i = nc[i] + else: + break - cm_row = [] - for j in nr: + for j in nr: + if j < n: x_j = coords[j, 0] y_j = coords[j, 1] z_j = coords[j, 2] @@ -129,17 +78,27 @@ def coulomb_matrix(coords, z = (z_i-z_j)**2 if i == j: - cm_row.append(0.5*Z_i**2.4) + cm[i, j] = (0.5*Z_i**2.4) else: - cm_row.append(conversion_rate*Z_i*Z_j/math.sqrt(x - + y - + z)) + cm[i, j] = (cr*Z_i*Z_j/math.sqrt(x + y + z)) + else: + break - cm_temp.append(np.array(cm_row)) + # Now the value will be returned. + if as_eig: + cm_sorted = np.sort(np.linalg.eig(cm)[0])[::-1] + # Thanks to SO for the following lines of code. + # https://stackoverflow.com/a/43011036 - cm = np.array(cm_temp) - # Now the value will be returned. - if as_eig: - return np.sort(np.linalg.eig(cm)[0])[::-1] - else: - return cm + # Keep zeros at the end. + mask = cm_sorted != 0. + f_mask = mask.sum(0, keepdims=1) >\ + np.arange(cm_sorted.shape[0]-1, -1, -1) + + f_mask = f_mask[::-1] + cm_sorted[f_mask] = cm_sorted[mask] + cm_sorted[~f_mask] = 0. + + return cm_sorted + else: + return cm -- cgit v1.2.3-70-g09d2 From efaa8eeb58d8e19d32f94ec4e91ec794574dfd32 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Fri, 21 Feb 2020 18:13:33 -0700 Subject: Coulomb matrix done, fix init --- ml_exp/__init__.py | 27 ++++----------------------- ml_exp/compound.py | 28 ++++++++++++++++++++++++---- ml_exp/representations.py | 10 +++++----- 3 files changed, 33 insertions(+), 32 deletions(-) diff --git a/ml_exp/__init__.py b/ml_exp/__init__.py index 06425f6a2..8e803b37e 100644 --- a/ml_exp/__init__.py +++ b/ml_exp/__init__.py @@ -20,29 +20,10 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -from ml_exp.read_qm7_data import read_nc_data, read_db_data, read_qm7_data -from ml_exp.c_matrix import c_matrix, c_matrix_multiple -from ml_exp.lj_matrix import lj_matrix, lj_matrix_multiple -from ml_exp.frob_norm import frob_norm -from ml_exp.gauss_kernel import gauss_kernel -from ml_exp.cholesky_solve import cholesky_solve -from ml_exp.do_ml import do_ml -from ml_exp.parallel_create_matrices import parallel_create_matrices -from ml_exp.misc import plot_benchmarks - +from ml_exp.compound import Compound +from ml_exp.representations import coulomb_matrix # If somebody does "from package import *", this is what they will # be able to access: -__all__ = ['read_nc_data', - 'read_db_data', - 'read_qm7_data', - 'c_matrix', - 'c_matrix_multiple', - 'lj_matrix', - 'lj_matrix_multiple', - 'frob_norm', - 'gauss_kernel', - 'cholesky_solve', - 'do_ml', - 'parallel_create_matrices', - 'plot_benchmarks'] +__all__ = ['Compound', + 'coulomb_matrix'] diff --git a/ml_exp/compound.py b/ml_exp/compound.py index 9536767a7..3ea377826 100644 --- a/ml_exp/compound.py +++ b/ml_exp/compound.py @@ -22,10 +22,16 @@ SOFTWARE. """ import numpy as np from ml_exp.data import NUCLEAR_CHARGE +from ml_exp.representations import coulomb_matrix class Compound: - def __init__(self, xyz=None): + def __init__(self, + xyz=None): + """ + Initialization of the Compound. + xyz: (path to) the xyz file. + """ # empty_array = np.asarray([], dtype=float) self.n = None @@ -41,10 +47,24 @@ class Compound: if xyz is not None: self.read_xyz(xyz) - def gen_cm(self): - pass + def gen_cm(self, + size=23, + as_eig=True, + bohr_ru=False): + """ + Generate the Coulomb Matrix for the compund. + size: compound size. + as_eig: if the representation should be as the eigenvalues. + bhor_ru: if radius units should be in bohr's radius units. + """ + self.coulomb_matrix = coulomb_matrix(self.coordinates, + self.atoms_nc, + size=size, + as_eig=as_eig, + bhor_ru=bohr_ru) - def read_xyz(self, filename): + def read_xyz(self, + filename): """ Reads an xyz file and adds the corresponding data to the Compound. filename: (path to) the xyz file. diff --git a/ml_exp/representations.py b/ml_exp/representations.py index bbc538ae6..c503c310d 100644 --- a/ml_exp/representations.py +++ b/ml_exp/representations.py @@ -28,16 +28,16 @@ def coulomb_matrix(coords, nc, size=23, as_eig=True, - bohr_radius_units=False): + bhor_ru=False): """ Creates the Coulomb Matrix from the molecule data given. coords: compound coordinates. nc: nuclear charge data. size: compound size. - as_eig: if data should be returned as matrix or array of eigenvalues. - bohr_radius_units: if units should be in bohr's radius units. + as_eig: if the representation should be as the eigenvalues. + bhor_ru: if radius units should be in bohr's radius units. """ - if bohr_radius_units: + if bhor_ru: cr = 0.52917721067 else: cr = 1 @@ -54,7 +54,7 @@ def coulomb_matrix(coords, size = n nr = range(size) - cm = np.empty((size, size), dtype=float) + cm = np.zeros((size, size), dtype=float) # Actual calculation of the coulomb matrix. for i in nr: -- cgit v1.2.3-70-g09d2 From 8a108f8757027750f825c6ef9dd55adb4c09def0 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Fri, 21 Feb 2020 18:20:22 -0700 Subject: Remove unused function --- ml_exp/lj_matrix.py | 46 ---------------------------------------------- 1 file changed, 46 deletions(-) diff --git a/ml_exp/lj_matrix.py b/ml_exp/lj_matrix.py index 549790755..5e3e89678 100644 --- a/ml_exp/lj_matrix.py +++ b/ml_exp/lj_matrix.py @@ -20,11 +20,9 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import time import math import numpy as np from numpy.linalg import eig -from ml_exp.misc import printc def lj_matrix(mol_data, @@ -176,47 +174,3 @@ def lj_matrix(mol_data, return np.sort(eig(lj)[0])[::-1] else: return lj - - -def lj_matrix_multiple(mol_data, - nc_data, - pipe=None, - diag_value=None, - sigma=1.0, - epsilon=1.0, - max_len=25, - as_eig=True, - bohr_radius_units=False): - """ - Calculates the Lennard-Jones Matrix of multiple molecules. - mol_data: molecule data, matrix of atom coordinates. - nc_data: nuclear charge data, array of atom data. - pipe: for multiprocessing purposes. Sends the data calculated - through a pipe. - diag_value: if special diagonal value is to be used. - sigma: sigma value. - epsilon: epsilon value. - max_len: maximum amount of atoms in molecule. - as_eig: if data should be returned as matrix or array of eigenvalues. - bohr_radius_units: if units should be in bohr's radius units. - """ - printc('L-J Matrices calculation started.', 'CYAN') - tic = time.perf_counter() - - ljm_data = np.array([lj_matrix(mol, - nc, - diag_value, - sigma, - epsilon, - max_len, - as_eig, - bohr_radius_units) - for mol, nc in zip(mol_data, nc_data)]) - - toc = time.perf_counter() - printc('\tL-JM calculation took {:.4f} seconds.'.format(toc-tic), 'GREEN') - - if pipe: - pipe.send(ljm_data) - - return ljm_data -- cgit v1.2.3-70-g09d2 From a2463470b639a743189deece071654ed03791c3c Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Fri, 21 Feb 2020 19:00:34 -0700 Subject: Moved ljmatrix to representations, rewrote code --- ml_exp/lj_matrix.py | 176 ---------------------------------------------- ml_exp/representations.py | 102 +++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 176 deletions(-) delete mode 100644 ml_exp/lj_matrix.py diff --git a/ml_exp/lj_matrix.py b/ml_exp/lj_matrix.py deleted file mode 100644 index 5e3e89678..000000000 --- a/ml_exp/lj_matrix.py +++ /dev/null @@ -1,176 +0,0 @@ -"""MIT License - -Copyright (c) 2019 David Luevano Alvarado - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -import math -import numpy as np -from numpy.linalg import eig - - -def lj_matrix(mol_data, - nc_data, - diag_value=None, - sigma=1.0, - epsilon=1.0, - max_len=25, - as_eig=True, - bohr_radius_units=False): - """ - Creates the Lennard-Jones Matrix from the molecule data given. - mol_data: molecule data, matrix of atom coordinates. - nc_data: nuclear charge data, array of atom data. - diag_value: if special diagonal value is to be used. - sigma: sigma value. - epsilon: epsilon value. - max_len: maximum amount of atoms in molecule. - as_eig: if data should be returned as matrix or array of eigenvalues. - bohr_radius_units: if units should be in bohr's radius units. - """ - if bohr_radius_units: - conversion_rate = 0.52917721067 - else: - conversion_rate = 1 - - mol_n = len(mol_data) - mol_nr = range(mol_n) - - if not mol_n == len(nc_data): - print(''.join(['Error. Molecule matrix dimension is different ', - 'than the nuclear charge array dimension.'])) - else: - if max_len < mol_n: - print(''.join(['Error. Molecule matrix dimension (mol_n) is ', - 'greater than max_len. Using mol_n.'])) - max_len = None - - if max_len: - lj = np.zeros((max_len, max_len)) - ml_r = range(max_len) - - # Actual calculation of the coulomb matrix. - for i in ml_r: - if i < mol_n: - x_i = mol_data[i, 0] - y_i = mol_data[i, 1] - z_i = mol_data[i, 2] - Z_i = nc_data[i] - else: - break - - for j in ml_r: - if j < mol_n: - x_j = mol_data[j, 0] - y_j = mol_data[j, 1] - z_j = mol_data[j, 2] - - x = (x_i-x_j)**2 - y = (y_i-y_j)**2 - z = (z_i-z_j)**2 - - if i == j: - if diag_value is None: - lj[i, j] = (0.5*Z_i**2.4) - else: - lj[i, j] = diag_value - else: - # Calculations are done after i==j is checked - # so no division by zero is done. - - # A little play with r exponents - # so no square root is calculated. - # Conversion factor is included in r^2. - - # 1/r^2 - r_2 = sigma**2/(conversion_rate**2*(x + y + z)) - - r_6 = math.pow(r_2, 3) - r_12 = math.pow(r_6, 2) - lj[i, j] = (4*epsilon*(r_12 - r_6)) - else: - break - - # Now the value will be returned. - if as_eig: - lj_sorted = np.sort(eig(lj)[0])[::-1] - # Thanks to SO for the following lines of code. - # https://stackoverflow.com/a/43011036 - - # Keep zeros at the end. - mask = lj_sorted != 0. - f_mask = mask.sum(0, keepdims=1) >\ - np.arange(lj_sorted.shape[0]-1, -1, -1) - - f_mask = f_mask[::-1] - lj_sorted[f_mask] = lj_sorted[mask] - lj_sorted[~f_mask] = 0. - - return lj_sorted - - else: - return lj - - else: - lj_temp = [] - # Actual calculation of the coulomb matrix. - for i in mol_nr: - x_i = mol_data[i, 0] - y_i = mol_data[i, 1] - z_i = mol_data[i, 2] - Z_i = nc_data[i] - - lj_row = [] - for j in mol_nr: - x_j = mol_data[j, 0] - y_j = mol_data[j, 1] - z_j = mol_data[j, 2] - - x = (x_i-x_j)**2 - y = (y_i-y_j)**2 - z = (z_i-z_j)**2 - - if i == j: - if not diag_value: - lj_row.append(0.5*Z_i**2.4) - else: - lj_row.append(diag_value) - else: - # Calculations are done after i==j is checked - # so no division by zero is done. - - # A little play with r exponents - # so no square root is calculated. - # Conversion factor is included in r^2. - - # 1/r^2 - r_2 = sigma**2/(conversion_rate**2*(x + y + z)) - - r_6 = math.pow(r_2, 3) - r_12 = math.pow(r_6, 2) - lj_row.append(4*epsilon*(r_12 - r_6)) - - lj_temp.append(np.array(lj_row)) - - lj = np.array(lj_temp) - # Now the value will be returned. - if as_eig: - return np.sort(eig(lj)[0])[::-1] - else: - return lj diff --git a/ml_exp/representations.py b/ml_exp/representations.py index c503c310d..b4edef8a5 100644 --- a/ml_exp/representations.py +++ b/ml_exp/representations.py @@ -102,3 +102,105 @@ def coulomb_matrix(coords, return cm_sorted else: return cm + + +def lennard_jones_matrix(coords, + nc, + diag_value=None, + sigma=1.0, + epsilon=1.0, + size=23, + as_eig=True, + bhor_ru=False): + """ + Creates the Lennard-Jones Matrix from the molecule data given. + coords: compound coordinates. + nc: nuclear charge data. + diag_value: if special diagonal value is to be used. + sigma: sigma value. + epsilon: epsilon value. + size: compound size. + as_eig: if the representation should be as the eigenvalues. + bhor_ru: if radius units should be in bohr's radius units. + """ + if bhor_ru: + cr = 0.52917721067 + else: + cr = 1 + + n = coords.shape[0] + + if not n == nc.shape[0]: + raise ValueError('Compound size is different than the nuclear charge\ + size. Arrays are not of the right shape.') + + if size < n: + print('Error. Compound size (n) is greater han (size). Using (n)\ + instead of (size).') + size = n + + nr = range(size) + lj = np.zeros((size, size), dtype=float) + + # Actual calculation of the lennard-jones matrix. + for i in nr: + if i < n: + x_i = coords[i, 0] + y_i = coords[i, 1] + z_i = coords[i, 2] + Z_i = nc[i] + else: + break + + for j in nr: + if j < n: + x_j = coords[j, 0] + y_j = coords[j, 1] + z_j = coords[j, 2] + # Never really used because when needed it is equal to Z_i. + # Z_j = nc[j] + + x = (x_i-x_j)**2 + y = (y_i-y_j)**2 + z = (z_i-z_j)**2 + + if i == j: + if diag_value is None: + lj[i, j] = (0.5*Z_i**2.4) + else: + lj[i, j] = diag_value + else: + # Calculations are done after i==j is checked + # so no division by zero is done. + + # A little play with r exponents + # so no square root is calculated. + # Conversion factor is included in r^2. + + # 1/r^2 + r_2 = sigma**2/(cr**2*(x + y + z)) + + r_6 = math.pow(r_2, 3) + r_12 = math.pow(r_6, 2) + lj[i, j] = (4*epsilon*(r_12 - r_6)) + else: + break + + # Now the value will be returned. + if as_eig: + lj_sorted = np.sort(np.linalg.eig(lj)[0])[::-1] + # Thanks to SO for the following lines of code. + # https://stackoverflow.com/a/43011036 + + # Keep zeros at the end. + mask = lj_sorted != 0. + f_mask = mask.sum(0, keepdims=1) >\ + np.arange(lj_sorted.shape[0]-1, -1, -1) + + f_mask = f_mask[::-1] + lj_sorted[f_mask] = lj_sorted[mask] + lj_sorted[~f_mask] = 0. + + return lj_sorted + else: + return lj -- cgit v1.2.3-70-g09d2 From 5ba008a928d34c49e431f32b5161f624d780c254 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Fri, 21 Feb 2020 19:06:36 -0700 Subject: Add ljmatrix to compound --- ml_exp/compound.py | 41 +++++++++++++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/ml_exp/compound.py b/ml_exp/compound.py index 3ea377826..d74f1230a 100644 --- a/ml_exp/compound.py +++ b/ml_exp/compound.py @@ -22,7 +22,7 @@ SOFTWARE. """ import numpy as np from ml_exp.data import NUCLEAR_CHARGE -from ml_exp.representations import coulomb_matrix +from ml_exp.representations import coulomb_matrix, lennard_jones_matrix class Compound: @@ -40,8 +40,8 @@ class Compound: self.coordinates = None self.energy = None - self.coulomb_matrix = None - self.lennard_jones_matrix = None + self.c_matrix = None + self.lj_matrix = None self.bob = None if xyz is not None: @@ -57,11 +57,36 @@ class Compound: as_eig: if the representation should be as the eigenvalues. bhor_ru: if radius units should be in bohr's radius units. """ - self.coulomb_matrix = coulomb_matrix(self.coordinates, - self.atoms_nc, - size=size, - as_eig=as_eig, - bhor_ru=bohr_ru) + self.c_matrix = coulomb_matrix(self.coordinates, + self.atoms_nc, + size=size, + as_eig=as_eig, + bhor_ru=bohr_ru) + + def gen_lj(self, + diag_value=None, + sigma=1.0, + epsilon=1.0, + size=23, + as_eig=True, + bohr_ru=False): + """ + Generate the Lennard-Jones Matrix for the compund. + diag_value: if special diagonal value is to be used. + sigma: sigma value. + epsilon: epsilon value. + size: compound size. + as_eig: if the representation should be as the eigenvalues. + bhor_ru: if radius units should be in bohr's radius units. + """ + self.lj_matrix = lennard_jones_matrix(self.coordinates, + self.atoms_nc, + diag_value=diag_value, + sigma=sigma, + epsilon=epsilon, + size=size, + as_eig=as_eig, + bhor_ru=bohr_ru) def read_xyz(self, filename): -- cgit v1.2.3-70-g09d2 From 05d97a15d822400f7018a4381593c2638c6ad577 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Fri, 21 Feb 2020 19:18:24 -0700 Subject: Refactor code --- ml_exp/compound.py | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/ml_exp/compound.py b/ml_exp/compound.py index d74f1230a..c6d5f8a64 100644 --- a/ml_exp/compound.py +++ b/ml_exp/compound.py @@ -40,8 +40,8 @@ class Compound: self.coordinates = None self.energy = None - self.c_matrix = None - self.lj_matrix = None + self.cm = None + self.ljm = None self.bob = None if xyz is not None: @@ -57,19 +57,19 @@ class Compound: as_eig: if the representation should be as the eigenvalues. bhor_ru: if radius units should be in bohr's radius units. """ - self.c_matrix = coulomb_matrix(self.coordinates, - self.atoms_nc, - size=size, - as_eig=as_eig, - bhor_ru=bohr_ru) + self.cm = coulomb_matrix(self.coordinates, + self.atoms_nc, + size=size, + as_eig=as_eig, + bhor_ru=bohr_ru) - def gen_lj(self, - diag_value=None, - sigma=1.0, - epsilon=1.0, - size=23, - as_eig=True, - bohr_ru=False): + def gen_ljm(self, + diag_value=None, + sigma=1.0, + epsilon=1.0, + size=23, + as_eig=True, + bohr_ru=False): """ Generate the Lennard-Jones Matrix for the compund. diag_value: if special diagonal value is to be used. @@ -79,14 +79,14 @@ class Compound: as_eig: if the representation should be as the eigenvalues. bhor_ru: if radius units should be in bohr's radius units. """ - self.lj_matrix = lennard_jones_matrix(self.coordinates, - self.atoms_nc, - diag_value=diag_value, - sigma=sigma, - epsilon=epsilon, - size=size, - as_eig=as_eig, - bhor_ru=bohr_ru) + self.ljm = lennard_jones_matrix(self.coordinates, + self.atoms_nc, + diag_value=diag_value, + sigma=sigma, + epsilon=epsilon, + size=size, + as_eig=as_eig, + bhor_ru=bohr_ru) def read_xyz(self, filename): -- cgit v1.2.3-70-g09d2 From a7d605f094e81c5139530560e26eb0ff9be8ec82 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Fri, 21 Feb 2020 19:21:18 -0700 Subject: Add ljm to init --- ml_exp/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ml_exp/__init__.py b/ml_exp/__init__.py index 8e803b37e..f5ec7068b 100644 --- a/ml_exp/__init__.py +++ b/ml_exp/__init__.py @@ -21,9 +21,10 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ from ml_exp.compound import Compound -from ml_exp.representations import coulomb_matrix +from ml_exp.representations import coulomb_matrix, lennard_jones_matrix # If somebody does "from package import *", this is what they will # be able to access: __all__ = ['Compound', - 'coulomb_matrix'] + 'coulomb_matrix', + 'lennard_jones_matrix'] -- cgit v1.2.3-70-g09d2 From 4f95e38f1943856f9a0581c06ea603ffb6283d47 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Sat, 22 Feb 2020 16:02:12 -0700 Subject: Move adj matrix to representations --- ml_exp/adj_matrix.py | 131 ---------------------------------------------- ml_exp/representations.py | 108 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 131 deletions(-) delete mode 100644 ml_exp/adj_matrix.py diff --git a/ml_exp/adj_matrix.py b/ml_exp/adj_matrix.py deleted file mode 100644 index de484b5be..000000000 --- a/ml_exp/adj_matrix.py +++ /dev/null @@ -1,131 +0,0 @@ -"""MIT License - -Copyright (c) 2019 David Luevano Alvarado - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -from numpy import array, zeros, dot -from numpy.linalg import norm -from ml_exp.misc import printc - - -def fneig_matrix(atoms, - xyz, - nc=None, - use_forces=False): - """ - Creates the first neighbor matrix of the given molecule data. - atoms: list of atoms. - xyz: matrix of atomic coords. - nc: nuclear charge info. - use_forces: if the use of forces instead of k_cx should be used. - NOTE: Bond distance of carbon to other elements - are (for atoms present in the qm7 dataset): - C: 1.20 - 1.53 A - H: 1.06 - 1.12 A - O: 1.43 - 2.15 A - N: 1.47 - 2.10 A - S: 1.81 - 2.55 A - """ - if use_forces and nc is None: - printc(f'Error. use_forces={use_forces} but nc={nc}', 'RED') - return None - # Possible bonds. - cc_bond = sorted(['C', 'C']) - ch_bond = sorted(['C', 'H']) - co_bond = sorted(['C', 'O']) - cn_bond = sorted(['C', 'N']) - cs_bond = sorted(['C', 'S']) - - # Number of atoms, empty matrix and bond list. - n = len(atoms) - fnm = array(zeros((n, n)), dtype=float) - bonds = [] - forces = [] - for i, xyz_i in enumerate(xyz): - for j, xyz_j in enumerate(xyz): - # Ignore the diagonal. - if i != j: - bond = sorted([atoms[i], atoms[j]]) - rv = xyz_i - xyz_j - r = norm(rv) - # Check for each type of bond. - if (cc_bond == bond) and (r >= 1.20 and r <= 1.53): - fnm[i, j] = 1.0 - if j > i: - bonds.append((i, j)) - if use_forces: - forces.append(rv*nc[i]*nc[j]/r**3) - elif (ch_bond == bond) and (r >= 1.06 and r <= 1.12): - fnm[i, j] = 1.0 - if j > i: - bonds.append((i, j)) - if use_forces: - forces.append(rv*nc[i]*nc[j]/r**3) - elif (co_bond == bond) and (r >= 1.43 and r <= 2.15): - fnm[i, j] = 0.8 - if j > i: - bonds.append((i, j)) - if use_forces: - forces.append(rv*nc[i]*nc[j]/r**3) - elif (cn_bond == bond) and (r >= 1.47 and r <= 2.10): - fnm[i, j] = 1.0 - if j > i: - bonds.append((i, j)) - if use_forces: - forces.append(rv*nc[i]*nc[j]/r**3) - elif (cs_bond == bond) and (r >= 1.81 and r <= 2.55): - fnm[i, j] = 0.7 - if j > i: - bonds.append((i, j)) - if use_forces: - forces.append(rv*nc[i]*nc[j]/r**3) - if use_forces: - return fnm, bonds, forces - return fnm, bonds - - -def adj_matrix(fneig_matrix, - bonds, - forces=None, - max_len=25): - """ - Calculates the adjacency matrix given the bond list. - bonds: list of bonds (tuple of indexes). - max_len: maximum amount of atoms in molecule. - forces: list of forces. - """ - n = len(bonds) - - if max_len < n: - print(''.join(['Error. Molecule matrix dimension (mol_n) is ', - 'greater than max_len. Using mol_n.'])) - max_len = n - - am = array(zeros((max_len, max_len))) - for i, bond_i in enumerate(bonds): - for j, bond_j in enumerate(bonds): - # Ignore the diagonal. - if i != j: - if (bond_i[0] in bond_j) or (bond_i[1] in bond_j): - if forces: - am[i, j] = dot(forces[i], forces[j]) - else: - am[i, j] = fneig_matrix[bond_i[0], bond_i[1]] - return am diff --git a/ml_exp/representations.py b/ml_exp/representations.py index b4edef8a5..f503b5987 100644 --- a/ml_exp/representations.py +++ b/ml_exp/representations.py @@ -204,3 +204,111 @@ def lennard_jones_matrix(coords, return lj_sorted else: return lj + + +def fneig_matrix(atoms, + xyz, + nc=None, + use_forces=False): + """ + Creates the first neighbor matrix of the given molecule data. + atoms: list of atoms. + xyz: matrix of atomic coords. + nc: nuclear charge info. + use_forces: if the use of forces instead of k_cx should be used. + NOTE: Bond distance of carbon to other elements + are (for atoms present in the qm7 dataset): + C: 1.20 - 1.53 A + H: 1.06 - 1.12 A + O: 1.43 - 2.15 A + N: 1.47 - 2.10 A + S: 1.81 - 2.55 A + """ + if use_forces and nc is None: + # print(f'Error. use_forces={use_forces} but nc={nc}', 'RED') + return None + # Possible bonds. + cc_bond = sorted(['C', 'C']) + ch_bond = sorted(['C', 'H']) + co_bond = sorted(['C', 'O']) + cn_bond = sorted(['C', 'N']) + cs_bond = sorted(['C', 'S']) + + # Number of atoms, empty matrix and bond list. + n = len(atoms) + fnm = np.empty((n, n), dtype=float) + bonds = [] + forces = [] + for i, xyz_i in enumerate(xyz): + for j, xyz_j in enumerate(xyz): + + # Ignore the diagonal. + if i != j: + bond = sorted([atoms[i], atoms[j]]) + rv = xyz_i - xyz_j + r = np.linalg.norm(rv) + + # Check for each type of bond. + if (cc_bond == bond) and (r >= 1.20 and r <= 1.53): + fnm[i, j] = 1.0 + if j > i: + bonds.append((i, j)) + if use_forces: + forces.append(rv*nc[i]*nc[j]/r**3) + elif (ch_bond == bond) and (r >= 1.06 and r <= 1.12): + fnm[i, j] = 1.0 + if j > i: + bonds.append((i, j)) + if use_forces: + forces.append(rv*nc[i]*nc[j]/r**3) + elif (co_bond == bond) and (r >= 1.43 and r <= 2.15): + fnm[i, j] = 0.8 + if j > i: + bonds.append((i, j)) + if use_forces: + forces.append(rv*nc[i]*nc[j]/r**3) + elif (cn_bond == bond) and (r >= 1.47 and r <= 2.10): + fnm[i, j] = 1.0 + if j > i: + bonds.append((i, j)) + if use_forces: + forces.append(rv*nc[i]*nc[j]/r**3) + elif (cs_bond == bond) and (r >= 1.81 and r <= 2.55): + fnm[i, j] = 0.7 + if j > i: + bonds.append((i, j)) + if use_forces: + forces.append(rv*nc[i]*nc[j]/r**3) + if use_forces: + return fnm, bonds, forces + return fnm, bonds + + +def adj_matrix(fneig_matrix, + bonds, + forces=None, + max_len=25): + """ + Calculates the adjacency matrix given the bond list. + bonds: list of bonds (tuple of indexes). + max_len: maximum amount of atoms in molecule. + forces: list of forces. + """ + n = len(bonds) + + if max_len < n: + print(''.join(['Error. Molecule matrix dimension (mol_n) is ', + 'greater than max_len. Using mol_n.'])) + max_len = n + + am = np.empty((max_len, max_len), dtype=float) + for i, bond_i in enumerate(bonds): + for j, bond_j in enumerate(bonds): + # Ignore the diagonal. + if i != j: + if (bond_i[0] in bond_j) or (bond_i[1] in bond_j): + if forces: + am[i, j] = np.dot(forces[i], forces[j]) + else: + am[i, j] = fneig_matrix[bond_i[0], bond_i[1]] + return am -- cgit v1.2.3-70-g09d2 From 2572f9581b24547692a5654a1d452690a2cbcad6 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Sat, 22 Feb 2020 16:36:40 -0700 Subject: Add adj matrix to compound) --- ml_exp/compound.py | 35 ++++++++++++++++--- ml_exp/representations.py | 89 ++++++++++++++++++++++++++++------------------- 2 files changed, 83 insertions(+), 41 deletions(-) diff --git a/ml_exp/compound.py b/ml_exp/compound.py index c6d5f8a64..d4dc9b7c0 100644 --- a/ml_exp/compound.py +++ b/ml_exp/compound.py @@ -22,7 +22,8 @@ SOFTWARE. """ import numpy as np from ml_exp.data import NUCLEAR_CHARGE -from ml_exp.representations import coulomb_matrix, lennard_jones_matrix +from ml_exp.representations import coulomb_matrix, lennard_jones_matrix,\ + first_neighbor_matrix, adjacency_matrix class Compound: @@ -42,6 +43,7 @@ class Compound: self.cm = None self.ljm = None + self.am = None self.bob = None if xyz is not None: @@ -55,13 +57,13 @@ class Compound: Generate the Coulomb Matrix for the compund. size: compound size. as_eig: if the representation should be as the eigenvalues. - bhor_ru: if radius units should be in bohr's radius units. + bohr_ru: if radius units should be in bohr's radius units. """ self.cm = coulomb_matrix(self.coordinates, self.atoms_nc, size=size, as_eig=as_eig, - bhor_ru=bohr_ru) + bohr_ru=bohr_ru) def gen_ljm(self, diag_value=None, @@ -77,7 +79,7 @@ class Compound: epsilon: epsilon value. size: compound size. as_eig: if the representation should be as the eigenvalues. - bhor_ru: if radius units should be in bohr's radius units. + bohr_ru: if radius units should be in bohr's radius units. """ self.ljm = lennard_jones_matrix(self.coordinates, self.atoms_nc, @@ -86,7 +88,30 @@ class Compound: epsilon=epsilon, size=size, as_eig=as_eig, - bhor_ru=bohr_ru) + bohr_ru=bohr_ru) + + def gen_am(self, + use_forces=False, + size=23, + bohr_ru=False): + """ + Generate the Adjacency Matrix for the compund. + use_forces: if the use of forces instead of k_cx should be used. + size: compound size. + bohr_ru: if radius units should be in bohr's radius units. + """ + # First, generate the first neighor matrix. + fnm, bonds, forces = first_neighbor_matrix(self.coordinates, + self.atoms_nc, + self.atoms, + use_forces=use_forces, + bohr_ru=bohr_ru) + + # Now, generate the adjacency matrix. + self.am = adjacency_matrix(fnm, + bonds, + forces, + size=size) def read_xyz(self, filename): diff --git a/ml_exp/representations.py b/ml_exp/representations.py index f503b5987..483da2898 100644 --- a/ml_exp/representations.py +++ b/ml_exp/representations.py @@ -28,16 +28,16 @@ def coulomb_matrix(coords, nc, size=23, as_eig=True, - bhor_ru=False): + bohr_ru=False): """ Creates the Coulomb Matrix from the molecule data given. coords: compound coordinates. nc: nuclear charge data. size: compound size. as_eig: if the representation should be as the eigenvalues. - bhor_ru: if radius units should be in bohr's radius units. + bohr_ru: if radius units should be in bohr's radius units. """ - if bhor_ru: + if bohr_ru: cr = 0.52917721067 else: cr = 1 @@ -111,7 +111,7 @@ def lennard_jones_matrix(coords, epsilon=1.0, size=23, as_eig=True, - bhor_ru=False): + bohr_ru=False): """ Creates the Lennard-Jones Matrix from the molecule data given. coords: compound coordinates. @@ -121,9 +121,9 @@ def lennard_jones_matrix(coords, epsilon: epsilon value. size: compound size. as_eig: if the representation should be as the eigenvalues. - bhor_ru: if radius units should be in bohr's radius units. + bohr_ru: if radius units should be in bohr's radius units. """ - if bhor_ru: + if bohr_ru: cr = 0.52917721067 else: cr = 1 @@ -206,16 +206,18 @@ def lennard_jones_matrix(coords, return lj -def fneig_matrix(atoms, - xyz, - nc=None, - use_forces=False): +def first_neighbor_matrix(coords, + nc, + atoms, + use_forces=False, + bohr_ru=False): """ - Creates the first neighbor matrix of the given molecule data. + Creates the First Neighbor Matrix from the molecule data given. + coords: compound coordinates. + nc: nuclear charge data. atoms: list of atoms. - xyz: matrix of atomic coords. - nc: nuclear charge info. use_forces: if the use of forces instead of k_cx should be used. + bohr_ru: if radius units should be in bohr's radius units. NOTE: Bond distance of carbon to other elements are (for atoms present in the qm7 dataset): C: 1.20 - 1.53 A @@ -224,9 +226,17 @@ def fneig_matrix(atoms, N: 1.47 - 2.10 A S: 1.81 - 2.55 A """ - if use_forces and nc is None: - # print(f'Error. use_forces={use_forces} but nc={nc}', 'RED') - return None + if bohr_ru: + cr = 0.52917721067 + else: + cr = 1 + + n = coords.shape[0] + + if not n == nc.shape[0]: + raise ValueError('Compound size is different than the nuclear charge\ + size. Arrays are not of the right shape.') + # Possible bonds. cc_bond = sorted(['C', 'C']) ch_bond = sorted(['C', 'H']) @@ -234,19 +244,21 @@ def fneig_matrix(atoms, cn_bond = sorted(['C', 'N']) cs_bond = sorted(['C', 'S']) - # Number of atoms, empty matrix and bond list. - n = len(atoms) - fnm = np.empty((n, n), dtype=float) + if use_forces: + fnm = np.empty((n, n), dtype=float) + else: + fnm = np.empty((n, n), dtype=int) + bonds = [] forces = [] - for i, xyz_i in enumerate(xyz): - for j, xyz_j in enumerate(xyz): + for i, xyz_i in enumerate(coords): + for j, xyz_j in enumerate(coords): # Ignore the diagonal. if i != j: bond = sorted([atoms[i], atoms[j]]) rv = xyz_i - xyz_j - r = np.linalg.norm(rv) + r = np.linalg.norm(rv)/cr # Check for each type of bond. if (cc_bond == bond) and (r >= 1.20 and r <= 1.53): @@ -279,29 +291,33 @@ def fneig_matrix(atoms, bonds.append((i, j)) if use_forces: forces.append(rv*nc[i]*nc[j]/r**3) - if use_forces: - return fnm, bonds, forces - return fnm, bonds + + return fnm, bonds, forces -def adj_matrix(fneig_matrix, - bonds, - forces=None, - max_len=25): +def adjacency_matrix(fnm, + bonds, + forces, + size=23): """ Calculates the adjacency matrix given the bond list. + fnm: first neighbour matrix. bonds: list of bonds (tuple of indexes). - max_len: maximum amount of atoms in molecule. forces: list of forces. + size: compund size. """ n = len(bonds) - if max_len < n: - print(''.join(['Error. Molecule matrix dimension (mol_n) is ', - 'greater than max_len. Using mol_n.'])) - max_len = n + if size < n: + print('Error. Compound size (n) is greater han (size). Using (n)\ + instead of (size).') + size = n + + if forces: + am = np.empty((size, size), dtype=float) + else: + am = np.empty((size, size), dtype=int) - am = np.empty((max_len, max_len), dtype=float) for i, bond_i in enumerate(bonds): for j, bond_j in enumerate(bonds): # Ignore the diagonal. @@ -310,5 +326,6 @@ def adj_matrix(fneig_matrix, if forces: am[i, j] = np.dot(forces[i], forces[j]) else: - am[i, j] = fneig_matrix[bond_i[0], bond_i[1]] + am[i, j] = fnm[bond_i[0], bond_i[1]] + return am -- cgit v1.2.3-70-g09d2 From fb4307513ddda06742bbe4d542c2716c0d67c06d Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Sat, 22 Feb 2020 16:56:05 -0700 Subject: Refactor code --- ml_exp/representations.py | 93 +++++++++++++---------------------------------- 1 file changed, 26 insertions(+), 67 deletions(-) diff --git a/ml_exp/representations.py b/ml_exp/representations.py index 483da2898..354611190 100644 --- a/ml_exp/representations.py +++ b/ml_exp/representations.py @@ -20,7 +20,6 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import math import numpy as np @@ -53,36 +52,17 @@ def coulomb_matrix(coords, instead of (size).') size = n - nr = range(size) cm = np.zeros((size, size), dtype=float) # Actual calculation of the coulomb matrix. - for i in nr: - if i < n: - x_i = coords[i, 0] - y_i = coords[i, 1] - z_i = coords[i, 2] - Z_i = nc[i] - else: - break - - for j in nr: - if j < n: - x_j = coords[j, 0] - y_j = coords[j, 1] - z_j = coords[j, 2] - Z_j = nc[j] - - x = (x_i-x_j)**2 - y = (y_i-y_j)**2 - z = (z_i-z_j)**2 - - if i == j: - cm[i, j] = (0.5*Z_i**2.4) - else: - cm[i, j] = (cr*Z_i*Z_j/math.sqrt(x + y + z)) + for i, xyz_i in enumerate(coords): + for j, xyz_j in enumerate(coords): + rv = xyz_i - xyz_j + r = np.linalg.norm(rv)/cr + if i == j: + cm[i, j] = (0.5*nc[i]**2.4) else: - break + cm[i, j] = nc[i]*nc[j]/r # Now the value will be returned. if as_eig: @@ -139,52 +119,31 @@ def lennard_jones_matrix(coords, instead of (size).') size = n - nr = range(size) lj = np.zeros((size, size), dtype=float) # Actual calculation of the lennard-jones matrix. - for i in nr: - if i < n: - x_i = coords[i, 0] - y_i = coords[i, 1] - z_i = coords[i, 2] - Z_i = nc[i] - else: - break - - for j in nr: - if j < n: - x_j = coords[j, 0] - y_j = coords[j, 1] - z_j = coords[j, 2] - # Never really used because when needed it is equal to Z_i. - # Z_j = nc[j] - - x = (x_i-x_j)**2 - y = (y_i-y_j)**2 - z = (z_i-z_j)**2 - - if i == j: - if diag_value is None: - lj[i, j] = (0.5*Z_i**2.4) - else: - lj[i, j] = diag_value + for i, xyz_i in enumerate(coords): + for j, xyz_j in enumerate(coords): + if i == j: + if diag_value is None: + lj[i, j] = (0.5*nc[i]**2.4) else: - # Calculations are done after i==j is checked - # so no division by zero is done. - - # A little play with r exponents - # so no square root is calculated. - # Conversion factor is included in r^2. + lj[i, j] = diag_value + else: + # Calculations are done after i==j is checked + # so no division by zero is done. - # 1/r^2 - r_2 = sigma**2/(cr**2*(x + y + z)) + # A little play with r exponents + # so no square root is calculated. + # Conversion factor is included in r^2. + rv = xyz_i - xyz_j + r = sigma*np.linalg.norm(rv)/cr - r_6 = math.pow(r_2, 3) - r_12 = math.pow(r_6, 2) - lj[i, j] = (4*epsilon*(r_12 - r_6)) - else: - break + # 1/r^n + r_2 = 1/r**2 + r_6 = r_2**3 + r_12 = r_6**2 + lj[i, j] = (4*epsilon*(r_12 - r_6)) # Now the value will be returned. if as_eig: -- cgit v1.2.3-70-g09d2 From 8d0a22b9fdef41ece399468423cb533768d0e631 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Sat, 22 Feb 2020 17:12:32 -0700 Subject: Add tests --- ml_exp/__main__.py | 11 ++++++++++- ml_exp/representations.py | 4 ++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/ml_exp/__main__.py b/ml_exp/__main__.py index 3cb515d85..cb8a9a3f1 100644 --- a/ml_exp/__main__.py +++ b/ml_exp/__main__.py @@ -20,5 +20,14 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +from ml_exp.compound import Compound + + if __name__ == '__main__': - print('Ok!') + print('Initialize test') + test = [] + for i in range(1, 10): + test.append(Compound(f'/home/luevano/py/ml_exp/data/qm7/000{i}.xyz')) + test[i-1].gen_cm(size=1, as_eig=False) + test[i-1].gen_ljm(size=1, as_eig=False) + test[i-1].gen_am(size=1) diff --git a/ml_exp/representations.py b/ml_exp/representations.py index 354611190..ae870f3b0 100644 --- a/ml_exp/representations.py +++ b/ml_exp/representations.py @@ -273,9 +273,9 @@ def adjacency_matrix(fnm, size = n if forces: - am = np.empty((size, size), dtype=float) + am = np.zeros((size, size), dtype=float) else: - am = np.empty((size, size), dtype=int) + am = np.zeros((size, size), dtype=int) for i, bond_i in enumerate(bonds): for j, bond_j in enumerate(bonds): -- cgit v1.2.3-70-g09d2 From 85198178482767d2831ddf3762e5e6eeafa990bd Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Sat, 22 Feb 2020 18:37:50 -0700 Subject: Fix bug, current errors present on adj matrix --- ml_exp/__main__.py | 3 --- ml_exp/representations.py | 16 ++++++++-------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/ml_exp/__main__.py b/ml_exp/__main__.py index cb8a9a3f1..17119623e 100644 --- a/ml_exp/__main__.py +++ b/ml_exp/__main__.py @@ -28,6 +28,3 @@ if __name__ == '__main__': test = [] for i in range(1, 10): test.append(Compound(f'/home/luevano/py/ml_exp/data/qm7/000{i}.xyz')) - test[i-1].gen_cm(size=1, as_eig=False) - test[i-1].gen_ljm(size=1, as_eig=False) - test[i-1].gen_am(size=1) diff --git a/ml_exp/representations.py b/ml_exp/representations.py index ae870f3b0..796d15f98 100644 --- a/ml_exp/representations.py +++ b/ml_exp/representations.py @@ -48,8 +48,8 @@ def coulomb_matrix(coords, size. Arrays are not of the right shape.') if size < n: - print('Error. Compound size (n) is greater han (size). Using (n)\ - instead of (size).') + print('Error. Compound size (n) is greater han (size). Using (n)', + 'instead of (size).') size = n cm = np.zeros((size, size), dtype=float) @@ -115,8 +115,8 @@ def lennard_jones_matrix(coords, size. Arrays are not of the right shape.') if size < n: - print('Error. Compound size (n) is greater han (size). Using (n)\ - instead of (size).') + print('Error. Compound size (n) is greater han (size). Using (n)', + 'instead of (size).') size = n lj = np.zeros((size, size), dtype=float) @@ -137,10 +137,10 @@ def lennard_jones_matrix(coords, # so no square root is calculated. # Conversion factor is included in r^2. rv = xyz_i - xyz_j - r = sigma*np.linalg.norm(rv)/cr + r = np.linalg.norm(rv)/cr # 1/r^n - r_2 = 1/r**2 + r_2 = sigma**2/r**2 r_6 = r_2**3 r_12 = r_6**2 lj[i, j] = (4*epsilon*(r_12 - r_6)) @@ -204,9 +204,9 @@ def first_neighbor_matrix(coords, cs_bond = sorted(['C', 'S']) if use_forces: - fnm = np.empty((n, n), dtype=float) + fnm = np.zeros((n, n), dtype=float) else: - fnm = np.empty((n, n), dtype=int) + fnm = np.zeros((n, n), dtype=int) bonds = [] forces = [] -- cgit v1.2.3-70-g09d2 From db47ff8606b14e4cdddc0e317972c3515b9c788b Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Sun, 23 Feb 2020 21:03:39 -0700 Subject: Fix adj matrix issue with cc bonds) --- ml_exp/__main__.py | 11 ++++++++--- ml_exp/compound.py | 6 ++++-- ml_exp/representations.py | 29 ++++++++++++++--------------- 3 files changed, 26 insertions(+), 20 deletions(-) diff --git a/ml_exp/__main__.py b/ml_exp/__main__.py index 17119623e..124ce8cd9 100644 --- a/ml_exp/__main__.py +++ b/ml_exp/__main__.py @@ -25,6 +25,11 @@ from ml_exp.compound import Compound if __name__ == '__main__': print('Initialize test') - test = [] - for i in range(1, 10): - test.append(Compound(f'/home/luevano/py/ml_exp/data/qm7/000{i}.xyz')) + mol_name = '/home/luevano/py/ml_exp/data/qm7/0004.xyz' + mol = Compound(mol_name) + mol.gen_cm(size=1, as_eig=False) + mol.gen_ljm(size=1, as_eig=False) + mol.gen_am(size=1) + + print(mol.n, mol.atoms, mol.atoms_nc) + print(mol.am) diff --git a/ml_exp/compound.py b/ml_exp/compound.py index d4dc9b7c0..d499e6b83 100644 --- a/ml_exp/compound.py +++ b/ml_exp/compound.py @@ -39,7 +39,8 @@ class Compound: self.atoms = None self.atoms_nc = None self.coordinates = None - self.energy = None + self.pbe0 = None + self.delta = None self.cm = None self.ljm = None @@ -91,8 +92,8 @@ class Compound: bohr_ru=bohr_ru) def gen_am(self, - use_forces=False, size=23, + use_forces=False, bohr_ru=False): """ Generate the Adjacency Matrix for the compund. @@ -104,6 +105,7 @@ class Compound: fnm, bonds, forces = first_neighbor_matrix(self.coordinates, self.atoms_nc, self.atoms, + size=size, use_forces=use_forces, bohr_ru=bohr_ru) diff --git a/ml_exp/representations.py b/ml_exp/representations.py index 796d15f98..11c30dfd2 100644 --- a/ml_exp/representations.py +++ b/ml_exp/representations.py @@ -39,7 +39,7 @@ def coulomb_matrix(coords, if bohr_ru: cr = 0.52917721067 else: - cr = 1 + cr = 1.0 n = coords.shape[0] @@ -106,7 +106,7 @@ def lennard_jones_matrix(coords, if bohr_ru: cr = 0.52917721067 else: - cr = 1 + cr = 1.0 n = coords.shape[0] @@ -168,6 +168,7 @@ def lennard_jones_matrix(coords, def first_neighbor_matrix(coords, nc, atoms, + size=23, use_forces=False, bohr_ru=False): """ @@ -179,7 +180,7 @@ def first_neighbor_matrix(coords, bohr_ru: if radius units should be in bohr's radius units. NOTE: Bond distance of carbon to other elements are (for atoms present in the qm7 dataset): - C: 1.20 - 1.53 A + C: 1.20 - 1.54 A (Edited to 1.19 - 1.54 A) H: 1.06 - 1.12 A O: 1.43 - 2.15 A N: 1.47 - 2.10 A @@ -188,7 +189,7 @@ def first_neighbor_matrix(coords, if bohr_ru: cr = 0.52917721067 else: - cr = 1 + cr = 1.0 n = coords.shape[0] @@ -196,6 +197,11 @@ def first_neighbor_matrix(coords, raise ValueError('Compound size is different than the nuclear charge\ size. Arrays are not of the right shape.') + if size < n: + print('Error. Compound size (n) is greater han (size). Using (n)', + 'instead of (size).') + size = n + # Possible bonds. cc_bond = sorted(['C', 'C']) ch_bond = sorted(['C', 'H']) @@ -203,16 +209,12 @@ def first_neighbor_matrix(coords, cn_bond = sorted(['C', 'N']) cs_bond = sorted(['C', 'S']) - if use_forces: - fnm = np.zeros((n, n), dtype=float) - else: - fnm = np.zeros((n, n), dtype=int) + fnm = np.zeros((size, size), dtype=float) bonds = [] forces = [] for i, xyz_i in enumerate(coords): for j, xyz_j in enumerate(coords): - # Ignore the diagonal. if i != j: bond = sorted([atoms[i], atoms[j]]) @@ -220,7 +222,7 @@ def first_neighbor_matrix(coords, r = np.linalg.norm(rv)/cr # Check for each type of bond. - if (cc_bond == bond) and (r >= 1.20 and r <= 1.53): + if (cc_bond == bond) and (r >= 1.19 and r <= 1.54): fnm[i, j] = 1.0 if j > i: bonds.append((i, j)) @@ -257,7 +259,7 @@ def first_neighbor_matrix(coords, def adjacency_matrix(fnm, bonds, forces, - size=23): + size=22): """ Calculates the adjacency matrix given the bond list. fnm: first neighbour matrix. @@ -272,10 +274,7 @@ def adjacency_matrix(fnm, instead of (size).') size = n - if forces: - am = np.zeros((size, size), dtype=float) - else: - am = np.zeros((size, size), dtype=int) + am = np.zeros((size, size), dtype=float) for i, bond_i in enumerate(bonds): for j, bond_j in enumerate(bonds): -- cgit v1.2.3-70-g09d2 From 54e69de6126afb321c0ea575ede595347ffc121c Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Sun, 23 Feb 2020 21:07:54 -0700 Subject: Add fnm and am to init --- ml_exp/__init__.py | 7 +++++-- ml_exp/__main__.py | 12 ++---------- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/ml_exp/__init__.py b/ml_exp/__init__.py index f5ec7068b..d81f0999a 100644 --- a/ml_exp/__init__.py +++ b/ml_exp/__init__.py @@ -21,10 +21,13 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ from ml_exp.compound import Compound -from ml_exp.representations import coulomb_matrix, lennard_jones_matrix +from ml_exp.representations import coulomb_matrix, lennard_jones_matrix,\ + first_neighbor_matrix, adjacency_matrix # If somebody does "from package import *", this is what they will # be able to access: __all__ = ['Compound', 'coulomb_matrix', - 'lennard_jones_matrix'] + 'lennard_jones_matrix', + 'first_neighbor_matrix', + 'adjacency_matrix'] diff --git a/ml_exp/__main__.py b/ml_exp/__main__.py index 124ce8cd9..44b492103 100644 --- a/ml_exp/__main__.py +++ b/ml_exp/__main__.py @@ -20,16 +20,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -from ml_exp.compound import Compound +# from ml_exp.compound import Compound if __name__ == '__main__': - print('Initialize test') - mol_name = '/home/luevano/py/ml_exp/data/qm7/0004.xyz' - mol = Compound(mol_name) - mol.gen_cm(size=1, as_eig=False) - mol.gen_ljm(size=1, as_eig=False) - mol.gen_am(size=1) - - print(mol.n, mol.atoms, mol.atoms_nc) - print(mol.am) + print('OK!') -- cgit v1.2.3-70-g09d2 From 336654366a0e0a15263bd0d93bcb81ef48fcb040 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Sun, 23 Feb 2020 21:12:00 -0700 Subject: Remove bag of stuff --- ml_exp/bostuff.py | 99 ------------------------------------------------------- 1 file changed, 99 deletions(-) delete mode 100644 ml_exp/bostuff.py diff --git a/ml_exp/bostuff.py b/ml_exp/bostuff.py deleted file mode 100644 index 1741e26cd..000000000 --- a/ml_exp/bostuff.py +++ /dev/null @@ -1,99 +0,0 @@ -"""MIT License - -Copyright (c) 2019 David Luevano Alvarado - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -from numpy import array, zeros -from collections import Counter -from ml_exp.bob import check_bond - - -def bok_cx(adj_matrix, - atoms, - max_n=25, - max_bond_len=325): - """ - Creates the bag of k_cx using the adjacency matrix data. - adj_matrix: adjacency matrix. - max_n: maximum amount of atoms. - max_bond_len: maximum amount of bonds in molecule. - """ - n = len(atoms) - bond_n = (n * n - n) / 2 + n - n_r = range(n) - - if max_n < n: - print(''.join(['Error. Molecule matrix dimension (mol_n) is ', - 'greater than max_len. Using mol_n.'])) - max_n = n - - if max_bond_len < bond_n: - print(''.join(['Error. Molecule bond length (bond_n) is ', - 'greater than max_bond_len. Using bond_n.'])) - max_bond_len = bond_n - - # List where each bag data is stored. - bags = [] - for i in n_r: - for j in n_r: - # Work only in the upper triangle of the coulomb matrix. - if j >= i: - # Get the string of the current bond. - if i == j: - current_bond = atoms[i] - else: - current_bond = ''.join(sorted([atoms[i], atoms[j]])) - - # Check if that bond is already in a bag. - checker = check_bond(bags, current_bond) - # Either create a new bag or add values to an existing one. - if not checker[0]: - bags.append([current_bond, adj_matrix[i, j]]) - else: - bags[checker[1]].append(adj_matrix[i, j]) - - # Create the actual bond list ordered. - atom_counter = Counter(atoms) - atom_list = sorted(list(set(atoms))) - bonds = [] - for i, a_i in enumerate(atom_list): - if atom_counter[a_i] > 1: - for a_j in atom_list[i:]: - bonds.append(''.join(sorted([a_i, a_j]))) - bonds = atom_list + bonds - - # Create the final vector for the bok_cx. - bok_cx = array(zeros(max_bond_len), dtype=float) - c_i = 0 - for i, bond in enumerate(bonds): - checker = check_bond(bags, bond) - if checker[0]: - for j, num in enumerate(sorted(bags[checker[1]][1:])[::-1]): - # Use c_i as the index for bok_cx if the zero padding should - # be at the end of the vector instead of between each bond. - bok_cx[i*max_n + j] = num - c_i += 1 - # This is set to false because this was a debugging measure. - else: - print(''.join([f'Error. Bond {bond} from bond list coudn\'t', - ' be found in the bags list. This could be', - ' a case where the atom is only present once', - ' in the molecule.'])) - return bok_cx -- cgit v1.2.3-70-g09d2 From 321681e542509869568e9ed610d821c1d9d9d5e6 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Sun, 23 Feb 2020 21:16:16 -0700 Subject: Move bob to representations --- ml_exp/bob.py | 117 ---------------------------------------------- ml_exp/representations.py | 94 +++++++++++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+), 117 deletions(-) delete mode 100644 ml_exp/bob.py diff --git a/ml_exp/bob.py b/ml_exp/bob.py deleted file mode 100644 index 86efecdb4..000000000 --- a/ml_exp/bob.py +++ /dev/null @@ -1,117 +0,0 @@ -"""MIT License - -Copyright (c) 2019 David Luevano Alvarado - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -from numpy import array, zeros -from collections import Counter - - -def check_bond(bags, - bond): - """ - Checks if a bond is in a bag. - bags: list of bags, containing a bag per entry, which in turn - contains a list of bond-values. - bond: bond to check. - """ - if bags == []: - return False, None - - for i, bag in enumerate(bags): - if bag[0] == bond: - return True, i - - return False, None - - -def bob(c_matrix, - atoms, - max_n=25, - max_bond_len=325): - """ - Creates the bag of bond using the coulomb matrix data. - c_matrix: coulomb matrix. - atoms: list of atoms. - max_n: maximum amount of atoms. - max_bond_len: maximum amount of bonds in molecule. - """ - n = len(atoms) - bond_n = (n * n - n) / 2 + n - n_r = range(n) - - if max_n < n: - print(''.join(['Error. Molecule matrix dimension (mol_n) is ', - 'greater than max_len. Using mol_n.'])) - max_n = n - - if max_bond_len < bond_n: - print(''.join(['Error. Molecule bond lenght (bond_n) is ', - 'greater than max_bond_len. Using bond_n.'])) - max_bond_len = bond_n - - # List where each bag data is stored. - bags = [] - for i in n_r: - for j in n_r: - # Work only in the upper triangle of the coulomb matrix. - if j >= i: - # Get the string of the current bond. - if i == j: - current_bond = atoms[i] - else: - current_bond = ''.join(sorted([atoms[i], atoms[j]])) - - # Check if that bond is already in a bag. - checker = check_bond(bags, current_bond) - # Either create a new bag or add values to an existing one. - if not checker[0]: - bags.append([current_bond, c_matrix[i, j]]) - else: - bags[checker[1]].append(c_matrix[i, j]) - - # Create the actual bond list ordered. - atom_counter = Counter(atoms) - atom_list = sorted(list(set(atoms))) - bonds = [] - for i, a_i in enumerate(atom_list): - if atom_counter[a_i] > 1: - for a_j in atom_list[i:]: - bonds.append(''.join(sorted([a_i, a_j]))) - bonds = atom_list + bonds - - # Create the final vector for the bob. - bob = array(zeros(max_bond_len), dtype=float) - c_i = 0 - for i, bond in enumerate(bonds): - checker = check_bond(bags, bond) - if checker[0]: - for j, num in enumerate(sorted(bags[checker[1]][1:])[::-1]): - # Use c_i as the index for bob if the zero padding should - # be at the end of the vector instead of between each bond. - bob[i*max_n + j] = num - c_i += 1 - # This is set to false because this was a debugging measure. - else: - print(''.join([f'Error. Bond {bond} from bond list coudn\'t', - ' be found in the bags list. This could be', - ' a case where the atom is only present once', - ' in the molecule.'])) - return bob diff --git a/ml_exp/representations.py b/ml_exp/representations.py index 11c30dfd2..3119a4a88 100644 --- a/ml_exp/representations.py +++ b/ml_exp/representations.py @@ -21,6 +21,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import numpy as np +from collections import Counter def coulomb_matrix(coords, @@ -287,3 +288,96 @@ def adjacency_matrix(fnm, am[i, j] = fnm[bond_i[0], bond_i[1]] return am + + +def check_bond(bags, + bond): + """ + Checks if a bond is in a bag. + bags: list of bags, containing a bag per entry, which in turn + contains a list of bond-values. + bond: bond to check. + """ + if bags == []: + return False, None + + for i, bag in enumerate(bags): + if bag[0] == bond: + return True, i + + return False, None + + +def bob(c_matrix, + atoms, + max_n=25, + max_bond_len=325): + """ + Creates the bag of bond using the coulomb matrix data. + c_matrix: coulomb matrix. + atoms: list of atoms. + max_n: maximum amount of atoms. + max_bond_len: maximum amount of bonds in molecule. + """ + n = len(atoms) + bond_n = (n * n - n) / 2 + n + n_r = range(n) + + if max_n < n: + print(''.join(['Error. Molecule matrix dimension (mol_n) is ', + 'greater than max_len. Using mol_n.'])) + max_n = n + + if max_bond_len < bond_n: + print(''.join(['Error. Molecule bond lenght (bond_n) is ', + 'greater than max_bond_len. Using bond_n.'])) + max_bond_len = bond_n + + # List where each bag data is stored. + bags = [] + for i in n_r: + for j in n_r: + # Work only in the upper triangle of the coulomb matrix. + if j >= i: + # Get the string of the current bond. + if i == j: + current_bond = atoms[i] + else: + current_bond = ''.join(sorted([atoms[i], atoms[j]])) + + # Check if that bond is already in a bag. + checker = check_bond(bags, current_bond) + # Either create a new bag or add values to an existing one. + if not checker[0]: + bags.append([current_bond, c_matrix[i, j]]) + else: + bags[checker[1]].append(c_matrix[i, j]) + + # Create the actual bond list ordered. + atom_counter = Counter(atoms) + atom_list = sorted(list(set(atoms))) + bonds = [] + for i, a_i in enumerate(atom_list): + if atom_counter[a_i] > 1: + for a_j in atom_list[i:]: + bonds.append(''.join(sorted([a_i, a_j]))) + bonds = atom_list + bonds + + # Create the final vector for the bob. + bob = array(zeros(max_bond_len), dtype=float) + c_i = 0 + for i, bond in enumerate(bonds): + checker = check_bond(bags, bond) + if checker[0]: + for j, num in enumerate(sorted(bags[checker[1]][1:])[::-1]): + # Use c_i as the index for bob if the zero padding should + # be at the end of the vector instead of between each bond. + bob[i*max_n + j] = num + c_i += 1 + # This is set to false because this was a debugging measure. + else: + print(''.join([f'Error. Bond {bond} from bond list coudn\'t', + ' be found in the bags list. This could be', + ' a case where the atom is only present once', + ' in the molecule.'])) + return bob -- cgit v1.2.3-70-g09d2 From 2c8a05c20fbfec9d6c34a83958b694a7611b6cf1 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Sun, 23 Feb 2020 21:45:06 -0700 Subject: Refactor bag of sutff in representations --- ml_exp/compound.py | 2 +- ml_exp/representations.py | 73 +++++++++++++++++++++++------------------------ 2 files changed, 36 insertions(+), 39 deletions(-) diff --git a/ml_exp/compound.py b/ml_exp/compound.py index d499e6b83..595078d55 100644 --- a/ml_exp/compound.py +++ b/ml_exp/compound.py @@ -45,7 +45,7 @@ class Compound: self.cm = None self.ljm = None self.am = None - self.bob = None + self.bos = None if xyz is not None: self.read_xyz(xyz) diff --git a/ml_exp/representations.py b/ml_exp/representations.py index 3119a4a88..a85fee11f 100644 --- a/ml_exp/representations.py +++ b/ml_exp/representations.py @@ -45,8 +45,8 @@ def coulomb_matrix(coords, n = coords.shape[0] if not n == nc.shape[0]: - raise ValueError('Compound size is different than the nuclear charge\ - size. Arrays are not of the right shape.') + raise ValueError('Compound size is different than the nuclear charge \ +size. Arrays are not of the right shape.') if size < n: print('Error. Compound size (n) is greater han (size). Using (n)', @@ -112,8 +112,8 @@ def lennard_jones_matrix(coords, n = coords.shape[0] if not n == nc.shape[0]: - raise ValueError('Compound size is different than the nuclear charge\ - size. Arrays are not of the right shape.') + raise ValueError('Compound size is different than the nuclear charge \ +size. Arrays are not of the right shape.') if size < n: print('Error. Compound size (n) is greater han (size). Using (n)', @@ -195,8 +195,8 @@ def first_neighbor_matrix(coords, n = coords.shape[0] if not n == nc.shape[0]: - raise ValueError('Compound size is different than the nuclear charge\ - size. Arrays are not of the right shape.') + raise ValueError('Compound size is different than the nuclear charge \ +size. Arrays are not of the right shape.') if size < n: print('Error. Compound size (n) is greater han (size). Using (n)', @@ -295,7 +295,7 @@ def check_bond(bags, """ Checks if a bond is in a bag. bags: list of bags, containing a bag per entry, which in turn - contains a list of bond-values. + contains a list of bond-values. bond: bond to check. """ if bags == []: @@ -308,50 +308,49 @@ def check_bond(bags, return False, None -def bob(c_matrix, - atoms, - max_n=25, - max_bond_len=325): +def bag_of(cm, + atoms, + stuff='bonds', + size=23): """ - Creates the bag of bond using the coulomb matrix data. - c_matrix: coulomb matrix. + Creates the Bag of Bonds using the Coulomb Matrix. + cm: coulomb matrix. atoms: list of atoms. - max_n: maximum amount of atoms. - max_bond_len: maximum amount of bonds in molecule. + size: maximum amount of atoms. """ + if cm is None: + raise ValueError('Coulomb Matrix hasn\'t been initialized for the \ +current compound.') + n = len(atoms) - bond_n = (n * n - n) / 2 + n - n_r = range(n) - if max_n < n: - print(''.join(['Error. Molecule matrix dimension (mol_n) is ', - 'greater than max_len. Using mol_n.'])) - max_n = n + if size < n: + print('Error. Compound size (n) is greater han (size). Using (n)', + 'instead of (size).') + size = n - if max_bond_len < bond_n: - print(''.join(['Error. Molecule bond lenght (bond_n) is ', - 'greater than max_bond_len. Using bond_n.'])) - max_bond_len = bond_n + # Bond max length, calculated using only the upper triangular matrix. + bond_size = (size * size - size)/2 + size # List where each bag data is stored. bags = [] - for i in n_r: - for j in n_r: + for i, atom_i in enumerate(atoms): + for j, atom_j in enumerate(atoms): # Work only in the upper triangle of the coulomb matrix. if j >= i: # Get the string of the current bond. if i == j: - current_bond = atoms[i] + current_bond = atom_i else: - current_bond = ''.join(sorted([atoms[i], atoms[j]])) + current_bond = ''.join(sorted([atom_i, atom_j])) # Check if that bond is already in a bag. checker = check_bond(bags, current_bond) # Either create a new bag or add values to an existing one. if not checker[0]: - bags.append([current_bond, c_matrix[i, j]]) + bags.append([current_bond, cm[i, j]]) else: - bags[checker[1]].append(c_matrix[i, j]) + bags[checker[1]].append(cm[i, j]) # Create the actual bond list ordered. atom_counter = Counter(atoms) @@ -364,7 +363,7 @@ def bob(c_matrix, bonds = atom_list + bonds # Create the final vector for the bob. - bob = array(zeros(max_bond_len), dtype=float) + bob = np.zeros(bond_size, dtype=float) c_i = 0 for i, bond in enumerate(bonds): checker = check_bond(bags, bond) @@ -372,12 +371,10 @@ def bob(c_matrix, for j, num in enumerate(sorted(bags[checker[1]][1:])[::-1]): # Use c_i as the index for bob if the zero padding should # be at the end of the vector instead of between each bond. - bob[i*max_n + j] = num + bob[i*size + j] = num c_i += 1 - # This is set to false because this was a debugging measure. else: - print(''.join([f'Error. Bond {bond} from bond list coudn\'t', - ' be found in the bags list. This could be', - ' a case where the atom is only present once', - ' in the molecule.'])) + print(f'Error. Bond {bond} from bond list coudn\'t be found', + 'in the bags list. This could be a case where the atom', + 'is only present oncce in the molecule.') return bob -- cgit v1.2.3-70-g09d2 From b82b90ec609ee80629f1e7f4b8a03cbbb53a0f21 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Sun, 23 Feb 2020 21:45:45 -0700 Subject: Edit bag of stuff function name --- ml_exp/representations.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ml_exp/representations.py b/ml_exp/representations.py index a85fee11f..1fe55aa5f 100644 --- a/ml_exp/representations.py +++ b/ml_exp/representations.py @@ -308,15 +308,15 @@ def check_bond(bags, return False, None -def bag_of(cm, - atoms, - stuff='bonds', - size=23): +def bag_of_stuff(cm, + atoms, + stuff='bonds', + size=23): """ Creates the Bag of Bonds using the Coulomb Matrix. cm: coulomb matrix. atoms: list of atoms. - size: maximum amount of atoms. + size: compound size. """ if cm is None: raise ValueError('Coulomb Matrix hasn\'t been initialized for the \ -- cgit v1.2.3-70-g09d2 From b300e365d0886695b0d0220aad1be6cb38cf3c36 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Sun, 23 Feb 2020 22:06:30 -0700 Subject: Add bos to compound, and bugfix to bos --- ml_exp/__init__.py | 6 ++++-- ml_exp/compound.py | 15 ++++++++++++++- ml_exp/representations.py | 20 ++++++++++++-------- 3 files changed, 30 insertions(+), 11 deletions(-) diff --git a/ml_exp/__init__.py b/ml_exp/__init__.py index d81f0999a..dcb10a1df 100644 --- a/ml_exp/__init__.py +++ b/ml_exp/__init__.py @@ -22,7 +22,7 @@ SOFTWARE. """ from ml_exp.compound import Compound from ml_exp.representations import coulomb_matrix, lennard_jones_matrix,\ - first_neighbor_matrix, adjacency_matrix + first_neighbor_matrix, adjacency_matrix, check_bond, bag_of_stuff # If somebody does "from package import *", this is what they will # be able to access: @@ -30,4 +30,6 @@ __all__ = ['Compound', 'coulomb_matrix', 'lennard_jones_matrix', 'first_neighbor_matrix', - 'adjacency_matrix'] + 'adjacency_matrix', + 'check_bond', + 'bag_of_stuff'] diff --git a/ml_exp/compound.py b/ml_exp/compound.py index 595078d55..8b6af0ae9 100644 --- a/ml_exp/compound.py +++ b/ml_exp/compound.py @@ -23,7 +23,7 @@ SOFTWARE. import numpy as np from ml_exp.data import NUCLEAR_CHARGE from ml_exp.representations import coulomb_matrix, lennard_jones_matrix,\ - first_neighbor_matrix, adjacency_matrix + first_neighbor_matrix, adjacency_matrix, bag_of_stuff class Compound: @@ -115,6 +115,19 @@ class Compound: forces, size=size) + def gen_bos(self, + size=23, + stuff='bonds'): + """ + Generate the Bag of Stuff for the compound. + size: compound size. + stuff: elements of the bag, by default the known bag of bonds. + """ + self.bos = bag_of_stuff(self.cm, + self.atoms, + size=size, + stuff=stuff) + def read_xyz(self, filename): """ diff --git a/ml_exp/representations.py b/ml_exp/representations.py index 1fe55aa5f..ea079595e 100644 --- a/ml_exp/representations.py +++ b/ml_exp/representations.py @@ -310,8 +310,8 @@ def check_bond(bags, def bag_of_stuff(cm, atoms, - stuff='bonds', - size=23): + size=23, + stuff='bonds'): """ Creates the Bag of Bonds using the Coulomb Matrix. cm: coulomb matrix. @@ -322,6 +322,10 @@ def bag_of_stuff(cm, raise ValueError('Coulomb Matrix hasn\'t been initialized for the \ current compound.') + if cm.ndim == 1: + raise ValueError('Coulomb Matrix (CM) dimension is 1. Maybe it was \ +generated as the vector of eigenvalues, try (re-)generating the CM.') + n = len(atoms) if size < n: @@ -330,7 +334,7 @@ current compound.') size = n # Bond max length, calculated using only the upper triangular matrix. - bond_size = (size * size - size)/2 + size + bond_size = int((size * size - size)/2 + size) # List where each bag data is stored. bags = [] @@ -362,19 +366,19 @@ current compound.') bonds.append(''.join(sorted([a_i, a_j]))) bonds = atom_list + bonds - # Create the final vector for the bob. - bob = np.zeros(bond_size, dtype=float) + # Create the final vector for the bos. + bos = np.zeros(bond_size, dtype=float) c_i = 0 for i, bond in enumerate(bonds): checker = check_bond(bags, bond) if checker[0]: for j, num in enumerate(sorted(bags[checker[1]][1:])[::-1]): - # Use c_i as the index for bob if the zero padding should + # Use c_i as the index for bos if the zero padding should # be at the end of the vector instead of between each bond. - bob[i*size + j] = num + bos[i*size + j] = num c_i += 1 else: print(f'Error. Bond {bond} from bond list coudn\'t be found', 'in the bags list. This could be a case where the atom', 'is only present oncce in the molecule.') - return bob + return bos -- cgit v1.2.3-70-g09d2 From adbc889949b8399353ab166b5d9c15734f1f0bb8 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Tue, 25 Feb 2020 20:41:09 -0700 Subject: Move frob norm and cholesky solve to math --- ml_exp/cholesky_solve.py | 64 --------------------------------- ml_exp/frob_norm.py | 51 --------------------------- ml_exp/gauss_kernel.py | 49 -------------------------- ml_exp/kernels.py | 49 ++++++++++++++++++++++++++ ml_exp/math.py | 92 ++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 141 insertions(+), 164 deletions(-) delete mode 100644 ml_exp/cholesky_solve.py delete mode 100644 ml_exp/frob_norm.py delete mode 100644 ml_exp/gauss_kernel.py create mode 100644 ml_exp/kernels.py create mode 100644 ml_exp/math.py diff --git a/ml_exp/cholesky_solve.py b/ml_exp/cholesky_solve.py deleted file mode 100644 index bc6a572a3..000000000 --- a/ml_exp/cholesky_solve.py +++ /dev/null @@ -1,64 +0,0 @@ -"""MIT License - -Copyright (c) 2019 David Luevano Alvarado - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -import numpy as np -from numpy.linalg import cholesky - - -def cholesky_solve(K, y): - """ - Applies Cholesky decomposition to obtain the 'alpha coeficients'. - K: kernel. - y: known parameters. - """ - # The initial mathematical problem is to solve Ka=y. - - # First, add a small lambda value. - K[np.diag_indices_from(K)] += 1e-8 - - # Get the Cholesky decomposition of the kernel. - L = cholesky(K) - size = len(L) - - # Solve Lx=y for x. - x = np.zeros(size) - x[0] = y[0] / L[0, 0] - for i in range(1, size): - temp_sum = 0.0 - for j in range(i): - temp_sum += L[i, j] * x[j] - x[i] = (y[i] - temp_sum) / L[i, i] - - # Now, solve LTa=x for a. - L2 = L.T - a = np.zeros(size) - a_ms = size - 1 - a[a_ms] = x[a_ms] / L2[a_ms, a_ms] - # Because of the form of L2 (upper triangular matriz), an inversion of - # range() needs to be done. - for i in range(0, a_ms)[::-1]: - temp_sum = 0.0 - for j in range(i, size)[::-1]: - temp_sum += L2[i, j] * a[j] - a[i] = (x[i] - temp_sum) / L2[i, i] - - return a diff --git a/ml_exp/frob_norm.py b/ml_exp/frob_norm.py deleted file mode 100644 index 4c3a2945d..000000000 --- a/ml_exp/frob_norm.py +++ /dev/null @@ -1,51 +0,0 @@ -"""MIT License - -Copyright (c) 2019 David Luevano Alvarado - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -import math - - -def frob_norm(array): - """ - Calculates the frobenius norm of a given array or matrix. - array: array of data. - """ - - arr_sh_len = len(array.shape) - arr_range = range(len(array)) - fn = 0.0 - - # If it is a 'vector'. - if arr_sh_len == 1: - for i in arr_range: - fn += array[i]*array[i] - - return math.sqrt(fn) - - # If it is a matrix. - elif arr_sh_len == 2: - for i in arr_range: - for j in arr_range: - fn += array[i, j]*array[i, j] - - return math.sqrt(fn) - else: - print('Error. Array size greater than 2 ({}).'.format(arr_sh_len)) diff --git a/ml_exp/gauss_kernel.py b/ml_exp/gauss_kernel.py deleted file mode 100644 index 834d62408..000000000 --- a/ml_exp/gauss_kernel.py +++ /dev/null @@ -1,49 +0,0 @@ -"""MIT License - -Copyright (c) 2019 David Luevano Alvarado - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -import math -import numpy as np -from ml_exp.frob_norm import frob_norm - - -def gauss_kernel(X_1, X_2, sigma): - """ - Calculates the Gaussian Kernel. - X_1: first representations. - X_2: second representations. - sigma: kernel width. - """ - x1_l = len(X_1) - x1_range = range(x1_l) - x2_l = len(X_2) - x2_range = range(x2_l) - - inv_sigma = -0.5 / (sigma*sigma) - - K = np.zeros((x1_l, x2_l)) - for i in x1_range: - for j in x2_range: - f_norm = frob_norm(X_1[i] - X_2[j]) - # print(f_norm) - K[i, j] = math.exp(inv_sigma * f_norm) - - return K diff --git a/ml_exp/kernels.py b/ml_exp/kernels.py new file mode 100644 index 000000000..834d62408 --- /dev/null +++ b/ml_exp/kernels.py @@ -0,0 +1,49 @@ +"""MIT License + +Copyright (c) 2019 David Luevano Alvarado + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +import math +import numpy as np +from ml_exp.frob_norm import frob_norm + + +def gauss_kernel(X_1, X_2, sigma): + """ + Calculates the Gaussian Kernel. + X_1: first representations. + X_2: second representations. + sigma: kernel width. + """ + x1_l = len(X_1) + x1_range = range(x1_l) + x2_l = len(X_2) + x2_range = range(x2_l) + + inv_sigma = -0.5 / (sigma*sigma) + + K = np.zeros((x1_l, x2_l)) + for i in x1_range: + for j in x2_range: + f_norm = frob_norm(X_1[i] - X_2[j]) + # print(f_norm) + K[i, j] = math.exp(inv_sigma * f_norm) + + return K diff --git a/ml_exp/math.py b/ml_exp/math.py new file mode 100644 index 000000000..781985118 --- /dev/null +++ b/ml_exp/math.py @@ -0,0 +1,92 @@ +"""MIT License + +Copyright (c) 2019 David Luevano Alvarado + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +import numpy as np +import math + + +def frob_norm(array): + """ + Calculates the frobenius norm of a given array or matrix. + array: array of data. + """ + + arr_sh_len = len(array.shape) + arr_range = range(len(array)) + fn = 0.0 + + # If it is a 'vector'. + if arr_sh_len == 1: + for i in arr_range: + fn += array[i]*array[i] + + return math.sqrt(fn) + + # If it is a matrix. + elif arr_sh_len == 2: + for i in arr_range: + for j in arr_range: + fn += array[i, j]*array[i, j] + + return math.sqrt(fn) + else: + print('Error. Array size greater than 2 ({}).'.format(arr_sh_len)) + + +def cholesky_solve(K, y): + """ + Applies Cholesky decomposition to obtain the 'alpha coeficients'. + K: kernel. + y: known parameters. + """ + # The initial mathematical problem is to solve Ka=y. + + # First, add a small lambda value. + K[np.diag_indices_from(K)] += 1e-8 + + # Get the Cholesky decomposition of the kernel. + L = np.linalg.cholesky(K) + size = len(L) + + # Solve Lx=y for x. + x = np.zeros(size) + x[0] = y[0] / L[0, 0] + for i in range(1, size): + temp_sum = 0.0 + for j in range(i): + temp_sum += L[i, j] * x[j] + x[i] = (y[i] - temp_sum) / L[i, i] + + # Now, solve LTa=x for a. + L2 = L.T + a = np.zeros(size) + a_ms = size - 1 + a[a_ms] = x[a_ms] / L2[a_ms, a_ms] + # Because of the form of L2 (upper triangular matriz), an inversion of + # range() needs to be done. + for i in range(0, a_ms)[::-1]: + temp_sum = 0.0 + for j in range(i, size)[::-1]: + temp_sum += L2[i, j] * a[j] + a[i] = (x[i] - temp_sum) / L2[i, i] + + return a -- cgit v1.2.3-70-g09d2 From 2bdd175a6eb1f6edd8689987fef1a29fc1043da1 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Tue, 25 Feb 2020 20:42:15 -0700 Subject: Remove frobnorm, already in numpy --- ml_exp/math.py | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/ml_exp/math.py b/ml_exp/math.py index 781985118..e7c8dcabc 100644 --- a/ml_exp/math.py +++ b/ml_exp/math.py @@ -21,35 +21,6 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import numpy as np -import math - - -def frob_norm(array): - """ - Calculates the frobenius norm of a given array or matrix. - array: array of data. - """ - - arr_sh_len = len(array.shape) - arr_range = range(len(array)) - fn = 0.0 - - # If it is a 'vector'. - if arr_sh_len == 1: - for i in arr_range: - fn += array[i]*array[i] - - return math.sqrt(fn) - - # If it is a matrix. - elif arr_sh_len == 2: - for i in arr_range: - for j in arr_range: - fn += array[i, j]*array[i, j] - - return math.sqrt(fn) - else: - print('Error. Array size greater than 2 ({}).'.format(arr_sh_len)) def cholesky_solve(K, y): -- cgit v1.2.3-70-g09d2 From d1b84b91f4b94b717de477e47b22946fd42be0db Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Tue, 25 Feb 2020 20:52:08 -0700 Subject: Refactor gauss kernel, more natural syntax --- ml_exp/__init__.py | 4 +++- ml_exp/kernels.py | 22 +++++++++------------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/ml_exp/__init__.py b/ml_exp/__init__.py index dcb10a1df..e9dd83eae 100644 --- a/ml_exp/__init__.py +++ b/ml_exp/__init__.py @@ -23,6 +23,7 @@ SOFTWARE. from ml_exp.compound import Compound from ml_exp.representations import coulomb_matrix, lennard_jones_matrix,\ first_neighbor_matrix, adjacency_matrix, check_bond, bag_of_stuff +from ml_exp.math import cholesky_solve # If somebody does "from package import *", this is what they will # be able to access: @@ -32,4 +33,5 @@ __all__ = ['Compound', 'first_neighbor_matrix', 'adjacency_matrix', 'check_bond', - 'bag_of_stuff'] + 'bag_of_stuff', + 'cholesky_solve'] diff --git a/ml_exp/kernels.py b/ml_exp/kernels.py index 834d62408..7a61a1e1e 100644 --- a/ml_exp/kernels.py +++ b/ml_exp/kernels.py @@ -22,27 +22,23 @@ SOFTWARE. """ import math import numpy as np -from ml_exp.frob_norm import frob_norm -def gauss_kernel(X_1, X_2, sigma): +def gaussian_kernel(X1, + X2, + sigma): """ Calculates the Gaussian Kernel. - X_1: first representations. - X_2: second representations. + X1: first representations. + X2: second representations. sigma: kernel width. """ - x1_l = len(X_1) - x1_range = range(x1_l) - x2_l = len(X_2) - x2_range = range(x2_l) - inv_sigma = -0.5 / (sigma*sigma) - K = np.zeros((x1_l, x2_l)) - for i in x1_range: - for j in x2_range: - f_norm = frob_norm(X_1[i] - X_2[j]) + K = np.zeros((X1.shape[0], X2.shape[0]), dtype=float) + for i in X1: + for j in X2: + f_norm = np.linalg.norm(i - j) # print(f_norm) K[i, j] = math.exp(inv_sigma * f_norm) -- cgit v1.2.3-70-g09d2 From 0b35ed647ec4d786b488082dd98bb91ca0f5cc05 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Tue, 25 Feb 2020 20:54:01 -0700 Subject: Placeholder --- ml_exp/__main__.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ml_exp/__main__.py b/ml_exp/__main__.py index 44b492103..0d9b87bad 100644 --- a/ml_exp/__main__.py +++ b/ml_exp/__main__.py @@ -20,8 +20,5 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -# from ml_exp.compound import Compound - - if __name__ == '__main__': - print('OK!') + print('Everything OK!\n\tThis is a placeholder.') -- cgit v1.2.3-70-g09d2 From 339954ffc95c29327d33e3e2697fea1cf8e9427d Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Tue, 25 Feb 2020 21:03:31 -0700 Subject: Remove unused functions --- ml_exp/read_qm7_data.py | 33 --------------------------------- 1 file changed, 33 deletions(-) diff --git a/ml_exp/read_qm7_data.py b/ml_exp/read_qm7_data.py index 666f9c3fc..3017ac429 100644 --- a/ml_exp/read_qm7_data.py +++ b/ml_exp/read_qm7_data.py @@ -25,39 +25,6 @@ import numpy as np import random -# 'periodic_table_of_elements.txt' retrieved from -# https://gist.github.com/GoodmanSciences/c2dd862cd38f21b0ad36b8f96b4bf1ee -def read_nc_data(data_path): - """ - Reads nuclear charge data from file and returns a dictionary. - data_path: path to the data directory. - """ - fname = 'periodic_table_of_elements.txt' - with open(''.join([data_path, '/', fname]), 'r') as infile: - temp_lines = infile.readlines() - - del temp_lines[0] - - lines = [] - for temp_line in temp_lines: - new_line = temp_line.split(sep=',') - lines.append(new_line) - - # Dictionary of nuclear charge. - return {line[2]: int(line[0]) for line in lines} - - -# For use in the rewriting of the code (from functional programming to object -# oriented programming-ish) -def print_nc(nc_data): - """ - Prints the nuclear charge data to the terminal. - nc_data: dict of nc. - """ - for key in nc_data.keys(): - print(f'\'{key}\': {nc_data[key]},') - - # 'hof_qm7.txt.txt' retrieved from # https://github.com/qmlcode/tutorial def read_db_data(zi_data, -- cgit v1.2.3-70-g09d2 From 3c8a53f42a4a90d3cc235fc4e4132556b9cd74fa Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Tue, 25 Feb 2020 21:06:10 -0700 Subject: Rename function --- ml_exp/qm7db.py | 106 ++++++++++++++++++++++++++++++++++++++++++++++++ ml_exp/read_qm7_data.py | 106 ------------------------------------------------ 2 files changed, 106 insertions(+), 106 deletions(-) create mode 100644 ml_exp/qm7db.py delete mode 100644 ml_exp/read_qm7_data.py diff --git a/ml_exp/qm7db.py b/ml_exp/qm7db.py new file mode 100644 index 000000000..e584bc123 --- /dev/null +++ b/ml_exp/qm7db.py @@ -0,0 +1,106 @@ +"""MIT License + +Copyright (c) 2019 David Luevano Alvarado + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +import os +import numpy as np +import random + + +# 'hof_qm7.txt.txt' retrieved from +# https://github.com/qmlcode/tutorial +def qm7db(zi_data, + data_path, + r_seed=111, + return_atoms=False): + """ + Reads molecule database and extracts + its contents as usable variables. + zi_data: dictionary containing nuclear charge data. + data_path: path to the data directory. + r_seed: random seed to use for the shuffling. + return_atoms: if atom list should be returned. + """ + os.chdir(data_path) + + fname = 'hof_qm7.txt' + with open(fname, 'r') as infile: + lines = infile.readlines() + + # Temporary energy dictionary. + energy_temp = dict() + + for line in lines: + xyz_data = line.split() + + xyz_name = xyz_data[0] + hof = float(xyz_data[1]) + dftb = float(xyz_data[2]) + # print(xyz_name, hof, dftb) + + energy_temp[xyz_name] = np.array([hof, hof - dftb]) + + # Use a random seed. + random.seed(r_seed) + + et_keys = list(energy_temp.keys()) + random.shuffle(et_keys) + + # Temporary energy dictionary, shuffled. + energy_temp_shuffled = dict() + for key in et_keys: + energy_temp_shuffled.update({key: energy_temp[key]}) + + mol_data = [] + mol_nc_data = [] + atoms = [] + # Actual reading of the xyz files. + for i, k in enumerate(energy_temp_shuffled.keys()): + with open(k, 'r') as xyz_file: + lines = xyz_file.readlines() + + len_lines = len(lines) + mol_temp_data = [] + mol_nc_temp_data = np.array(np.zeros(len_lines-2)) + atoms_temp = [] + for j, line in enumerate(lines[2:len_lines]): + line_list = line.split() + + atoms_temp.append(line_list[0]) + mol_nc_temp_data[j] = float(zi_data[line_list[0]]) + line_data = np.array(np.asarray(line_list[1:4], dtype=float)) + mol_temp_data.append(line_data) + + mol_data.append(mol_temp_data) + mol_nc_data.append(mol_nc_temp_data) + atoms.append(atoms_temp) + + # Convert everything to a numpy array. + molecules = np.array([np.array(mol) for mol in mol_data]) + nuclear_charge = np.array([nc_d for nc_d in mol_nc_data]) + energy_pbe0 = np.array([energy_temp_shuffled[k][0] + for k in energy_temp_shuffled.keys()]) + energy_delta = np.array([energy_temp_shuffled[k][1] + for k in energy_temp_shuffled.keys()]) + + if return_atoms: + return molecules, nuclear_charge, energy_pbe0, energy_delta, atoms + return molecules, nuclear_charge, energy_pbe0, energy_delta diff --git a/ml_exp/read_qm7_data.py b/ml_exp/read_qm7_data.py deleted file mode 100644 index 3017ac429..000000000 --- a/ml_exp/read_qm7_data.py +++ /dev/null @@ -1,106 +0,0 @@ -"""MIT License - -Copyright (c) 2019 David Luevano Alvarado - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -import os -import numpy as np -import random - - -# 'hof_qm7.txt.txt' retrieved from -# https://github.com/qmlcode/tutorial -def read_db_data(zi_data, - data_path, - r_seed=111, - return_atoms=False): - """ - Reads molecule database and extracts - its contents as usable variables. - zi_data: dictionary containing nuclear charge data. - data_path: path to the data directory. - r_seed: random seed to use for the shuffling. - return_atoms: if atom list should be returned. - """ - os.chdir(data_path) - - fname = 'hof_qm7.txt' - with open(fname, 'r') as infile: - lines = infile.readlines() - - # Temporary energy dictionary. - energy_temp = dict() - - for line in lines: - xyz_data = line.split() - - xyz_name = xyz_data[0] - hof = float(xyz_data[1]) - dftb = float(xyz_data[2]) - # print(xyz_name, hof, dftb) - - energy_temp[xyz_name] = np.array([hof, hof - dftb]) - - # Use a random seed. - random.seed(r_seed) - - et_keys = list(energy_temp.keys()) - random.shuffle(et_keys) - - # Temporary energy dictionary, shuffled. - energy_temp_shuffled = dict() - for key in et_keys: - energy_temp_shuffled.update({key: energy_temp[key]}) - - mol_data = [] - mol_nc_data = [] - atoms = [] - # Actual reading of the xyz files. - for i, k in enumerate(energy_temp_shuffled.keys()): - with open(k, 'r') as xyz_file: - lines = xyz_file.readlines() - - len_lines = len(lines) - mol_temp_data = [] - mol_nc_temp_data = np.array(np.zeros(len_lines-2)) - atoms_temp = [] - for j, line in enumerate(lines[2:len_lines]): - line_list = line.split() - - atoms_temp.append(line_list[0]) - mol_nc_temp_data[j] = float(zi_data[line_list[0]]) - line_data = np.array(np.asarray(line_list[1:4], dtype=float)) - mol_temp_data.append(line_data) - - mol_data.append(mol_temp_data) - mol_nc_data.append(mol_nc_temp_data) - atoms.append(atoms_temp) - - # Convert everything to a numpy array. - molecules = np.array([np.array(mol) for mol in mol_data]) - nuclear_charge = np.array([nc_d for nc_d in mol_nc_data]) - energy_pbe0 = np.array([energy_temp_shuffled[k][0] - for k in energy_temp_shuffled.keys()]) - energy_delta = np.array([energy_temp_shuffled[k][1] - for k in energy_temp_shuffled.keys()]) - - if return_atoms: - return molecules, nuclear_charge, energy_pbe0, energy_delta, atoms - return molecules, nuclear_charge, energy_pbe0, energy_delta -- cgit v1.2.3-70-g09d2 From 2185e43ffdbf70a06cb894549fc7ff09f9c90cc3 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Tue, 25 Feb 2020 21:09:49 -0700 Subject: Refactor code --- ml_exp/qm7db.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/ml_exp/qm7db.py b/ml_exp/qm7db.py index e584bc123..4a7beaed6 100644 --- a/ml_exp/qm7db.py +++ b/ml_exp/qm7db.py @@ -27,17 +27,14 @@ import random # 'hof_qm7.txt.txt' retrieved from # https://github.com/qmlcode/tutorial -def qm7db(zi_data, +def qm7db(nc, data_path, - r_seed=111, - return_atoms=False): + r_seed=111): """ - Reads molecule database and extracts - its contents as usable variables. - zi_data: dictionary containing nuclear charge data. + Creates a list of compounds with the qm7 database. + nc: dictionary containing nuclear charge data. data_path: path to the data directory. r_seed: random seed to use for the shuffling. - return_atoms: if atom list should be returned. """ os.chdir(data_path) @@ -85,7 +82,7 @@ def qm7db(zi_data, line_list = line.split() atoms_temp.append(line_list[0]) - mol_nc_temp_data[j] = float(zi_data[line_list[0]]) + mol_nc_temp_data[j] = float(nc[line_list[0]]) line_data = np.array(np.asarray(line_list[1:4], dtype=float)) mol_temp_data.append(line_data) @@ -101,6 +98,4 @@ def qm7db(zi_data, energy_delta = np.array([energy_temp_shuffled[k][1] for k in energy_temp_shuffled.keys()]) - if return_atoms: - return molecules, nuclear_charge, energy_pbe0, energy_delta, atoms - return molecules, nuclear_charge, energy_pbe0, energy_delta + return molecules, nuclear_charge, energy_pbe0, energy_delta, atoms -- cgit v1.2.3-70-g09d2 From 39d7b86cfb578b491c8cbf99905e67f7f7937dc9 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Tue, 25 Feb 2020 21:13:33 -0700 Subject: Refactor code --- ml_exp/qm7db.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/ml_exp/qm7db.py b/ml_exp/qm7db.py index 4a7beaed6..1f1115ba0 100644 --- a/ml_exp/qm7db.py +++ b/ml_exp/qm7db.py @@ -20,7 +20,6 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import os import numpy as np import random @@ -28,19 +27,18 @@ import random # 'hof_qm7.txt.txt' retrieved from # https://github.com/qmlcode/tutorial def qm7db(nc, - data_path, + db_path='data', r_seed=111): """ Creates a list of compounds with the qm7 database. nc: dictionary containing nuclear charge data. - data_path: path to the data directory. + db_path: path to the database directory. r_seed: random seed to use for the shuffling. """ - os.chdir(data_path) - fname = 'hof_qm7.txt' - with open(fname, 'r') as infile: - lines = infile.readlines() + fname = f'{db_path}/hof_qm7.txt' + with open(fname, 'r') as f: + lines = f.readlines() # Temporary energy dictionary. energy_temp = dict() -- cgit v1.2.3-70-g09d2 From d064df5d045a5502f273814252debdb564df3ffc Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Wed, 26 Feb 2020 04:47:44 -0700 Subject: Refactor code --- ml_exp/math.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/ml_exp/math.py b/ml_exp/math.py index e7c8dcabc..da1fdf08f 100644 --- a/ml_exp/math.py +++ b/ml_exp/math.py @@ -23,23 +23,23 @@ SOFTWARE. import numpy as np -def cholesky_solve(K, y): +def cholesky_solve(K, + y): """ - Applies Cholesky decomposition to obtain the 'alpha coeficients'. + Applies Cholesky decomposition to solve Ka=y. Where a are the alpha + coeficients. K: kernel. y: known parameters. """ - # The initial mathematical problem is to solve Ka=y. - - # First, add a small lambda value. + # Add a small lambda value. K[np.diag_indices_from(K)] += 1e-8 # Get the Cholesky decomposition of the kernel. L = np.linalg.cholesky(K) - size = len(L) + size = K.shape[0] # Solve Lx=y for x. - x = np.zeros(size) + x = np.zeros(size, dtype=float) x[0] = y[0] / L[0, 0] for i in range(1, size): temp_sum = 0.0 @@ -49,12 +49,9 @@ def cholesky_solve(K, y): # Now, solve LTa=x for a. L2 = L.T - a = np.zeros(size) - a_ms = size - 1 - a[a_ms] = x[a_ms] / L2[a_ms, a_ms] - # Because of the form of L2 (upper triangular matriz), an inversion of - # range() needs to be done. - for i in range(0, a_ms)[::-1]: + a = np.zeros(size, dtype=float) + a[size - 1] = x[size - 1] / L2[size - 1, size - 1] + for i in range(0, size - 1)[::-1]: temp_sum = 0.0 for j in range(i, size)[::-1]: temp_sum += L2[i, j] * a[j] -- cgit v1.2.3-70-g09d2 From 5a85580b5f99484b957b65326ec5a19ab627d801 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Wed, 26 Feb 2020 05:26:21 -0700 Subject: Optimize qm7db reading for the rewrite --- ml_exp/__init__.py | 2 -- ml_exp/compound.py | 3 ++- ml_exp/qm7db.py | 68 +++++++++++------------------------------------------- 3 files changed, 16 insertions(+), 57 deletions(-) diff --git a/ml_exp/__init__.py b/ml_exp/__init__.py index e9dd83eae..d685e7ccc 100644 --- a/ml_exp/__init__.py +++ b/ml_exp/__init__.py @@ -25,8 +25,6 @@ from ml_exp.representations import coulomb_matrix, lennard_jones_matrix,\ first_neighbor_matrix, adjacency_matrix, check_bond, bag_of_stuff from ml_exp.math import cholesky_solve -# If somebody does "from package import *", this is what they will -# be able to access: __all__ = ['Compound', 'coulomb_matrix', 'lennard_jones_matrix', diff --git a/ml_exp/compound.py b/ml_exp/compound.py index 8b6af0ae9..0a8b89610 100644 --- a/ml_exp/compound.py +++ b/ml_exp/compound.py @@ -33,7 +33,7 @@ class Compound: Initialization of the Compound. xyz: (path to) the xyz file. """ - # empty_array = np.asarray([], dtype=float) + self.name = None self.n = None self.atoms = None @@ -137,6 +137,7 @@ class Compound: with open(filename, 'r') as f: lines = f.readlines() + self.name = filename.split('/')[-1] self.n = int(lines[0]) self.atoms = [] self.atoms_nc = np.empty(self.n, dtype=int) diff --git a/ml_exp/qm7db.py b/ml_exp/qm7db.py index 1f1115ba0..f9950c317 100644 --- a/ml_exp/qm7db.py +++ b/ml_exp/qm7db.py @@ -20,6 +20,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +from ml_exp.compound import Compound import numpy as np import random @@ -28,72 +29,31 @@ import random # https://github.com/qmlcode/tutorial def qm7db(nc, db_path='data', + is_shuffled=True, r_seed=111): """ Creates a list of compounds with the qm7 database. nc: dictionary containing nuclear charge data. db_path: path to the database directory. + is_shuffled: if the resulting list of compounds should be shuffled. r_seed: random seed to use for the shuffling. """ - fname = f'{db_path}/hof_qm7.txt' with open(fname, 'r') as f: lines = f.readlines() - # Temporary energy dictionary. - energy_temp = dict() - - for line in lines: - xyz_data = line.split() - - xyz_name = xyz_data[0] - hof = float(xyz_data[1]) - dftb = float(xyz_data[2]) - # print(xyz_name, hof, dftb) + compounds = [] + for i, line in enumerate(lines): + line = line.split() + compounds.append(Compound(f'{db_path}/{line[0]}')) + compounds[i].pbe0 = float(line[1]) + compounds[i].delta = float(line[1]) - float(line[2]) - energy_temp[xyz_name] = np.array([hof, hof - dftb]) - - # Use a random seed. + # Shuffle the compounds list random.seed(r_seed) + random.shuffle(compounds) - et_keys = list(energy_temp.keys()) - random.shuffle(et_keys) - - # Temporary energy dictionary, shuffled. - energy_temp_shuffled = dict() - for key in et_keys: - energy_temp_shuffled.update({key: energy_temp[key]}) - - mol_data = [] - mol_nc_data = [] - atoms = [] - # Actual reading of the xyz files. - for i, k in enumerate(energy_temp_shuffled.keys()): - with open(k, 'r') as xyz_file: - lines = xyz_file.readlines() - - len_lines = len(lines) - mol_temp_data = [] - mol_nc_temp_data = np.array(np.zeros(len_lines-2)) - atoms_temp = [] - for j, line in enumerate(lines[2:len_lines]): - line_list = line.split() - - atoms_temp.append(line_list[0]) - mol_nc_temp_data[j] = float(nc[line_list[0]]) - line_data = np.array(np.asarray(line_list[1:4], dtype=float)) - mol_temp_data.append(line_data) - - mol_data.append(mol_temp_data) - mol_nc_data.append(mol_nc_temp_data) - atoms.append(atoms_temp) - - # Convert everything to a numpy array. - molecules = np.array([np.array(mol) for mol in mol_data]) - nuclear_charge = np.array([nc_d for nc_d in mol_nc_data]) - energy_pbe0 = np.array([energy_temp_shuffled[k][0] - for k in energy_temp_shuffled.keys()]) - energy_delta = np.array([energy_temp_shuffled[k][1] - for k in energy_temp_shuffled.keys()]) + e_pbe0 = np.array([compound.pbe0 for compound in compounds], dtype=float) + e_delta = np.array([compound.delta for compound in compounds], dtype=float) - return molecules, nuclear_charge, energy_pbe0, energy_delta, atoms + return compounds, e_pbe0, e_delta -- cgit v1.2.3-70-g09d2 From a0e73407bfef3569a994c07d5bb9c8df97518e30 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Wed, 26 Feb 2020 05:32:35 -0700 Subject: Add feature --- ml_exp/compound.py | 2 ++ ml_exp/qm7db.py | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/ml_exp/compound.py b/ml_exp/compound.py index 0a8b89610..9c7102860 100644 --- a/ml_exp/compound.py +++ b/ml_exp/compound.py @@ -36,6 +36,7 @@ class Compound: self.name = None self.n = None + self.extra = None # In case a comment is in the compound file. self.atoms = None self.atoms_nc = None self.coordinates = None @@ -139,6 +140,7 @@ class Compound: self.name = filename.split('/')[-1] self.n = int(lines[0]) + self.extra = lines[1] self.atoms = [] self.atoms_nc = np.empty(self.n, dtype=int) self.coordinates = np.empty((self.n, 3), dtype=float) diff --git a/ml_exp/qm7db.py b/ml_exp/qm7db.py index f9950c317..8d2453f45 100644 --- a/ml_exp/qm7db.py +++ b/ml_exp/qm7db.py @@ -49,9 +49,9 @@ def qm7db(nc, compounds[i].pbe0 = float(line[1]) compounds[i].delta = float(line[1]) - float(line[2]) - # Shuffle the compounds list - random.seed(r_seed) - random.shuffle(compounds) + if is_shuffled: + random.seed(r_seed) + random.shuffle(compounds) e_pbe0 = np.array([compound.pbe0 for compound in compounds], dtype=float) e_delta = np.array([compound.delta for compound in compounds], dtype=float) -- cgit v1.2.3-70-g09d2 From b209f5e79ce45e6a4d5b442f10217806d078d5a6 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Wed, 26 Feb 2020 05:40:44 -0700 Subject: Update requirements --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 437e0428a..4b242770c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ colorama==0.4.3 numpy==1.18.1 -pandas==0.25.3 \ No newline at end of file +pandas==1.0.1 -- cgit v1.2.3-70-g09d2 From e18a909d18481e5e292882b3b1dd2ad8d693d097 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Wed, 26 Feb 2020 05:53:54 -0700 Subject: Revise printc --- ml_exp/__init__.py | 4 +++- ml_exp/misc.py | 38 +++++++++++++++++--------------------- 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/ml_exp/__init__.py b/ml_exp/__init__.py index d685e7ccc..c62141a58 100644 --- a/ml_exp/__init__.py +++ b/ml_exp/__init__.py @@ -24,6 +24,7 @@ from ml_exp.compound import Compound from ml_exp.representations import coulomb_matrix, lennard_jones_matrix,\ first_neighbor_matrix, adjacency_matrix, check_bond, bag_of_stuff from ml_exp.math import cholesky_solve +from ml_exp.qm7db import qm7db __all__ = ['Compound', 'coulomb_matrix', @@ -32,4 +33,5 @@ __all__ = ['Compound', 'adjacency_matrix', 'check_bond', 'bag_of_stuff', - 'cholesky_solve'] + 'cholesky_solve', + 'qm7db'] diff --git a/ml_exp/misc.py b/ml_exp/misc.py index e9142b05f..4bd68eefc 100644 --- a/ml_exp/misc.py +++ b/ml_exp/misc.py @@ -29,29 +29,25 @@ init() def printc(text, color): """ Prints texts normaly, but in color. Using colorama. - text: string with the text to print. + text: text to print. color: color to be used, same as available in colorama. """ - color_dic = {'BLACK': Fore.BLACK, - 'RED': Fore.RED, - 'GREEN': Fore.GREEN, - 'YELLOW': Fore.YELLOW, - 'BLUE': Fore.BLUE, - 'MAGENTA': Fore.MAGENTA, - 'CYAN': Fore.CYAN, - 'WHITE': Fore.WHITE, - 'RESET': Fore.RESET} - - color_dic_keys = color_dic.keys() - if color not in color_dic_keys: - print(Fore.RED - + '\'{}\' not found, using default color.'.format(color) - + Style.RESET_ALL) - actual_color = Fore.RESET - else: - actual_color = color_dic[color] - - print(actual_color + text + Style.RESET_ALL) + color = color.upper() + colors = {'BLACK': Fore.BLACK, + 'RED': Fore.RED, + 'GREEN': Fore.GREEN, + 'YELLOW': Fore.YELLOW, + 'BLUE': Fore.BLUE, + 'MAGENTA': Fore.MAGENTA, + 'CYAN': Fore.CYAN, + 'WHITE': Fore.WHITE, + 'RESET': Fore.RESET} + + av_colors = colors.keys() + if color not in av_colors: + raise KeyError(f'{color} not found.') + + print(colors[color] + text + Style.RESET_ALL) def plot_benchmarks(): -- cgit v1.2.3-70-g09d2 From eeea8ff03aeae2367f8db4e4f0d65cccf708d434 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Wed, 26 Feb 2020 21:20:10 -0700 Subject: Remove unused function --- ml_exp/parallel_create_matrices.py | 85 -------------------------------------- 1 file changed, 85 deletions(-) delete mode 100644 ml_exp/parallel_create_matrices.py diff --git a/ml_exp/parallel_create_matrices.py b/ml_exp/parallel_create_matrices.py deleted file mode 100644 index a68122d48..000000000 --- a/ml_exp/parallel_create_matrices.py +++ /dev/null @@ -1,85 +0,0 @@ -"""MIT License - -Copyright (c) 2019 David Luevano Alvarado - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -from multiprocessing import Process, Pipe -from ml_exp.c_matrix import c_matrix_multiple -from ml_exp.lj_matrix import lj_matrix_multiple - - -def parallel_create_matrices(mol_data, - nc_data, - ljm_diag_value=None, - ljm_sigma=1.0, - ljm_epsilon=1.0, - max_len=25, - as_eig=True, - bohr_radius_units=False): - """ - Creates the Coulomb and L-J matrices in parallel. - mol_data: molecule data, matrix of atom coordinates. - nc_data: nuclear charge data, array of atom data. - ljm_diag_value: if special diagonal value is to be used for lj matrix. - ljm_sigma: sigma value for lj matrix. - ljm_epsilon: psilon value for lj matrix. - max_len: maximum amount of atoms in molecule. - as_eig: if data should be returned as matrix or array of eigenvalues. - bohr_radius_units: if units should be in bohr's radius units. - """ - - # Matrices calculation. - procs = [] - pipes = [] - - cm_recv, cm_send = Pipe(False) - p1 = Process(target=c_matrix_multiple, - args=(mol_data, - nc_data, - cm_send, - max_len, - as_eig, - bohr_radius_units)) - procs.append(p1) - pipes.append(cm_recv) - p1.start() - - ljm_recv, ljm_send = Pipe(False) - p2 = Process(target=lj_matrix_multiple, - args=(mol_data, - nc_data, - ljm_send, - ljm_diag_value, - ljm_sigma, - ljm_epsilon, - max_len, - as_eig, - bohr_radius_units)) - procs.append(p2) - pipes.append(ljm_recv) - p2.start() - - cm_data = pipes[0].recv() - ljm_data = pipes[1].recv() - - for proc in procs: - proc.join() - - return cm_data, ljm_data -- cgit v1.2.3-70-g09d2 From 4d759471c3e92381bb6e8b91083b8dea5ec49b02 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Thu, 27 Feb 2020 15:31:23 -0700 Subject: Rewrite basic ml function --- ml_exp/do_ml.py | 218 ++++++++++++-------------------------------------------- 1 file changed, 45 insertions(+), 173 deletions(-) diff --git a/ml_exp/do_ml.py b/ml_exp/do_ml.py index 110242a1d..de4d4061c 100644 --- a/ml_exp/do_ml.py +++ b/ml_exp/do_ml.py @@ -22,206 +22,78 @@ SOFTWARE. """ import time import numpy as np -from multiprocessing import Process, Pipe from ml_exp.misc import printc -from ml_exp.gauss_kernel import gauss_kernel -from ml_exp.cholesky_solve import cholesky_solve -from ml_exp.read_qm7_data import read_qm7_data -from ml_exp.parallel_create_matrices import parallel_create_matrices - - -def ml(desc_data, - energy_data, - training_size, - desc_type=None, - pipe=None, - test_size=None, - sigma=1000.0, - show_msgs=True): +from ml_exp.kernels import gaussian_kernel +from ml_exp.math import cholesky_solve +# from ml_exp.qm7db import qm7db + + +def simple_ml(descriptors, + energies, + training_size, + identifier=None, + test_size=None, + sigma=1000.0, + show_msgs=True): """ - Does the ML methodology. - desc_data: descriptor (or representation) data. - energy_data: energy data associated with desc_data. + Basic ML methodology for a single descriptor type. + descriptors: array of descriptors. + energies: array of energies. training_size: size of the training set to use. - desc_type: string with the name of the descriptor used. - pipe: for multiprocessing purposes. Sends the data calculated - through a pipe. + identifier: string with the name of the descriptor used. test_size: size of the test set to use. If no size is given, the last remaining molecules are used. sigma: depth of the kernel. - show_msgs: Show debug messages or not. - NOTE: desc_type is just a string and is only for identification purposes. + show_msgs: if debu messages should be shown. + NOTE: identifier is just a string and is only for identification purposes. Also, training is done with the first part of the data and - testing with the ending part of the data. + testing with the ending part of the data. """ tic = time.perf_counter() # Initial calculations for later use. - d_len = len(desc_data) - e_len = len(energy_data) + data_size = descriptors.shape[0] - if not desc_type: - desc_type = 'NOT SPECIFIED' + if not identifier: + identifier = 'NOT SPECIFIED' - if d_len != e_len: - printc(''.join(['ERROR. Descriptor data size different ', - 'than energy data size.']), 'RED') - return None + if not data_size == energies.shape[0]: + raise ValueError('Energies size is different than descriptors size.') - if training_size >= d_len: - printc('ERROR. Training size greater or equal than data size.', 'RED') - return None + if training_size >= data_size: + raise ValueError('Training size is greater or equal to the data size.') + # If test_size is not set, it is set to a maximum size of 1500. if not test_size: - test_size = d_len - training_size + test_size = data_size - training_size if test_size > 1500: test_size = 1500 if show_msgs: - printc('{} ML started.'.format(desc_type), 'GREEN') - printc('\tTraining size: {}'.format(training_size), 'CYAN') - printc('\tTest size: {}'.format(test_size), 'CYAN') - printc('\tSigma: {}'.format(sigma), 'CYAN') + printc(f'{identifier} ML started.', 'GREEN') + printc(f'\tTraining size: {training_size}', 'CYAN') + printc(f'\tTest size: {test_size}', 'CYAN') + printc(f'\tSigma: {test_size}', 'CYAN') - X_training = desc_data[:training_size] - Y_training = energy_data[:training_size] - K_training = gauss_kernel(X_training, X_training, sigma) - alpha_ = cholesky_solve(K_training, Y_training) + X_training = descriptors[:training_size] + Y_training = energies[:training_size] + K_training = gaussian_kernel(X_training, X_training, sigma) + alpha = cholesky_solve(K_training, Y_training) - X_test = desc_data[-test_size:] - Y_test = energy_data[-test_size:] - K_test = gauss_kernel(X_test, X_training, sigma) - Y_predicted = np.dot(K_test, alpha_) + X_test = descriptors[-test_size:] + Y_test = energies[-test_size:] + K_test = gaussian_kernel(X_test, X_training, sigma) + Y_predicted = np.dot(K_test, alpha) mae = np.mean(np.abs(Y_predicted - Y_test)) if show_msgs: - printc('\tMAE for {}: {:.4f}'.format(desc_type, mae), 'GREEN') + printc('\tMAE for {identifier}: {mae:.4f}', 'GREEN') toc = time.perf_counter() tictoc = toc - tic if show_msgs: - printc('\t{} ML took {:.4f} seconds.'.format(desc_type, tictoc), - 'GREEN') - printc('\t\tTraining size: {}'.format(training_size), 'CYAN') - printc('\t\tTest size: {}'.format(test_size), 'CYAN') - printc('\t\tSigma: {}'.format(sigma), 'CYAN') - - if pipe: - pipe.send([desc_type, training_size, test_size, sigma, mae, tictoc]) + printc(f'\t{identifier} ML took {tictoc:.4f} seconds.', 'GREEN') + printc(f'\t\tTraining size: {training_size}', 'CYAN') + printc(f'\t\tTest size: {test_size}', 'CYAN') + printc(f'\t\tSigma: {sigma}', 'CYAN') return mae, tictoc - - -def do_ml(min_training_size, - max_training_size=None, - training_increment_size=500, - test_size=None, - ljm_diag_value=None, - ljm_sigma=1.0, - ljm_epsilon=1.0, - r_seed=111, - save_benchmarks=False, - max_len=25, - as_eig=True, - bohr_radius_units=False, - sigma=1000.0, - show_msgs=True): - """ - Main function that does the whole ML process. - min_training_size: minimum training size. - max_training_size: maximum training size. - training_increment_size: training increment size. - test_size: size of the test set to use. If no size is given, - the last remaining molecules are used. - ljm_diag_value: if a special diagonal value should be used in lj matrix. - ljm_sigma: sigma value for lj matrix. - ljm_epsilon: epsilon value for lj matrix. - r_seed: random seed to use for the shuffling. - save_benchmarks: if benchmarks should be saved. - max_len: maximum amount of atoms in molecule. - as_eig: if data should be returned as matrix or array of eigenvalues. - bohr_radius_units: if units should be in bohr's radius units. - sigma: depth of the kernel. - show_msgs: Show debug messages or not. - """ - # Initialization time. - init_time = time.perf_counter() - if not max_training_size: - max_training_size = min_training_size + training_increment_size - - # Data reading. - molecules, nuclear_charge, energy_pbe0, energy_delta =\ - read_qm7_data(r_seed=r_seed) - - # Matrices calculation. - cm_data, ljm_data = parallel_create_matrices(molecules, - nuclear_charge, - ljm_diag_value, - ljm_sigma, - ljm_epsilon, - max_len, - as_eig, - bohr_radius_units) - - # ML calculation. - procs = [] - cm_pipes = [] - ljm_pipes = [] - for i in range(min_training_size, - max_training_size + 1, - training_increment_size): - cm_recv, cm_send = Pipe(False) - p1 = Process(target=ml, - args=(cm_data, - energy_pbe0, - i, - 'CM', - cm_send, - test_size, - sigma, - show_msgs)) - procs.append(p1) - cm_pipes.append(cm_recv) - p1.start() - - ljm_recv, ljm_send = Pipe(False) - p2 = Process(target=ml, - args=(ljm_data, - energy_pbe0, - i, - 'L-JM', - ljm_send, - test_size, - sigma, - show_msgs)) - procs.append(p2) - ljm_pipes.append(ljm_recv) - p2.start() - - cm_bench_results = [] - ljm_bench_results = [] - for cd_pipe, ljd_pipe in zip(cm_pipes, ljm_pipes): - cm_bench_results.append(cd_pipe.recv()) - ljm_bench_results.append(ljd_pipe.recv()) - - for proc in procs: - proc.join() - - if save_benchmarks: - with open('data\\benchmarks.csv', 'a') as save_file: - # save_file.write(''.join(['ml_type,tr_size,te_size,kernel_s,', - # 'mae,time,lj_s,lj_e,date_ran\n'])) - ltime = time.localtime()[:3][::-1] - ljm_se = ',' + str(ljm_sigma) + ',' + str(ljm_epsilon) + ',' - date = '/'.join([str(field) for field in ltime]) - for cm, ljm, in zip(cm_bench_results, ljm_bench_results): - cm_text = ','.join([str(field) for field in cm])\ - + ',' + date + '\n' - ljm_text = ','.join([str(field) for field in ljm])\ - + ljm_se + date + '\n' - save_file.write(cm_text) - save_file.write(ljm_text) - - # End of program - end_time = time.perf_counter() - printc('Program took {:.4f} seconds.'.format(end_time - init_time), - 'CYAN') -- cgit v1.2.3-70-g09d2 From a8dc304d0c85b3db76986f109bdcd7a530bd3045 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Thu, 27 Feb 2020 15:42:02 -0700 Subject: Fix bug --- ml_exp/qm7db.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/ml_exp/qm7db.py b/ml_exp/qm7db.py index 8d2453f45..3fd46b6bc 100644 --- a/ml_exp/qm7db.py +++ b/ml_exp/qm7db.py @@ -25,15 +25,11 @@ import numpy as np import random -# 'hof_qm7.txt.txt' retrieved from -# https://github.com/qmlcode/tutorial -def qm7db(nc, - db_path='data', +def qm7db(db_path='data', is_shuffled=True, r_seed=111): """ Creates a list of compounds with the qm7 database. - nc: dictionary containing nuclear charge data. db_path: path to the database directory. is_shuffled: if the resulting list of compounds should be shuffled. r_seed: random seed to use for the shuffling. -- cgit v1.2.3-70-g09d2 From b937df41c1c5e996be94a3a690908ea989e281dc Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Thu, 27 Feb 2020 15:59:38 -0700 Subject: Placeholder function, test --- ml_exp/do_ml.py | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/ml_exp/do_ml.py b/ml_exp/do_ml.py index de4d4061c..f32480367 100644 --- a/ml_exp/do_ml.py +++ b/ml_exp/do_ml.py @@ -25,7 +25,7 @@ import numpy as np from ml_exp.misc import printc from ml_exp.kernels import gaussian_kernel from ml_exp.math import cholesky_solve -# from ml_exp.qm7db import qm7db +from ml_exp.qm7db import qm7db def simple_ml(descriptors, @@ -44,7 +44,7 @@ def simple_ml(descriptors, test_size: size of the test set to use. If no size is given, the last remaining molecules are used. sigma: depth of the kernel. - show_msgs: if debu messages should be shown. + show_msgs: if debug messages should be shown. NOTE: identifier is just a string and is only for identification purposes. Also, training is done with the first part of the data and testing with the ending part of the data. @@ -97,3 +97,55 @@ def simple_ml(descriptors, printc(f'\t\tSigma: {sigma}', 'CYAN') return mae, tictoc + + +def do_ml(db_path='data', + is_shuffled=True, + r_seed=111, + show_msgs=True): + """ + Main function that does the whole ML process. + training_size: minimum training size. + test_size: size of the test set to use. If no size is given, + the last remaining molecules are used. + ljm_diag_value: if a special diagonal value should be used in lj matrix. + ljm_sigma: sigma value for lj matrix. + ljm_epsilon: epsilon value for lj matrix. + r_seed: random seed to use for the shuffling. + save_benchmarks: if benchmarks should be saved. + size: maximum amount of atoms in molecule. + as_eig: if data should be returned as matrix or array of eigenvalues. + bohr_radius_units: if units should be in bohr's radius units. + sigma: depth of the kernel. + show_msgs: if debug messages should be shown. + """ + init_time = time.perf_counter() + + # Data reading. + tic = time.perf_counter() + compounds, energy_pbe0, energy_delta = qm7db(db_path=db_path, + is_shuffled=is_shuffled, + r_seed=r_seed) + toc = time.perf_counter() + tictoc = toc - tic + if show_msgs: + printc(f'Data reading took {tictoc:.4f} seconds.', 'CYAN') + + # Matrices calculation. + tic = time.perf_counter() + for compound in compounds: + compound.gen_cm() + compound.gen_ljm() + compound.gen_am() + toc = time.perf_counter() + tictoc = toc - tic + if show_msgs: + printc(f'Matrices calculation took {tictoc:.4f} seconds.', 'CYAN') + + # ML calculation. + # PLHLDR + + # End of program + end_time = time.perf_counter() + totaltime = end_time - init_time + printc(f'Program took {totaltime:.4f} seconds.', 'CYAN') -- cgit v1.2.3-70-g09d2 From c4d30a346d114185ab93eb5e88e241b922b5a538 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Thu, 27 Feb 2020 16:50:57 -0700 Subject: Add simple ml methodology, fix bugs --- ml_exp/__init__.py | 5 ++++- ml_exp/compound.py | 2 +- ml_exp/do_ml.py | 65 ++++++++++++++++++++++++++++++++++++++++-------------- ml_exp/kernels.py | 6 ++--- 4 files changed, 56 insertions(+), 22 deletions(-) diff --git a/ml_exp/__init__.py b/ml_exp/__init__.py index c62141a58..48ece56c8 100644 --- a/ml_exp/__init__.py +++ b/ml_exp/__init__.py @@ -25,6 +25,7 @@ from ml_exp.representations import coulomb_matrix, lennard_jones_matrix,\ first_neighbor_matrix, adjacency_matrix, check_bond, bag_of_stuff from ml_exp.math import cholesky_solve from ml_exp.qm7db import qm7db +from ml_exp.do_ml import simple_ml, do_ml __all__ = ['Compound', 'coulomb_matrix', @@ -34,4 +35,6 @@ __all__ = ['Compound', 'check_bond', 'bag_of_stuff', 'cholesky_solve', - 'qm7db'] + 'qm7db', + 'simple_ml', + 'do_ml'] diff --git a/ml_exp/compound.py b/ml_exp/compound.py index 9c7102860..1598fa482 100644 --- a/ml_exp/compound.py +++ b/ml_exp/compound.py @@ -93,8 +93,8 @@ class Compound: bohr_ru=bohr_ru) def gen_am(self, - size=23, use_forces=False, + size=23, bohr_ru=False): """ Generate the Adjacency Matrix for the compund. diff --git a/ml_exp/do_ml.py b/ml_exp/do_ml.py index f32480367..40243d413 100644 --- a/ml_exp/do_ml.py +++ b/ml_exp/do_ml.py @@ -31,19 +31,19 @@ from ml_exp.qm7db import qm7db def simple_ml(descriptors, energies, training_size, - identifier=None, test_size=None, sigma=1000.0, + identifier=None, show_msgs=True): """ Basic ML methodology for a single descriptor type. descriptors: array of descriptors. energies: array of energies. training_size: size of the training set to use. - identifier: string with the name of the descriptor used. test_size: size of the test set to use. If no size is given, the last remaining molecules are used. sigma: depth of the kernel. + identifier: string with the name of the descriptor used. show_msgs: if debug messages should be shown. NOTE: identifier is just a string and is only for identification purposes. Also, training is done with the first part of the data and @@ -86,7 +86,7 @@ def simple_ml(descriptors, mae = np.mean(np.abs(Y_predicted - Y_test)) if show_msgs: - printc('\tMAE for {identifier}: {mae:.4f}', 'GREEN') + printc(f'\tMAE for {identifier}: {mae:.4f}', 'GREEN') toc = time.perf_counter() tictoc = toc - tic @@ -102,20 +102,27 @@ def simple_ml(descriptors, def do_ml(db_path='data', is_shuffled=True, r_seed=111, + diag_value=None, + lj_sigma=1.0, + lj_epsilon=1.0, + use_forces=False, + size=23, + as_eig=True, + bohr_ru=False, + sigma=1000.0, show_msgs=True): """ Main function that does the whole ML process. - training_size: minimum training size. - test_size: size of the test set to use. If no size is given, - the last remaining molecules are used. - ljm_diag_value: if a special diagonal value should be used in lj matrix. - ljm_sigma: sigma value for lj matrix. - ljm_epsilon: epsilon value for lj matrix. + db_path: path to the database directory. + is_shuffled: if the resulting list of compounds should be shuffled. r_seed: random seed to use for the shuffling. - save_benchmarks: if benchmarks should be saved. - size: maximum amount of atoms in molecule. - as_eig: if data should be returned as matrix or array of eigenvalues. - bohr_radius_units: if units should be in bohr's radius units. + diag_value: if special diagonal value is to be used. + lj_sigma: sigma value. + lj_epsilon: epsilon value. + use_forces: if the use of forces instead of k_cx should be used. + size: compound size. + as_eig: if the representation should be as the eigenvalues. + bohr_ru: if radius units should be in bohr's radius units. sigma: depth of the kernel. show_msgs: if debug messages should be shown. """ @@ -134,16 +141,40 @@ def do_ml(db_path='data', # Matrices calculation. tic = time.perf_counter() for compound in compounds: - compound.gen_cm() - compound.gen_ljm() - compound.gen_am() + compound.gen_cm(size=size, + as_eig=as_eig, + bohr_ru=bohr_ru) + compound.gen_ljm(diag_value=diag_value, + sigma=lj_sigma, + epsilon=lj_epsilon, + size=size, + as_eig=as_eig, + bohr_ru=bohr_ru) + compound.gen_am(use_forces=use_forces, + size=size, + bohr_ru=bohr_ru) + + # Create a numpy array for the descriptors. + cm_data = np.array([compound.cm for compound in compounds], dtype=float) + ljm_data = np.array([compound.ljm for compound in compounds], dtype=float) + am_data = np.array([compound.cm for compound in compounds], dtype=float) + print(cm_data.shape, ljm_data.shape, am_data.shape) + toc = time.perf_counter() tictoc = toc - tic if show_msgs: printc(f'Matrices calculation took {tictoc:.4f} seconds.', 'CYAN') # ML calculation. - # PLHLDR + # CM + cm_mae, cm_tictoc = simple_ml(cm_data, + energy_pbe0, + training_size=5000, + test_size=1500, + sigma=1000.0, + identifier='CM', + show_msgs=show_msgs) + print(cm_mae, cm_tictoc) # End of program end_time = time.perf_counter() diff --git a/ml_exp/kernels.py b/ml_exp/kernels.py index 7a61a1e1e..3914ffc20 100644 --- a/ml_exp/kernels.py +++ b/ml_exp/kernels.py @@ -36,9 +36,9 @@ def gaussian_kernel(X1, inv_sigma = -0.5 / (sigma*sigma) K = np.zeros((X1.shape[0], X2.shape[0]), dtype=float) - for i in X1: - for j in X2: - f_norm = np.linalg.norm(i - j) + for i, x1 in enumerate(X1): + for j, x2 in enumerate(X2): + f_norm = np.linalg.norm(x1 - x2) # print(f_norm) K[i, j] = math.exp(inv_sigma * f_norm) -- cgit v1.2.3-70-g09d2 From 6ad25bb4cd5822fa84d1c4cec54952c6fd8df2f6 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Thu, 27 Feb 2020 21:41:41 -0700 Subject: Add rest of the descriptors --- ml_exp/do_ml.py | 95 ++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 70 insertions(+), 25 deletions(-) diff --git a/ml_exp/do_ml.py b/ml_exp/do_ml.py index 40243d413..d8ee415bf 100644 --- a/ml_exp/do_ml.py +++ b/ml_exp/do_ml.py @@ -106,10 +106,14 @@ def do_ml(db_path='data', lj_sigma=1.0, lj_epsilon=1.0, use_forces=False, + stuff='bonds', size=23, as_eig=True, bohr_ru=False, + training_size=1500, + test_size=None, sigma=1000.0, + identifiers=["CM"], show_msgs=True): """ Main function that does the whole ML process. @@ -120,12 +124,20 @@ def do_ml(db_path='data', lj_sigma: sigma value. lj_epsilon: epsilon value. use_forces: if the use of forces instead of k_cx should be used. + stuff: elements of the bag, by default the known bag of bonds. size: compound size. as_eig: if the representation should be as the eigenvalues. bohr_ru: if radius units should be in bohr's radius units. + training_size: size of the training set to use. + test_size: size of the test set to use. If no size is given, + the last remaining molecules are used. sigma: depth of the kernel. + identifiers: list of names (strings) of descriptors to use. show_msgs: if debug messages should be shown. """ + if type(identifiers) != list: + raise TypeError('\'identifiers\' is not a list.') + init_time = time.perf_counter() # Data reading. @@ -141,24 +153,34 @@ def do_ml(db_path='data', # Matrices calculation. tic = time.perf_counter() for compound in compounds: - compound.gen_cm(size=size, - as_eig=as_eig, - bohr_ru=bohr_ru) - compound.gen_ljm(diag_value=diag_value, - sigma=lj_sigma, - epsilon=lj_epsilon, - size=size, - as_eig=as_eig, - bohr_ru=bohr_ru) - compound.gen_am(use_forces=use_forces, - size=size, - bohr_ru=bohr_ru) + if 'CM' in identifiers: + compound.gen_cm(size=size, + as_eig=as_eig, + bohr_ru=bohr_ru) + if 'LJM' in identifiers: + compound.gen_ljm(diag_value=diag_value, + sigma=lj_sigma, + epsilon=lj_epsilon, + size=size, + as_eig=as_eig, + bohr_ru=bohr_ru) + if 'AM' in identifiers: + compound.gen_am(use_forces=use_forces, + size=size, + bohr_ru=bohr_ru) + if 'BOS' in identifiers: + compound.gen_bos(size=size, + stuff=stuff) # Create a numpy array for the descriptors. - cm_data = np.array([compound.cm for compound in compounds], dtype=float) - ljm_data = np.array([compound.ljm for compound in compounds], dtype=float) - am_data = np.array([compound.cm for compound in compounds], dtype=float) - print(cm_data.shape, ljm_data.shape, am_data.shape) + if 'CM' in identifiers: + cm_data = np.array([comp.cm for comp in compounds], dtype=float) + if 'LJM' in identifiers: + ljm_data = np.array([comp.ljm for comp in compounds], dtype=float) + if 'AM' in identifiers: + am_data = np.array([comp.cm for comp in compounds], dtype=float) + if 'BOS' in identifiers: + bos_data = np.array([comp.bos for comp in compounds], dtype=float) toc = time.perf_counter() tictoc = toc - tic @@ -166,15 +188,38 @@ def do_ml(db_path='data', printc(f'Matrices calculation took {tictoc:.4f} seconds.', 'CYAN') # ML calculation. - # CM - cm_mae, cm_tictoc = simple_ml(cm_data, - energy_pbe0, - training_size=5000, - test_size=1500, - sigma=1000.0, - identifier='CM', - show_msgs=show_msgs) - print(cm_mae, cm_tictoc) + if 'CM' in identifiers: + cm_mae, cm_tictoc = simple_ml(cm_data, + energy_pbe0, + training_size=training_size, + test_size=test_size, + sigma=sigma, + identifier='CM', + show_msgs=show_msgs) + if 'LJM' in identifiers: + ljm_mae, ljm_tictoc = simple_ml(ljm_data, + energy_pbe0, + training_size=training_size, + test_size=test_size, + sigma=sigma, + identifier='LJM', + show_msgs=show_msgs) + if 'AM' in identifiers: + am_mae, am_tictoc = simple_ml(am_data, + energy_pbe0, + training_size=training_size, + test_size=test_size, + sigma=sigma, + identifier='CM', + show_msgs=show_msgs) + if 'BOS' in identifiers: + bos_mae, bos_tictoc = simple_ml(bos_data, + energy_pbe0, + training_size=training_size, + test_size=test_size, + sigma=sigma, + identifier='CM', + show_msgs=show_msgs) # End of program end_time = time.perf_counter() -- cgit v1.2.3-70-g09d2