From 487bf8840846b5d4d694b38985268c308aadb36e Mon Sep 17 00:00:00 2001 From: David Luevano <55825613+luevano@users.noreply.github.com> Date: Wed, 18 Dec 2019 07:21:35 -0700 Subject: Refactor files --- c_matrix.py | 179 --------------------------------- cholesky_solve.py | 64 ------------ do_ml.py | 108 -------------------- frob_norm.py | 51 ---------- gauss_kernel.py | 49 --------- lj_matrix.py | 207 -------------------------------------- lj_matrix/__init__.py | 22 ++++ lj_matrix/__main__.py | 238 ++++++++++++++++++++++++++++++++++++++++++++ lj_matrix/c_matrix.py | 179 +++++++++++++++++++++++++++++++++ lj_matrix/cholesky_solve.py | 64 ++++++++++++ lj_matrix/do_ml.py | 108 ++++++++++++++++++++ lj_matrix/frob_norm.py | 51 ++++++++++ lj_matrix/gauss_kernel.py | 49 +++++++++ lj_matrix/lj_matrix.py | 207 ++++++++++++++++++++++++++++++++++++++ lj_matrix/misc.py | 53 ++++++++++ lj_matrix/read_qm7_data.py | 144 +++++++++++++++++++++++++++ main.py | 238 -------------------------------------------- misc.py | 53 ---------- read_qm7_data.py | 144 --------------------------- 19 files changed, 1115 insertions(+), 1093 deletions(-) delete mode 100644 c_matrix.py delete mode 100644 cholesky_solve.py delete mode 100644 do_ml.py delete mode 100644 frob_norm.py delete mode 100644 gauss_kernel.py delete mode 100644 lj_matrix.py create mode 100644 lj_matrix/__init__.py create mode 100644 lj_matrix/__main__.py create mode 100644 lj_matrix/c_matrix.py create mode 100644 lj_matrix/cholesky_solve.py create mode 100644 lj_matrix/do_ml.py create mode 100644 lj_matrix/frob_norm.py create mode 100644 lj_matrix/gauss_kernel.py create mode 100644 lj_matrix/lj_matrix.py create mode 100644 lj_matrix/misc.py create mode 100644 lj_matrix/read_qm7_data.py delete mode 100644 main.py delete mode 100644 misc.py delete mode 100644 read_qm7_data.py diff --git a/c_matrix.py b/c_matrix.py deleted file mode 100644 index 4de711a1b..000000000 --- a/c_matrix.py +++ /dev/null @@ -1,179 +0,0 @@ -"""MIT License - -Copyright (c) 2019 David Luevano Alvarado - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -import time -from misc import printc -import math -import numpy as np -from numpy.linalg import eig - - -def c_matrix(mol_data, - nc_data, - max_len=25, - as_eig=True, - bohr_radius_units=False): - """ - Creates the Coulomb Matrix from the molecule data given. - mol_data: molecule data, matrix of atom coordinates. - nc_data: nuclear charge data, array of atom data. - max_len: maximum amount of atoms in molecule. - as_eig: if data should be returned as matrix or array of eigenvalues. - bohr_radius_units: if units should be in bohr's radius units. - """ - if bohr_radius_units: - conversion_rate = 0.52917721067 - else: - conversion_rate = 1 - - mol_n = len(mol_data) - mol_nr = range(mol_n) - - if not mol_n == len(nc_data): - print(''.join(['Error. Molecule matrix dimension is different ', - 'than the nuclear charge array dimension.'])) - else: - if max_len < mol_n: - print(''.join(['Error. Molecule matrix dimension (mol_n) is ', - 'greater than max_len. Using mol_n.'])) - max_len = None - - if max_len: - cm = np.zeros((max_len, max_len)) - ml_r = range(max_len) - - # Actual calculation of the coulomb matrix. - for i in ml_r: - if i < mol_n: - x_i = mol_data[i, 0] - y_i = mol_data[i, 1] - z_i = mol_data[i, 2] - Z_i = nc_data[i] - else: - break - - for j in ml_r: - if j < mol_n: - x_j = mol_data[j, 0] - y_j = mol_data[j, 1] - z_j = mol_data[j, 2] - Z_j = nc_data[j] - - x = (x_i-x_j)**2 - y = (y_i-y_j)**2 - z = (z_i-z_j)**2 - - if i == j: - cm[i, j] = (0.5*Z_i**2.4) - else: - cm[i, j] = (conversion_rate*Z_i*Z_j/math.sqrt(x - + y - + z)) - else: - break - - # Now the value will be returned. - if as_eig: - cm_sorted = np.sort(eig(cm)[0])[::-1] - # Thanks to SO for the following lines of code. - # https://stackoverflow.com/a/43011036 - - # Keep zeros at the end. - mask = cm_sorted != 0. - f_mask = mask.sum(0, keepdims=1) >\ - np.arange(cm_sorted.shape[0]-1, -1, -1) - - f_mask = f_mask[::-1] - cm_sorted[f_mask] = cm_sorted[mask] - cm_sorted[~f_mask] = 0. - - return cm_sorted - - else: - return cm - - else: - cm_temp = [] - # Actual calculation of the coulomb matrix. - for i in mol_nr: - x_i = mol_data[i, 0] - y_i = mol_data[i, 1] - z_i = mol_data[i, 2] - Z_i = nc_data[i] - - cm_row = [] - for j in mol_nr: - x_j = mol_data[j, 0] - y_j = mol_data[j, 1] - z_j = mol_data[j, 2] - Z_j = nc_data[j] - - x = (x_i-x_j)**2 - y = (y_i-y_j)**2 - z = (z_i-z_j)**2 - - if i == j: - cm_row.append(0.5*Z_i**2.4) - else: - cm_row.append(conversion_rate*Z_i*Z_j/math.sqrt(x - + y - + z)) - - cm_temp.append(np.array(cm_row)) - - cm = np.array(cm_temp) - # Now the value will be returned. - if as_eig: - return np.sort(eig(cm)[0])[::-1] - else: - return cm - - -def c_matrix_multiple(mol_data, - nc_data, - pipe=None, - max_len=25, - as_eig=True, - bohr_radius_units=False): - """ - Calculates the Coulomb Matrix of multiple molecules. - mol_data: molecule data, matrix of atom coordinates. - nc_data: nuclear charge data, array of atom data. - pipe: for multiprocessing purposes. Sends the data calculated - through a pipe. - max_len: maximum amount of atoms in molecule. - as_eig: if data should be returned as matrix or array of eigenvalues. - bohr_radius_units: if units should be in bohr's radius units. - """ - printc('Coulomb Matrices calculation started.', 'CYAN') - tic = time.perf_counter() - - cm_data = np.array([c_matrix(mol, nc, max_len, as_eig, bohr_radius_units) - for mol, nc in zip(mol_data, nc_data)]) - - toc = time.perf_counter() - printc('\tCM calculation took {:.4f} seconds.'.format(toc - tic), 'GREEN') - - if pipe: - pipe.send(cm_data) - - return cm_data diff --git a/cholesky_solve.py b/cholesky_solve.py deleted file mode 100644 index bc6a572a3..000000000 --- a/cholesky_solve.py +++ /dev/null @@ -1,64 +0,0 @@ -"""MIT License - -Copyright (c) 2019 David Luevano Alvarado - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -import numpy as np -from numpy.linalg import cholesky - - -def cholesky_solve(K, y): - """ - Applies Cholesky decomposition to obtain the 'alpha coeficients'. - K: kernel. - y: known parameters. - """ - # The initial mathematical problem is to solve Ka=y. - - # First, add a small lambda value. - K[np.diag_indices_from(K)] += 1e-8 - - # Get the Cholesky decomposition of the kernel. - L = cholesky(K) - size = len(L) - - # Solve Lx=y for x. - x = np.zeros(size) - x[0] = y[0] / L[0, 0] - for i in range(1, size): - temp_sum = 0.0 - for j in range(i): - temp_sum += L[i, j] * x[j] - x[i] = (y[i] - temp_sum) / L[i, i] - - # Now, solve LTa=x for a. - L2 = L.T - a = np.zeros(size) - a_ms = size - 1 - a[a_ms] = x[a_ms] / L2[a_ms, a_ms] - # Because of the form of L2 (upper triangular matriz), an inversion of - # range() needs to be done. - for i in range(0, a_ms)[::-1]: - temp_sum = 0.0 - for j in range(i, size)[::-1]: - temp_sum += L2[i, j] * a[j] - a[i] = (x[i] - temp_sum) / L2[i, i] - - return a diff --git a/do_ml.py b/do_ml.py deleted file mode 100644 index c88533e68..000000000 --- a/do_ml.py +++ /dev/null @@ -1,108 +0,0 @@ -"""MIT License - -Copyright (c) 2019 David Luevano Alvarado - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -import time -from misc import printc -import numpy as np -from gauss_kernel import gauss_kernel -from cholesky_solve import cholesky_solve - - -def do_ml(desc_data, - energy_data, - training_size, - desc_type=None, - pipe=None, - test_size=None, - sigma=1000.0, - show_msgs=True): - """ - Does the ML methodology. - desc_data: descriptor (or representation) data. - energy_data: energy data associated with desc_data. - training_size: size of the training set to use. - desc_type: string with the name of the descriptor used. - pipe: for multiprocessing purposes. Sends the data calculated - through a pipe. - test_size: size of the test set to use. If no size is given, - the last remaining molecules are used. - sigma: depth of the kernel. - show_msgs: Show debug messages or not. - NOTE: desc_type is just a string and is only for identification purposes. - Also, training is done with the first part of the data and - testing with the ending part of the data. - """ - # Initial calculations for later use. - d_len = len(desc_data) - e_len = len(energy_data) - - if not desc_type: - desc_type = 'NOT SPECIFIED' - - if d_len != e_len: - printc(''.join(['ERROR. Descriptor data size different ', - 'than energy data size.']), 'RED') - return None - - if training_size >= d_len: - printc('ERROR. Training size greater or equal than data size.', 'RED') - return None - - if not test_size: - test_size = d_len - training_size - if test_size > 1500: - test_size = 1500 - - tic = time.perf_counter() - if show_msgs: - printc('{} ML started.'.format(desc_type), 'GREEN') - printc('\tTraining size: {}'.format(training_size), 'CYAN') - printc('\tTest size: {}'.format(test_size), 'CYAN') - printc('\tSigma: {}'.format(sigma), 'CYAN') - - Xcm_training = desc_data[:training_size] - Ycm_training = energy_data[:training_size] - Kcm_training = gauss_kernel(Xcm_training, Xcm_training, sigma) - alpha_cm = cholesky_solve(Kcm_training, Ycm_training) - - Xcm_test = desc_data[-test_size:] - Ycm_test = energy_data[-test_size:] - Kcm_test = gauss_kernel(Xcm_test, Xcm_training, sigma) - Ycm_predicted = np.dot(Kcm_test, alpha_cm) - - mae = np.mean(np.abs(Ycm_predicted - Ycm_test)) - if show_msgs: - printc('\tMAE for {}: {:.4f}'.format(desc_type, mae), 'GREEN') - - toc = time.perf_counter() - tictoc = toc - tic - if show_msgs: - printc('\t{} ML took {:.4f} seconds.'.format(desc_type, tictoc), - 'GREEN') - printc('\t\tTraining size: {}'.format(training_size), 'CYAN') - printc('\t\tTest size: {}'.format(test_size), 'CYAN') - printc('\t\tSigma: {}'.format(sigma), 'CYAN') - - if pipe: - pipe.send([desc_type, training_size, test_size, sigma, mae, tictoc]) - - return mae, tictoc diff --git a/frob_norm.py b/frob_norm.py deleted file mode 100644 index 4c3a2945d..000000000 --- a/frob_norm.py +++ /dev/null @@ -1,51 +0,0 @@ -"""MIT License - -Copyright (c) 2019 David Luevano Alvarado - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -import math - - -def frob_norm(array): - """ - Calculates the frobenius norm of a given array or matrix. - array: array of data. - """ - - arr_sh_len = len(array.shape) - arr_range = range(len(array)) - fn = 0.0 - - # If it is a 'vector'. - if arr_sh_len == 1: - for i in arr_range: - fn += array[i]*array[i] - - return math.sqrt(fn) - - # If it is a matrix. - elif arr_sh_len == 2: - for i in arr_range: - for j in arr_range: - fn += array[i, j]*array[i, j] - - return math.sqrt(fn) - else: - print('Error. Array size greater than 2 ({}).'.format(arr_sh_len)) diff --git a/gauss_kernel.py b/gauss_kernel.py deleted file mode 100644 index 0dfc65d59..000000000 --- a/gauss_kernel.py +++ /dev/null @@ -1,49 +0,0 @@ -"""MIT License - -Copyright (c) 2019 David Luevano Alvarado - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -import math -import numpy as np -from frob_norm import frob_norm - - -def gauss_kernel(X_1, X_2, sigma): - """ - Calculates the Gaussian Kernel. - X_1: first representations. - X_2: second representations. - sigma: kernel width. - """ - x1_l = len(X_1) - x1_range = range(x1_l) - x2_l = len(X_2) - x2_range = range(x2_l) - - inv_sigma = -0.5 / (sigma*sigma) - - K = np.zeros((x1_l, x2_l)) - for i in x1_range: - for j in x2_range: - f_norm = frob_norm(X_1[i] - X_2[j]) - # print(f_norm) - K[i, j] = math.exp(inv_sigma * f_norm) - - return K diff --git a/lj_matrix.py b/lj_matrix.py deleted file mode 100644 index 2a8e0d956..000000000 --- a/lj_matrix.py +++ /dev/null @@ -1,207 +0,0 @@ -"""MIT License - -Copyright (c) 2019 David Luevano Alvarado - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -import time -from misc import printc -import math -import numpy as np -from numpy.linalg import eig - - -def lj_matrix(mol_data, - nc_data, - sigma=1.0, - epsilon=1.0, - max_len=25, - as_eig=True, - bohr_radius_units=False): - """ - Creates the Lennard-Jones Matrix from the molecule data given. - mol_data: molecule data, matrix of atom coordinates. - nc_data: nuclear charge data, array of atom data. - max_len: maximum amount of atoms in molecule. - as_eig: if data should be returned as matrix or array of eigenvalues. - bohr_radius_units: if units should be in bohr's radius units. - """ - if bohr_radius_units: - conversion_rate = 0.52917721067 - else: - conversion_rate = 1 - - mol_n = len(mol_data) - mol_nr = range(mol_n) - - if not mol_n == len(nc_data): - print(''.join(['Error. Molecule matrix dimension is different ', - 'than the nuclear charge array dimension.'])) - else: - if max_len < mol_n: - print(''.join(['Error. Molecule matrix dimension (mol_n) is ', - 'greater than max_len. Using mol_n.'])) - max_len = None - - if max_len: - lj = np.zeros((max_len, max_len)) - ml_r = range(max_len) - - # Actual calculation of the coulomb matrix. - for i in ml_r: - if i < mol_n: - x_i = mol_data[i, 0] - y_i = mol_data[i, 1] - z_i = mol_data[i, 2] - Z_i = nc_data[i] - else: - break - - for j in ml_r: - if j < mol_n: - x_j = mol_data[j, 0] - y_j = mol_data[j, 1] - z_j = mol_data[j, 2] - - x = (x_i-x_j)**2 - y = (y_i-y_j)**2 - z = (z_i-z_j)**2 - - if i == j: - lj[i, j] = (0.5*Z_i**2.4) - else: - # Calculations are done after i==j is checked - # so no division by zero is done. - - # A little play with r exponents - # so no square root is calculated. - # Conversion factor is included in r^2. - - # 1/r^2 - r_2 = sigma**2/(conversion_rate**2*(x + y + z)) - - r_6 = math.pow(r_2, 3) - r_12 = math.pow(r_6, 2) - lj[i, j] = (4*epsilon*(r_12 - r_6)) - else: - break - - # Now the value will be returned. - if as_eig: - lj_sorted = np.sort(eig(lj)[0])[::-1] - # Thanks to SO for the following lines of code. - # https://stackoverflow.com/a/43011036 - - # Keep zeros at the end. - mask = lj_sorted != 0. - f_mask = mask.sum(0, keepdims=1) >\ - np.arange(lj_sorted.shape[0]-1, -1, -1) - - f_mask = f_mask[::-1] - lj_sorted[f_mask] = lj_sorted[mask] - lj_sorted[~f_mask] = 0. - - return lj_sorted - - else: - return lj - - else: - lj_temp = [] - # Actual calculation of the coulomb matrix. - for i in mol_nr: - x_i = mol_data[i, 0] - y_i = mol_data[i, 1] - z_i = mol_data[i, 2] - Z_i = nc_data[i] - - lj_row = [] - for j in mol_nr: - x_j = mol_data[j, 0] - y_j = mol_data[j, 1] - z_j = mol_data[j, 2] - - x = (x_i-x_j)**2 - y = (y_i-y_j)**2 - z = (z_i-z_j)**2 - - if i == j: - lj_row.append(0.5*Z_i**2.4) - else: - # Calculations are done after i==j is checked - # so no division by zero is done. - - # A little play with r exponents - # so no square root is calculated. - # Conversion factor is included in r^2. - - # 1/r^2 - r_2 = sigma**2/(conversion_rate**2*(x + y + z)) - - r_6 = math.pow(r_2, 3) - r_12 = math.pow(r_6, 2) - lj_row.append(4*epsilon*(r_12 - r_6)) - - lj_temp.append(np.array(lj_row)) - - lj = np.array(lj_temp) - # Now the value will be returned. - if as_eig: - return np.sort(eig(lj)[0])[::-1] - else: - return lj - - -def lj_matrix_multiple(mol_data, - nc_data, - pipe=None, - sigma=1, - epsilon=1, - max_len=25, - as_eig=True, - bohr_radius_units=False): - """ - Calculates the Lennard-Jones Matrix of multiple molecules. - mol_data: molecule data, matrix of atom coordinates. - nc_data: nuclear charge data, array of atom data. - pipe: for multiprocessing purposes. Sends the data calculated - through a pipe. - max_len: maximum amount of atoms in molecule. - as_eig: if data should be returned as matrix or array of eigenvalues. - bohr_radius_units: if units should be in bohr's radius units. - """ - printc('L-J Matrices calculation started.', 'CYAN') - tic = time.perf_counter() - - ljm_data = np.array([lj_matrix(mol, - nc, - sigma, - epsilon, - max_len, - as_eig, - bohr_radius_units) - for mol, nc in zip(mol_data, nc_data)]) - - toc = time.perf_counter() - printc('\tL-JM calculation took {:.4f} seconds.'.format(toc-tic), 'GREEN') - - if pipe: - pipe.send(ljm_data) - - return ljm_data diff --git a/lj_matrix/__init__.py b/lj_matrix/__init__.py new file mode 100644 index 000000000..48cd14913 --- /dev/null +++ b/lj_matrix/__init__.py @@ -0,0 +1,22 @@ +"""MIT License + +Copyright (c) 2019 David Luevano Alvarado + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" diff --git a/lj_matrix/__main__.py b/lj_matrix/__main__.py new file mode 100644 index 000000000..4e13f4995 --- /dev/null +++ b/lj_matrix/__main__.py @@ -0,0 +1,238 @@ +"""MIT License + +Copyright (c) 2019 David Luevano Alvarado + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +import time +from multiprocessing import Process, Pipe +# import matplotlib.pyplot as plt +import pandas as pd +from lj_matrix.misc import printc +from lj_matrix.read_qm7_data import read_qm7_data +from lj_matrix.c_matrix import c_matrix_multiple +from lj_matrix.lj_matrix import lj_matrix_multiple +from lj_matrix.do_ml import do_ml + + +# Test +def ml(): + """ + Main function that does the whole ML process. + """ + # Initialization time. + init_time = time.perf_counter() + + # Data reading. + zi_data, molecules, nuclear_charge, energy_pbe0, energy_delta =\ + read_qm7_data() + + # Matrices calculation. + procs = [] + pipes = [] + + # cm_recv, cm_send = Pipe(False) + # p1 = Process(target=c_matrix_multiple, + # args=(molecules, nuclear_charge, cm_send)) + # procs.append(p1) + # pipes.append(cm_recv) + # p1.start() + + ljm_recv, ljm_send = Pipe(False) + p2 = Process(target=lj_matrix_multiple, + args=(molecules, nuclear_charge, ljm_send, 1, 0.25)) + procs.append(p2) + pipes.append(ljm_recv) + p2.start() + + # cm_data = pipes[0].recv() + ljm_data = pipes[0].recv() + + for proc in procs: + proc.join() + + # ML calculation. + procs = [] + # cm_pipes = [] + ljm_pipes = [] + for i in range(1500, 6500 + 1, 500): + # cm_recv, cm_send = Pipe(False) + # p1 = Process(target=do_ml, + # args=(cm_data, energy_pbe0, i, 'CM', cm_send)) + # procs.append(p1) + # cm_pipes.append(cm_recv) + # p1.start() + + ljm_recv, ljm_send = Pipe(False) + p2 = Process(target=do_ml, + args=(ljm_data, energy_pbe0, i, 'L-JM', ljm_send)) + procs.append(p2) + ljm_pipes.append(ljm_recv) + p2.start() + + # cm_bench_results = [] + ljm_bench_results = [] + for ljd_pipe in ljm_pipes: # cd_pipe, ljd_pipe in zip(cm_pipes, ljm_pipes): + # cm_bench_results.append(cd_pipe.recv()) + ljm_bench_results.append(ljd_pipe.recv()) + + for proc in procs: + proc.join() + + with open('data\\benchmarks.csv', 'a') as save_file: + # save_file.write(''.join(['ml_type,tr_size,te_size,kernel_s,', + # 'mae,time,lj_s,lj_e,date_ran\n'])) + date = '/'.join([str(field) for field in time.localtime()[:3][::-1]]) + for ljm in ljm_bench_results: # cm, ljm, in zip(cm_bench_results, ljm_bench_results): + # cm_text = ','.join([str(field) for field in cm])\ + # + ',' + date + '\n' + ljm_text = ','.join([str(field) for field in ljm])\ + + ',1,0.25,' + date + '\n' + # save_file.write(cm_text) + save_file.write(ljm_text) + + # End of program + end_time = time.perf_counter() + printc('Program took {:.4f} seconds.'.format(end_time - init_time), + 'CYAN') + + +def pl(): + """ + Function for plotting the benchmarks. + """ + # Original columns. + or_cols = ['ml_type', + 'tr_size', + 'te_size', + 'kernel_s', + 'mae', + 'time', + 'lj_s', + 'lj_e', + 'date_ran'] + # Drop some original columns. + dor_cols = ['te_size', + 'kernel_s', + 'time', + 'date_ran'] + + # Read benchmarks data and drop some columns. + data_temp = pd.read_csv('data\\benchmarks.csv',) + data = pd.DataFrame(data_temp, columns=or_cols) + data = data.drop(columns=dor_cols) + + # Get the data of the first benchmarks and drop unnecesary columns. + first_data = pd.DataFrame(data, index=range(0, 22)) + first_data = first_data.drop(columns=['lj_s', 'lj_e']) + + # Columns to keep temporarily. + fd_columns = ['ml_type', + 'tr_size', + 'mae'] + + # Create new dataframes for each matrix descriptor and fill them. + first_data_cm = pd.DataFrame(columns=fd_columns) + first_data_ljm = pd.DataFrame(columns=fd_columns) + for i in range(first_data.shape[0]): + temp_df = first_data.iloc[[i]] + if first_data.at[i, 'ml_type'] == 'CM': + first_data_cm = first_data_cm.append(temp_df) + else: + first_data_ljm = first_data_ljm.append(temp_df) + + # Drop unnecesary column and rename 'mae' for later use. + first_data_cm = first_data_cm.drop(columns=['ml_type'])\ + .rename(columns={'mae': 'cm_mae'}) + first_data_ljm = first_data_ljm.drop(columns=['ml_type'])\ + .rename(columns={'mae': 'ljm_mae'}) + # print(first_data_cm) + # print(first_data_ljm) + + # Get the cm data axis so it can be joined with the ljm data axis. + cm_axis = first_data_cm.plot(x='tr_size', + y='cm_mae', + kind='line') + # Get the ljm data axis and join it with the cm one. + plot_axis = first_data_ljm.plot(ax=cm_axis, + x='tr_size', + y='ljm_mae', + kind='line') + plot_axis.set_xlabel('tr_size') + plot_axis.set_ylabel('mae') + plot_axis.set_title('mae for different tr_sizes') + # Get the figure and save it. + # plot_axis.get_figure().savefig('.figs\\mae_diff_tr_sizes.pdf') + + # Get the rest of the benchmark data and drop unnecesary column. + new_data = data.drop(index=range(0, 22)) + new_data = new_data.drop(columns=['ml_type']) + + # Get the first set and rename it. + nd_first = first_data_ljm.rename(columns={'ljm_mae': '1, 1'}) + ndf_axis = nd_first.plot(x='tr_size', + y='1, 1', + kind='line') + last_axis = ndf_axis + for i in range(22, 99, 11): + lj_s = new_data['lj_s'][i] + lj_e = new_data['lj_e'][i] + new_mae = '{}, {}'.format(lj_s, lj_e) + nd_temp = pd.DataFrame(new_data, index=range(i, i + 11))\ + .drop(columns=['lj_s', 'lj_e'])\ + .rename(columns={'mae': new_mae}) + last_axis = nd_temp.plot(ax=last_axis, + x='tr_size', + y=new_mae, + kind='line') + print(nd_temp) + + last_axis.set_xlabel('tr_size') + last_axis.set_ylabel('mae') + last_axis.set_title('mae for different parameters of lj(s)') + + last_axis.get_figure().savefig('.figs\\mae_diff_param_lj_s.pdf') + + ndf_axis = nd_first.plot(x='tr_size', + y='1, 1', + kind='line') + last_axis = ndf_axis + for i in range(99, data.shape[0], 11): + lj_s = new_data['lj_s'][i] + lj_e = new_data['lj_e'][i] + new_mae = '{}, {}'.format(lj_s, lj_e) + nd_temp = pd.DataFrame(new_data, index=range(i, i + 11))\ + .drop(columns=['lj_s', 'lj_e'])\ + .rename(columns={'mae': new_mae}) + last_axis = nd_temp.plot(ax=last_axis, + x='tr_size', + y=new_mae, + kind='line') + print(nd_temp) + + last_axis.set_xlabel('tr_size') + last_axis.set_ylabel('mae') + last_axis.set_title('mae for different parameters of lj(e)') + + last_axis.get_figure().savefig('.figs\\mae_diff_param_lj_e.pdf') + + +if __name__ == '__main__': + # ml() + pl() diff --git a/lj_matrix/c_matrix.py b/lj_matrix/c_matrix.py new file mode 100644 index 000000000..f40a18c68 --- /dev/null +++ b/lj_matrix/c_matrix.py @@ -0,0 +1,179 @@ +"""MIT License + +Copyright (c) 2019 David Luevano Alvarado + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +import time +from lj_matrix.misc import printc +import math +import numpy as np +from numpy.linalg import eig + + +def c_matrix(mol_data, + nc_data, + max_len=25, + as_eig=True, + bohr_radius_units=False): + """ + Creates the Coulomb Matrix from the molecule data given. + mol_data: molecule data, matrix of atom coordinates. + nc_data: nuclear charge data, array of atom data. + max_len: maximum amount of atoms in molecule. + as_eig: if data should be returned as matrix or array of eigenvalues. + bohr_radius_units: if units should be in bohr's radius units. + """ + if bohr_radius_units: + conversion_rate = 0.52917721067 + else: + conversion_rate = 1 + + mol_n = len(mol_data) + mol_nr = range(mol_n) + + if not mol_n == len(nc_data): + print(''.join(['Error. Molecule matrix dimension is different ', + 'than the nuclear charge array dimension.'])) + else: + if max_len < mol_n: + print(''.join(['Error. Molecule matrix dimension (mol_n) is ', + 'greater than max_len. Using mol_n.'])) + max_len = None + + if max_len: + cm = np.zeros((max_len, max_len)) + ml_r = range(max_len) + + # Actual calculation of the coulomb matrix. + for i in ml_r: + if i < mol_n: + x_i = mol_data[i, 0] + y_i = mol_data[i, 1] + z_i = mol_data[i, 2] + Z_i = nc_data[i] + else: + break + + for j in ml_r: + if j < mol_n: + x_j = mol_data[j, 0] + y_j = mol_data[j, 1] + z_j = mol_data[j, 2] + Z_j = nc_data[j] + + x = (x_i-x_j)**2 + y = (y_i-y_j)**2 + z = (z_i-z_j)**2 + + if i == j: + cm[i, j] = (0.5*Z_i**2.4) + else: + cm[i, j] = (conversion_rate*Z_i*Z_j/math.sqrt(x + + y + + z)) + else: + break + + # Now the value will be returned. + if as_eig: + cm_sorted = np.sort(eig(cm)[0])[::-1] + # Thanks to SO for the following lines of code. + # https://stackoverflow.com/a/43011036 + + # Keep zeros at the end. + mask = cm_sorted != 0. + f_mask = mask.sum(0, keepdims=1) >\ + np.arange(cm_sorted.shape[0]-1, -1, -1) + + f_mask = f_mask[::-1] + cm_sorted[f_mask] = cm_sorted[mask] + cm_sorted[~f_mask] = 0. + + return cm_sorted + + else: + return cm + + else: + cm_temp = [] + # Actual calculation of the coulomb matrix. + for i in mol_nr: + x_i = mol_data[i, 0] + y_i = mol_data[i, 1] + z_i = mol_data[i, 2] + Z_i = nc_data[i] + + cm_row = [] + for j in mol_nr: + x_j = mol_data[j, 0] + y_j = mol_data[j, 1] + z_j = mol_data[j, 2] + Z_j = nc_data[j] + + x = (x_i-x_j)**2 + y = (y_i-y_j)**2 + z = (z_i-z_j)**2 + + if i == j: + cm_row.append(0.5*Z_i**2.4) + else: + cm_row.append(conversion_rate*Z_i*Z_j/math.sqrt(x + + y + + z)) + + cm_temp.append(np.array(cm_row)) + + cm = np.array(cm_temp) + # Now the value will be returned. + if as_eig: + return np.sort(eig(cm)[0])[::-1] + else: + return cm + + +def c_matrix_multiple(mol_data, + nc_data, + pipe=None, + max_len=25, + as_eig=True, + bohr_radius_units=False): + """ + Calculates the Coulomb Matrix of multiple molecules. + mol_data: molecule data, matrix of atom coordinates. + nc_data: nuclear charge data, array of atom data. + pipe: for multiprocessing purposes. Sends the data calculated + through a pipe. + max_len: maximum amount of atoms in molecule. + as_eig: if data should be returned as matrix or array of eigenvalues. + bohr_radius_units: if units should be in bohr's radius units. + """ + printc('Coulomb Matrices calculation started.', 'CYAN') + tic = time.perf_counter() + + cm_data = np.array([c_matrix(mol, nc, max_len, as_eig, bohr_radius_units) + for mol, nc in zip(mol_data, nc_data)]) + + toc = time.perf_counter() + printc('\tCM calculation took {:.4f} seconds.'.format(toc - tic), 'GREEN') + + if pipe: + pipe.send(cm_data) + + return cm_data diff --git a/lj_matrix/cholesky_solve.py b/lj_matrix/cholesky_solve.py new file mode 100644 index 000000000..bc6a572a3 --- /dev/null +++ b/lj_matrix/cholesky_solve.py @@ -0,0 +1,64 @@ +"""MIT License + +Copyright (c) 2019 David Luevano Alvarado + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +import numpy as np +from numpy.linalg import cholesky + + +def cholesky_solve(K, y): + """ + Applies Cholesky decomposition to obtain the 'alpha coeficients'. + K: kernel. + y: known parameters. + """ + # The initial mathematical problem is to solve Ka=y. + + # First, add a small lambda value. + K[np.diag_indices_from(K)] += 1e-8 + + # Get the Cholesky decomposition of the kernel. + L = cholesky(K) + size = len(L) + + # Solve Lx=y for x. + x = np.zeros(size) + x[0] = y[0] / L[0, 0] + for i in range(1, size): + temp_sum = 0.0 + for j in range(i): + temp_sum += L[i, j] * x[j] + x[i] = (y[i] - temp_sum) / L[i, i] + + # Now, solve LTa=x for a. + L2 = L.T + a = np.zeros(size) + a_ms = size - 1 + a[a_ms] = x[a_ms] / L2[a_ms, a_ms] + # Because of the form of L2 (upper triangular matriz), an inversion of + # range() needs to be done. + for i in range(0, a_ms)[::-1]: + temp_sum = 0.0 + for j in range(i, size)[::-1]: + temp_sum += L2[i, j] * a[j] + a[i] = (x[i] - temp_sum) / L2[i, i] + + return a diff --git a/lj_matrix/do_ml.py b/lj_matrix/do_ml.py new file mode 100644 index 000000000..acf5455f4 --- /dev/null +++ b/lj_matrix/do_ml.py @@ -0,0 +1,108 @@ +"""MIT License + +Copyright (c) 2019 David Luevano Alvarado + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +import time +from misc import printc +import numpy as np +from lj_matrix.gauss_kernel import gauss_kernel +from lj_matrix.cholesky_solve import cholesky_solve + + +def do_ml(desc_data, + energy_data, + training_size, + desc_type=None, + pipe=None, + test_size=None, + sigma=1000.0, + show_msgs=True): + """ + Does the ML methodology. + desc_data: descriptor (or representation) data. + energy_data: energy data associated with desc_data. + training_size: size of the training set to use. + desc_type: string with the name of the descriptor used. + pipe: for multiprocessing purposes. Sends the data calculated + through a pipe. + test_size: size of the test set to use. If no size is given, + the last remaining molecules are used. + sigma: depth of the kernel. + show_msgs: Show debug messages or not. + NOTE: desc_type is just a string and is only for identification purposes. + Also, training is done with the first part of the data and + testing with the ending part of the data. + """ + # Initial calculations for later use. + d_len = len(desc_data) + e_len = len(energy_data) + + if not desc_type: + desc_type = 'NOT SPECIFIED' + + if d_len != e_len: + printc(''.join(['ERROR. Descriptor data size different ', + 'than energy data size.']), 'RED') + return None + + if training_size >= d_len: + printc('ERROR. Training size greater or equal than data size.', 'RED') + return None + + if not test_size: + test_size = d_len - training_size + if test_size > 1500: + test_size = 1500 + + tic = time.perf_counter() + if show_msgs: + printc('{} ML started.'.format(desc_type), 'GREEN') + printc('\tTraining size: {}'.format(training_size), 'CYAN') + printc('\tTest size: {}'.format(test_size), 'CYAN') + printc('\tSigma: {}'.format(sigma), 'CYAN') + + Xcm_training = desc_data[:training_size] + Ycm_training = energy_data[:training_size] + Kcm_training = gauss_kernel(Xcm_training, Xcm_training, sigma) + alpha_cm = cholesky_solve(Kcm_training, Ycm_training) + + Xcm_test = desc_data[-test_size:] + Ycm_test = energy_data[-test_size:] + Kcm_test = gauss_kernel(Xcm_test, Xcm_training, sigma) + Ycm_predicted = np.dot(Kcm_test, alpha_cm) + + mae = np.mean(np.abs(Ycm_predicted - Ycm_test)) + if show_msgs: + printc('\tMAE for {}: {:.4f}'.format(desc_type, mae), 'GREEN') + + toc = time.perf_counter() + tictoc = toc - tic + if show_msgs: + printc('\t{} ML took {:.4f} seconds.'.format(desc_type, tictoc), + 'GREEN') + printc('\t\tTraining size: {}'.format(training_size), 'CYAN') + printc('\t\tTest size: {}'.format(test_size), 'CYAN') + printc('\t\tSigma: {}'.format(sigma), 'CYAN') + + if pipe: + pipe.send([desc_type, training_size, test_size, sigma, mae, tictoc]) + + return mae, tictoc diff --git a/lj_matrix/frob_norm.py b/lj_matrix/frob_norm.py new file mode 100644 index 000000000..4c3a2945d --- /dev/null +++ b/lj_matrix/frob_norm.py @@ -0,0 +1,51 @@ +"""MIT License + +Copyright (c) 2019 David Luevano Alvarado + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +import math + + +def frob_norm(array): + """ + Calculates the frobenius norm of a given array or matrix. + array: array of data. + """ + + arr_sh_len = len(array.shape) + arr_range = range(len(array)) + fn = 0.0 + + # If it is a 'vector'. + if arr_sh_len == 1: + for i in arr_range: + fn += array[i]*array[i] + + return math.sqrt(fn) + + # If it is a matrix. + elif arr_sh_len == 2: + for i in arr_range: + for j in arr_range: + fn += array[i, j]*array[i, j] + + return math.sqrt(fn) + else: + print('Error. Array size greater than 2 ({}).'.format(arr_sh_len)) diff --git a/lj_matrix/gauss_kernel.py b/lj_matrix/gauss_kernel.py new file mode 100644 index 000000000..5dd8e6406 --- /dev/null +++ b/lj_matrix/gauss_kernel.py @@ -0,0 +1,49 @@ +"""MIT License + +Copyright (c) 2019 David Luevano Alvarado + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +import math +import numpy as np +from lj_matrix.frob_norm import frob_norm + + +def gauss_kernel(X_1, X_2, sigma): + """ + Calculates the Gaussian Kernel. + X_1: first representations. + X_2: second representations. + sigma: kernel width. + """ + x1_l = len(X_1) + x1_range = range(x1_l) + x2_l = len(X_2) + x2_range = range(x2_l) + + inv_sigma = -0.5 / (sigma*sigma) + + K = np.zeros((x1_l, x2_l)) + for i in x1_range: + for j in x2_range: + f_norm = frob_norm(X_1[i] - X_2[j]) + # print(f_norm) + K[i, j] = math.exp(inv_sigma * f_norm) + + return K diff --git a/lj_matrix/lj_matrix.py b/lj_matrix/lj_matrix.py new file mode 100644 index 000000000..4f63e95ca --- /dev/null +++ b/lj_matrix/lj_matrix.py @@ -0,0 +1,207 @@ +"""MIT License + +Copyright (c) 2019 David Luevano Alvarado + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +import time +from lj_matrix.misc import printc +import math +import numpy as np +from numpy.linalg import eig + + +def lj_matrix(mol_data, + nc_data, + sigma=1.0, + epsilon=1.0, + max_len=25, + as_eig=True, + bohr_radius_units=False): + """ + Creates the Lennard-Jones Matrix from the molecule data given. + mol_data: molecule data, matrix of atom coordinates. + nc_data: nuclear charge data, array of atom data. + max_len: maximum amount of atoms in molecule. + as_eig: if data should be returned as matrix or array of eigenvalues. + bohr_radius_units: if units should be in bohr's radius units. + """ + if bohr_radius_units: + conversion_rate = 0.52917721067 + else: + conversion_rate = 1 + + mol_n = len(mol_data) + mol_nr = range(mol_n) + + if not mol_n == len(nc_data): + print(''.join(['Error. Molecule matrix dimension is different ', + 'than the nuclear charge array dimension.'])) + else: + if max_len < mol_n: + print(''.join(['Error. Molecule matrix dimension (mol_n) is ', + 'greater than max_len. Using mol_n.'])) + max_len = None + + if max_len: + lj = np.zeros((max_len, max_len)) + ml_r = range(max_len) + + # Actual calculation of the coulomb matrix. + for i in ml_r: + if i < mol_n: + x_i = mol_data[i, 0] + y_i = mol_data[i, 1] + z_i = mol_data[i, 2] + Z_i = nc_data[i] + else: + break + + for j in ml_r: + if j < mol_n: + x_j = mol_data[j, 0] + y_j = mol_data[j, 1] + z_j = mol_data[j, 2] + + x = (x_i-x_j)**2 + y = (y_i-y_j)**2 + z = (z_i-z_j)**2 + + if i == j: + lj[i, j] = (0.5*Z_i**2.4) + else: + # Calculations are done after i==j is checked + # so no division by zero is done. + + # A little play with r exponents + # so no square root is calculated. + # Conversion factor is included in r^2. + + # 1/r^2 + r_2 = sigma**2/(conversion_rate**2*(x + y + z)) + + r_6 = math.pow(r_2, 3) + r_12 = math.pow(r_6, 2) + lj[i, j] = (4*epsilon*(r_12 - r_6)) + else: + break + + # Now the value will be returned. + if as_eig: + lj_sorted = np.sort(eig(lj)[0])[::-1] + # Thanks to SO for the following lines of code. + # https://stackoverflow.com/a/43011036 + + # Keep zeros at the end. + mask = lj_sorted != 0. + f_mask = mask.sum(0, keepdims=1) >\ + np.arange(lj_sorted.shape[0]-1, -1, -1) + + f_mask = f_mask[::-1] + lj_sorted[f_mask] = lj_sorted[mask] + lj_sorted[~f_mask] = 0. + + return lj_sorted + + else: + return lj + + else: + lj_temp = [] + # Actual calculation of the coulomb matrix. + for i in mol_nr: + x_i = mol_data[i, 0] + y_i = mol_data[i, 1] + z_i = mol_data[i, 2] + Z_i = nc_data[i] + + lj_row = [] + for j in mol_nr: + x_j = mol_data[j, 0] + y_j = mol_data[j, 1] + z_j = mol_data[j, 2] + + x = (x_i-x_j)**2 + y = (y_i-y_j)**2 + z = (z_i-z_j)**2 + + if i == j: + lj_row.append(0.5*Z_i**2.4) + else: + # Calculations are done after i==j is checked + # so no division by zero is done. + + # A little play with r exponents + # so no square root is calculated. + # Conversion factor is included in r^2. + + # 1/r^2 + r_2 = sigma**2/(conversion_rate**2*(x + y + z)) + + r_6 = math.pow(r_2, 3) + r_12 = math.pow(r_6, 2) + lj_row.append(4*epsilon*(r_12 - r_6)) + + lj_temp.append(np.array(lj_row)) + + lj = np.array(lj_temp) + # Now the value will be returned. + if as_eig: + return np.sort(eig(lj)[0])[::-1] + else: + return lj + + +def lj_matrix_multiple(mol_data, + nc_data, + pipe=None, + sigma=1, + epsilon=1, + max_len=25, + as_eig=True, + bohr_radius_units=False): + """ + Calculates the Lennard-Jones Matrix of multiple molecules. + mol_data: molecule data, matrix of atom coordinates. + nc_data: nuclear charge data, array of atom data. + pipe: for multiprocessing purposes. Sends the data calculated + through a pipe. + max_len: maximum amount of atoms in molecule. + as_eig: if data should be returned as matrix or array of eigenvalues. + bohr_radius_units: if units should be in bohr's radius units. + """ + printc('L-J Matrices calculation started.', 'CYAN') + tic = time.perf_counter() + + ljm_data = np.array([lj_matrix(mol, + nc, + sigma, + epsilon, + max_len, + as_eig, + bohr_radius_units) + for mol, nc in zip(mol_data, nc_data)]) + + toc = time.perf_counter() + printc('\tL-JM calculation took {:.4f} seconds.'.format(toc-tic), 'GREEN') + + if pipe: + pipe.send(ljm_data) + + return ljm_data diff --git a/lj_matrix/misc.py b/lj_matrix/misc.py new file mode 100644 index 000000000..c50653a5c --- /dev/null +++ b/lj_matrix/misc.py @@ -0,0 +1,53 @@ +"""MIT License + +Copyright (c) 2019 David Luevano Alvarado + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +from colorama import init, Fore, Style + +init() + + +def printc(text, color): + """ + Prints texts normaly, but in color. Using colorama. + text: string with the text to print. + color: color to be used, same as available in colorama. + """ + color_dic = {'BLACK': Fore.BLACK, + 'RED': Fore.RED, + 'GREEN': Fore.GREEN, + 'YELLOW': Fore.YELLOW, + 'BLUE': Fore.BLUE, + 'MAGENTA': Fore.MAGENTA, + 'CYAN': Fore.CYAN, + 'WHITE': Fore.WHITE, + 'RESET': Fore.RESET} + + color_dic_keys = color_dic.keys() + if color not in color_dic_keys: + print(Fore.RED + + '\'{}\' not found, using default color.'.format(color) + + Style.RESET_ALL) + actual_color = Fore.RESET + else: + actual_color = color_dic[color] + + print(actual_color + text + Style.RESET_ALL) diff --git a/lj_matrix/read_qm7_data.py b/lj_matrix/read_qm7_data.py new file mode 100644 index 000000000..b54691fb0 --- /dev/null +++ b/lj_matrix/read_qm7_data.py @@ -0,0 +1,144 @@ +"""MIT License + +Copyright (c) 2019 David Luevano Alvarado + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +import os +import time +import numpy as np +import random +from lj_matrix.misc import printc + + +# 'periodic_table_of_elements.txt' retrieved from +# https://gist.github.com/GoodmanSciences/c2dd862cd38f21b0ad36b8f96b4bf1ee +def read_nc_data(data_path): + """ + Reads nuclear charge data from file and returns a dictionary. + data_path: path to the data directory. + """ + fname = 'periodic_table_of_elements.txt' + with open(''.join([data_path, '\\', fname]), 'r') as infile: + temp_lines = infile.readlines() + + del temp_lines[0] + + lines = [] + for temp_line in temp_lines: + new_line = temp_line.split(sep=',') + lines.append(new_line) + + # Dictionary of nuclear charge. + return {line[2]: int(line[0]) for line in lines} + + +# 'hof_qm7.txt.txt' retrieved from +# https://github.com/qmlcode/tutorial +def reas_db_data(zi_data, + data_path, + r_seed=111): + """ + Reads molecule database and extracts + its contents as usable variables. + zi_data: dictionary containing nuclear charge data. + data_path: path to the data directory. + r_seed: random seed. + """ + os.chdir(data_path) + + fname = 'hof_qm7.txt' + with open(fname, 'r') as infile: + lines = infile.readlines() + + # Temporary energy dictionary. + energy_temp = dict() + + for line in lines: + xyz_data = line.split() + + xyz_name = xyz_data[0] + hof = float(xyz_data[1]) + dftb = float(xyz_data[2]) + # print(xyz_name, hof, dftb) + + energy_temp[xyz_name] = np.array([hof, hof - dftb]) + + # Use a random seed. + random.seed(r_seed) + + et_keys = list(energy_temp.keys()) + random.shuffle(et_keys) + + # Temporary energy dictionary, shuffled. + energy_temp_shuffled = dict() + for key in et_keys: + energy_temp_shuffled.update({key: energy_temp[key]}) + + mol_data = [] + mol_nc_data = [] + # Actual reading of the xyz files. + for i, k in enumerate(energy_temp_shuffled.keys()): + with open(k, 'r') as xyz_file: + lines = xyz_file.readlines() + + len_lines = len(lines) + mol_temp_data = [] + mol_nc_temp_data = np.array(np.zeros(len_lines-2)) + for j, line in enumerate(lines[2:len_lines]): + line_list = line.split() + + mol_nc_temp_data[j] = float(zi_data[line_list[0]]) + line_data = np.array(np.asarray(line_list[1:4], dtype=float)) + mol_temp_data.append(line_data) + + mol_data.append(mol_temp_data) + mol_nc_data.append(mol_nc_temp_data) + + # Convert everything to a numpy array. + molecules = np.array([np.array(mol) for mol in mol_data]) + nuclear_charge = np.array([nc_d for nc_d in mol_nc_data]) + energy_pbe0 = np.array([energy_temp_shuffled[k][0] + for k in energy_temp_shuffled.keys()]) + energy_delta = np.array([energy_temp_shuffled[k][1] + for k in energy_temp_shuffled.keys()]) + + return molecules, nuclear_charge, energy_pbe0, energy_delta + + +def read_qm7_data(): + """ + Reads all the qm7 data. + """ + tic = time.perf_counter() + printc('Data reading started.', 'CYAN') + + init_path = os.getcwd() + os.chdir('data') + data_path = os.getcwd() + + zi_data = read_nc_data(data_path) + molecules, nuclear_charge, energy_pbe0, energy_delta = \ + reas_db_data(zi_data, data_path) + + os.chdir(init_path) + toc = time.perf_counter() + printc('\tData reading took {:.4f} seconds.'.format(toc-tic), 'GREEN') + + return zi_data, molecules, nuclear_charge, energy_pbe0, energy_delta diff --git a/main.py b/main.py deleted file mode 100644 index 3bf86572e..000000000 --- a/main.py +++ /dev/null @@ -1,238 +0,0 @@ -"""MIT License - -Copyright (c) 2019 David Luevano Alvarado - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -import time -from multiprocessing import Process, Pipe -# import matplotlib.pyplot as plt -import pandas as pd -from misc import printc -from read_qm7_data import read_qm7_data -from c_matrix import c_matrix_multiple -from lj_matrix import lj_matrix_multiple -from do_ml import do_ml - - -# Test -def ml(): - """ - Main function that does the whole ML process. - """ - # Initialization time. - init_time = time.perf_counter() - - # Data reading. - zi_data, molecules, nuclear_charge, energy_pbe0, energy_delta =\ - read_qm7_data() - - # Matrices calculation. - procs = [] - pipes = [] - - # cm_recv, cm_send = Pipe(False) - # p1 = Process(target=c_matrix_multiple, - # args=(molecules, nuclear_charge, cm_send)) - # procs.append(p1) - # pipes.append(cm_recv) - # p1.start() - - ljm_recv, ljm_send = Pipe(False) - p2 = Process(target=lj_matrix_multiple, - args=(molecules, nuclear_charge, ljm_send, 1, 0.25)) - procs.append(p2) - pipes.append(ljm_recv) - p2.start() - - # cm_data = pipes[0].recv() - ljm_data = pipes[0].recv() - - for proc in procs: - proc.join() - - # ML calculation. - procs = [] - # cm_pipes = [] - ljm_pipes = [] - for i in range(1500, 6500 + 1, 500): - # cm_recv, cm_send = Pipe(False) - # p1 = Process(target=do_ml, - # args=(cm_data, energy_pbe0, i, 'CM', cm_send)) - # procs.append(p1) - # cm_pipes.append(cm_recv) - # p1.start() - - ljm_recv, ljm_send = Pipe(False) - p2 = Process(target=do_ml, - args=(ljm_data, energy_pbe0, i, 'L-JM', ljm_send)) - procs.append(p2) - ljm_pipes.append(ljm_recv) - p2.start() - - # cm_bench_results = [] - ljm_bench_results = [] - for ljd_pipe in ljm_pipes: # cd_pipe, ljd_pipe in zip(cm_pipes, ljm_pipes): - # cm_bench_results.append(cd_pipe.recv()) - ljm_bench_results.append(ljd_pipe.recv()) - - for proc in procs: - proc.join() - - with open('data\\benchmarks.csv', 'a') as save_file: - # save_file.write(''.join(['ml_type,tr_size,te_size,kernel_s,', - # 'mae,time,lj_s,lj_e,date_ran\n'])) - date = '/'.join([str(field) for field in time.localtime()[:3][::-1]]) - for ljm in ljm_bench_results: # cm, ljm, in zip(cm_bench_results, ljm_bench_results): - # cm_text = ','.join([str(field) for field in cm])\ - # + ',' + date + '\n' - ljm_text = ','.join([str(field) for field in ljm])\ - + ',1,0.25,' + date + '\n' - # save_file.write(cm_text) - save_file.write(ljm_text) - - # End of program - end_time = time.perf_counter() - printc('Program took {:.4f} seconds.'.format(end_time - init_time), - 'CYAN') - - -def pl(): - """ - Function for plotting the benchmarks. - """ - # Original columns. - or_cols = ['ml_type', - 'tr_size', - 'te_size', - 'kernel_s', - 'mae', - 'time', - 'lj_s', - 'lj_e', - 'date_ran'] - # Drop some original columns. - dor_cols = ['te_size', - 'kernel_s', - 'time', - 'date_ran'] - - # Read benchmarks data and drop some columns. - data_temp = pd.read_csv('data\\benchmarks.csv',) - data = pd.DataFrame(data_temp, columns=or_cols) - data = data.drop(columns=dor_cols) - - # Get the data of the first benchmarks and drop unnecesary columns. - first_data = pd.DataFrame(data, index=range(0, 22)) - first_data = first_data.drop(columns=['lj_s', 'lj_e']) - - # Columns to keep temporarily. - fd_columns = ['ml_type', - 'tr_size', - 'mae'] - - # Create new dataframes for each matrix descriptor and fill them. - first_data_cm = pd.DataFrame(columns=fd_columns) - first_data_ljm = pd.DataFrame(columns=fd_columns) - for i in range(first_data.shape[0]): - temp_df = first_data.iloc[[i]] - if first_data.at[i, 'ml_type'] == 'CM': - first_data_cm = first_data_cm.append(temp_df) - else: - first_data_ljm = first_data_ljm.append(temp_df) - - # Drop unnecesary column and rename 'mae' for later use. - first_data_cm = first_data_cm.drop(columns=['ml_type'])\ - .rename(columns={'mae': 'cm_mae'}) - first_data_ljm = first_data_ljm.drop(columns=['ml_type'])\ - .rename(columns={'mae': 'ljm_mae'}) - # print(first_data_cm) - # print(first_data_ljm) - - # Get the cm data axis so it can be joined with the ljm data axis. - cm_axis = first_data_cm.plot(x='tr_size', - y='cm_mae', - kind='line') - # Get the ljm data axis and join it with the cm one. - plot_axis = first_data_ljm.plot(ax=cm_axis, - x='tr_size', - y='ljm_mae', - kind='line') - plot_axis.set_xlabel('tr_size') - plot_axis.set_ylabel('mae') - plot_axis.set_title('mae for different tr_sizes') - # Get the figure and save it. - # plot_axis.get_figure().savefig('.figs\\mae_diff_tr_sizes.pdf') - - # Get the rest of the benchmark data and drop unnecesary column. - new_data = data.drop(index=range(0, 22)) - new_data = new_data.drop(columns=['ml_type']) - - # Get the first set and rename it. - nd_first = first_data_ljm.rename(columns={'ljm_mae': '1, 1'}) - ndf_axis = nd_first.plot(x='tr_size', - y='1, 1', - kind='line') - last_axis = ndf_axis - for i in range(22, 99, 11): - lj_s = new_data['lj_s'][i] - lj_e = new_data['lj_e'][i] - new_mae = '{}, {}'.format(lj_s, lj_e) - nd_temp = pd.DataFrame(new_data, index=range(i, i + 11))\ - .drop(columns=['lj_s', 'lj_e'])\ - .rename(columns={'mae': new_mae}) - last_axis = nd_temp.plot(ax=last_axis, - x='tr_size', - y=new_mae, - kind='line') - print(nd_temp) - - last_axis.set_xlabel('tr_size') - last_axis.set_ylabel('mae') - last_axis.set_title('mae for different parameters of lj(s)') - - last_axis.get_figure().savefig('.figs\\mae_diff_param_lj_s.pdf') - - ndf_axis = nd_first.plot(x='tr_size', - y='1, 1', - kind='line') - last_axis = ndf_axis - for i in range(99, data.shape[0], 11): - lj_s = new_data['lj_s'][i] - lj_e = new_data['lj_e'][i] - new_mae = '{}, {}'.format(lj_s, lj_e) - nd_temp = pd.DataFrame(new_data, index=range(i, i + 11))\ - .drop(columns=['lj_s', 'lj_e'])\ - .rename(columns={'mae': new_mae}) - last_axis = nd_temp.plot(ax=last_axis, - x='tr_size', - y=new_mae, - kind='line') - print(nd_temp) - - last_axis.set_xlabel('tr_size') - last_axis.set_ylabel('mae') - last_axis.set_title('mae for different parameters of lj(e)') - - last_axis.get_figure().savefig('.figs\\mae_diff_param_lj_e.pdf') - - -if __name__ == '__main__': - # ml() - pl() diff --git a/misc.py b/misc.py deleted file mode 100644 index c50653a5c..000000000 --- a/misc.py +++ /dev/null @@ -1,53 +0,0 @@ -"""MIT License - -Copyright (c) 2019 David Luevano Alvarado - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -from colorama import init, Fore, Style - -init() - - -def printc(text, color): - """ - Prints texts normaly, but in color. Using colorama. - text: string with the text to print. - color: color to be used, same as available in colorama. - """ - color_dic = {'BLACK': Fore.BLACK, - 'RED': Fore.RED, - 'GREEN': Fore.GREEN, - 'YELLOW': Fore.YELLOW, - 'BLUE': Fore.BLUE, - 'MAGENTA': Fore.MAGENTA, - 'CYAN': Fore.CYAN, - 'WHITE': Fore.WHITE, - 'RESET': Fore.RESET} - - color_dic_keys = color_dic.keys() - if color not in color_dic_keys: - print(Fore.RED - + '\'{}\' not found, using default color.'.format(color) - + Style.RESET_ALL) - actual_color = Fore.RESET - else: - actual_color = color_dic[color] - - print(actual_color + text + Style.RESET_ALL) diff --git a/read_qm7_data.py b/read_qm7_data.py deleted file mode 100644 index 068ea1a42..000000000 --- a/read_qm7_data.py +++ /dev/null @@ -1,144 +0,0 @@ -"""MIT License - -Copyright (c) 2019 David Luevano Alvarado - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -import os -import time -import numpy as np -import random -from misc import printc - - -# 'periodic_table_of_elements.txt' retrieved from -# https://gist.github.com/GoodmanSciences/c2dd862cd38f21b0ad36b8f96b4bf1ee -def read_nc_data(data_path): - """ - Reads nuclear charge data from file and returns a dictionary. - data_path: path to the data directory. - """ - fname = 'periodic_table_of_elements.txt' - with open(''.join([data_path, '\\', fname]), 'r') as infile: - temp_lines = infile.readlines() - - del temp_lines[0] - - lines = [] - for temp_line in temp_lines: - new_line = temp_line.split(sep=',') - lines.append(new_line) - - # Dictionary of nuclear charge. - return {line[2]: int(line[0]) for line in lines} - - -# 'hof_qm7.txt.txt' retrieved from -# https://github.com/qmlcode/tutorial -def reas_db_data(zi_data, - data_path, - r_seed=111): - """ - Reads molecule database and extracts - its contents as usable variables. - zi_data: dictionary containing nuclear charge data. - data_path: path to the data directory. - r_seed: random seed. - """ - os.chdir(data_path) - - fname = 'hof_qm7.txt' - with open(fname, 'r') as infile: - lines = infile.readlines() - - # Temporary energy dictionary. - energy_temp = dict() - - for line in lines: - xyz_data = line.split() - - xyz_name = xyz_data[0] - hof = float(xyz_data[1]) - dftb = float(xyz_data[2]) - # print(xyz_name, hof, dftb) - - energy_temp[xyz_name] = np.array([hof, hof - dftb]) - - # Use a random seed. - random.seed(r_seed) - - et_keys = list(energy_temp.keys()) - random.shuffle(et_keys) - - # Temporary energy dictionary, shuffled. - energy_temp_shuffled = dict() - for key in et_keys: - energy_temp_shuffled.update({key: energy_temp[key]}) - - mol_data = [] - mol_nc_data = [] - # Actual reading of the xyz files. - for i, k in enumerate(energy_temp_shuffled.keys()): - with open(k, 'r') as xyz_file: - lines = xyz_file.readlines() - - len_lines = len(lines) - mol_temp_data = [] - mol_nc_temp_data = np.array(np.zeros(len_lines-2)) - for j, line in enumerate(lines[2:len_lines]): - line_list = line.split() - - mol_nc_temp_data[j] = float(zi_data[line_list[0]]) - line_data = np.array(np.asarray(line_list[1:4], dtype=float)) - mol_temp_data.append(line_data) - - mol_data.append(mol_temp_data) - mol_nc_data.append(mol_nc_temp_data) - - # Convert everything to a numpy array. - molecules = np.array([np.array(mol) for mol in mol_data]) - nuclear_charge = np.array([nc_d for nc_d in mol_nc_data]) - energy_pbe0 = np.array([energy_temp_shuffled[k][0] - for k in energy_temp_shuffled.keys()]) - energy_delta = np.array([energy_temp_shuffled[k][1] - for k in energy_temp_shuffled.keys()]) - - return molecules, nuclear_charge, energy_pbe0, energy_delta - - -def read_qm7_data(): - """ - Reads all the qm7 data. - """ - tic = time.perf_counter() - printc('Data reading started.', 'CYAN') - - init_path = os.getcwd() - os.chdir('data') - data_path = os.getcwd() - - zi_data = read_nc_data(data_path) - molecules, nuclear_charge, energy_pbe0, energy_delta = \ - reas_db_data(zi_data, data_path) - - os.chdir(init_path) - toc = time.perf_counter() - printc('\tData reading took {:.4f} seconds.'.format(toc-tic), 'GREEN') - - return zi_data, molecules, nuclear_charge, energy_pbe0, energy_delta -- cgit v1.2.3-54-g00ecf