summaryrefslogtreecommitdiff
path: root/lj_matrix
diff options
context:
space:
mode:
authorDavid Luevano Alvarado <55825613+luevano@users.noreply.github.com>2020-01-23 18:29:21 -0700
committerGitHub <noreply@github.com>2020-01-23 18:29:21 -0700
commit7f122fdb38cd34916820d6ff4fb0e3a49fde80fc (patch)
tree47efddb979957029945a473fde6ed2cde2c2b196 /lj_matrix
parentbd4fb4d77919bc75d3d181e124c3c5752a74dff3 (diff)
parent4704314c9b4d1066383da5c3d6ca87bba9067c8d (diff)
Merge pull request #1 from luevano/add_parallel
Add parallel
Diffstat (limited to 'lj_matrix')
-rw-r--r--lj_matrix/__init__.py48
-rw-r--r--lj_matrix/__main__.py38
-rw-r--r--lj_matrix/c_matrix.py179
-rw-r--r--lj_matrix/cholesky_solve.py64
-rw-r--r--lj_matrix/do_ml.py227
-rw-r--r--lj_matrix/frob_norm.py51
-rw-r--r--lj_matrix/gauss_kernel.py49
-rw-r--r--lj_matrix/lj_matrix.py222
-rw-r--r--lj_matrix/misc.py174
-rw-r--r--lj_matrix/parallel_create_matrices.py85
-rw-r--r--lj_matrix/read_qm7_data.py145
-rw-r--r--lj_matrix/version.py23
12 files changed, 1305 insertions, 0 deletions
diff --git a/lj_matrix/__init__.py b/lj_matrix/__init__.py
new file mode 100644
index 000000000..a430aac68
--- /dev/null
+++ b/lj_matrix/__init__.py
@@ -0,0 +1,48 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+from lj_matrix.read_qm7_data import read_nc_data, read_db_data, read_qm7_data
+from lj_matrix.c_matrix import c_matrix, c_matrix_multiple
+from lj_matrix.lj_matrix import lj_matrix, lj_matrix_multiple
+from lj_matrix.frob_norm import frob_norm
+from lj_matrix.gauss_kernel import gauss_kernel
+from lj_matrix.cholesky_solve import cholesky_solve
+from lj_matrix.do_ml import do_ml
+from lj_matrix.parallel_create_matrices import parallel_create_matrices
+from lj_matrix.misc import plot_benchmarks
+
+
+# If somebody does "from package import *", this is what they will
+# be able to access:
+__all__ = ['read_nc_data',
+ 'read_db_data',
+ 'read_qm7_data',
+ 'c_matrix',
+ 'c_matrix_multiple',
+ 'lj_matrix',
+ 'lj_matrix_multiple',
+ 'frob_norm',
+ 'gauss_kernel',
+ 'cholesky_solve',
+ 'do_ml',
+ 'parallel_create_matrices',
+ 'plot_benchmarks']
diff --git a/lj_matrix/__main__.py b/lj_matrix/__main__.py
new file mode 100644
index 000000000..688e5adcc
--- /dev/null
+++ b/lj_matrix/__main__.py
@@ -0,0 +1,38 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+from lj_matrix.do_ml import do_ml
+# from lj_matrix.misc import plot_benchmarks
+
+if __name__ == '__main__':
+ do_ml(min_training_size=1500,
+ max_training_size=2000,
+ training_increment_size=500,
+ test_size=None,
+ ljm_diag_value=None,
+ ljm_sigma=1.0,
+ ljm_epsilon=1.0,
+ r_seed=111,
+ save_benchmarks=False,
+ show_msgs=True)
+ # plot_benchmarks()
+ print('OK!')
diff --git a/lj_matrix/c_matrix.py b/lj_matrix/c_matrix.py
new file mode 100644
index 000000000..f21ccfd8c
--- /dev/null
+++ b/lj_matrix/c_matrix.py
@@ -0,0 +1,179 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+import time
+import math
+import numpy as np
+from numpy.linalg import eig
+from lj_matrix.misc import printc
+
+
+def c_matrix(mol_data,
+ nc_data,
+ max_len=25,
+ as_eig=True,
+ bohr_radius_units=False):
+ """
+ Creates the Coulomb Matrix from the molecule data given.
+ mol_data: molecule data, matrix of atom coordinates.
+ nc_data: nuclear charge data, array of atom data.
+ max_len: maximum amount of atoms in molecule.
+ as_eig: if data should be returned as matrix or array of eigenvalues.
+ bohr_radius_units: if units should be in bohr's radius units.
+ """
+ if bohr_radius_units:
+ conversion_rate = 0.52917721067
+ else:
+ conversion_rate = 1
+
+ mol_n = len(mol_data)
+ mol_nr = range(mol_n)
+
+ if not mol_n == len(nc_data):
+ print(''.join(['Error. Molecule matrix dimension is different ',
+ 'than the nuclear charge array dimension.']))
+ else:
+ if max_len < mol_n:
+ print(''.join(['Error. Molecule matrix dimension (mol_n) is ',
+ 'greater than max_len. Using mol_n.']))
+ max_len = None
+
+ if max_len:
+ cm = np.zeros((max_len, max_len))
+ ml_r = range(max_len)
+
+ # Actual calculation of the coulomb matrix.
+ for i in ml_r:
+ if i < mol_n:
+ x_i = mol_data[i, 0]
+ y_i = mol_data[i, 1]
+ z_i = mol_data[i, 2]
+ Z_i = nc_data[i]
+ else:
+ break
+
+ for j in ml_r:
+ if j < mol_n:
+ x_j = mol_data[j, 0]
+ y_j = mol_data[j, 1]
+ z_j = mol_data[j, 2]
+ Z_j = nc_data[j]
+
+ x = (x_i-x_j)**2
+ y = (y_i-y_j)**2
+ z = (z_i-z_j)**2
+
+ if i == j:
+ cm[i, j] = (0.5*Z_i**2.4)
+ else:
+ cm[i, j] = (conversion_rate*Z_i*Z_j/math.sqrt(x
+ + y
+ + z))
+ else:
+ break
+
+ # Now the value will be returned.
+ if as_eig:
+ cm_sorted = np.sort(eig(cm)[0])[::-1]
+ # Thanks to SO for the following lines of code.
+ # https://stackoverflow.com/a/43011036
+
+ # Keep zeros at the end.
+ mask = cm_sorted != 0.
+ f_mask = mask.sum(0, keepdims=1) >\
+ np.arange(cm_sorted.shape[0]-1, -1, -1)
+
+ f_mask = f_mask[::-1]
+ cm_sorted[f_mask] = cm_sorted[mask]
+ cm_sorted[~f_mask] = 0.
+
+ return cm_sorted
+
+ else:
+ return cm
+
+ else:
+ cm_temp = []
+ # Actual calculation of the coulomb matrix.
+ for i in mol_nr:
+ x_i = mol_data[i, 0]
+ y_i = mol_data[i, 1]
+ z_i = mol_data[i, 2]
+ Z_i = nc_data[i]
+
+ cm_row = []
+ for j in mol_nr:
+ x_j = mol_data[j, 0]
+ y_j = mol_data[j, 1]
+ z_j = mol_data[j, 2]
+ Z_j = nc_data[j]
+
+ x = (x_i-x_j)**2
+ y = (y_i-y_j)**2
+ z = (z_i-z_j)**2
+
+ if i == j:
+ cm_row.append(0.5*Z_i**2.4)
+ else:
+ cm_row.append(conversion_rate*Z_i*Z_j/math.sqrt(x
+ + y
+ + z))
+
+ cm_temp.append(np.array(cm_row))
+
+ cm = np.array(cm_temp)
+ # Now the value will be returned.
+ if as_eig:
+ return np.sort(eig(cm)[0])[::-1]
+ else:
+ return cm
+
+
+def c_matrix_multiple(mol_data,
+ nc_data,
+ pipe=None,
+ max_len=25,
+ as_eig=True,
+ bohr_radius_units=False):
+ """
+ Calculates the Coulomb Matrix of multiple molecules.
+ mol_data: molecule data, matrix of atom coordinates.
+ nc_data: nuclear charge data, array of atom data.
+ pipe: for multiprocessing purposes. Sends the data calculated
+ through a pipe.
+ max_len: maximum amount of atoms in molecule.
+ as_eig: if data should be returned as matrix or array of eigenvalues.
+ bohr_radius_units: if units should be in bohr's radius units.
+ """
+ printc('Coulomb Matrices calculation started.', 'CYAN')
+ tic = time.perf_counter()
+
+ cm_data = np.array([c_matrix(mol, nc, max_len, as_eig, bohr_radius_units)
+ for mol, nc in zip(mol_data, nc_data)])
+
+ toc = time.perf_counter()
+ printc('\tCM calculation took {:.4f} seconds.'.format(toc - tic), 'GREEN')
+
+ if pipe:
+ pipe.send(cm_data)
+
+ return cm_data
diff --git a/lj_matrix/cholesky_solve.py b/lj_matrix/cholesky_solve.py
new file mode 100644
index 000000000..bc6a572a3
--- /dev/null
+++ b/lj_matrix/cholesky_solve.py
@@ -0,0 +1,64 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+import numpy as np
+from numpy.linalg import cholesky
+
+
+def cholesky_solve(K, y):
+ """
+ Applies Cholesky decomposition to obtain the 'alpha coeficients'.
+ K: kernel.
+ y: known parameters.
+ """
+ # The initial mathematical problem is to solve Ka=y.
+
+ # First, add a small lambda value.
+ K[np.diag_indices_from(K)] += 1e-8
+
+ # Get the Cholesky decomposition of the kernel.
+ L = cholesky(K)
+ size = len(L)
+
+ # Solve Lx=y for x.
+ x = np.zeros(size)
+ x[0] = y[0] / L[0, 0]
+ for i in range(1, size):
+ temp_sum = 0.0
+ for j in range(i):
+ temp_sum += L[i, j] * x[j]
+ x[i] = (y[i] - temp_sum) / L[i, i]
+
+ # Now, solve LTa=x for a.
+ L2 = L.T
+ a = np.zeros(size)
+ a_ms = size - 1
+ a[a_ms] = x[a_ms] / L2[a_ms, a_ms]
+ # Because of the form of L2 (upper triangular matriz), an inversion of
+ # range() needs to be done.
+ for i in range(0, a_ms)[::-1]:
+ temp_sum = 0.0
+ for j in range(i, size)[::-1]:
+ temp_sum += L2[i, j] * a[j]
+ a[i] = (x[i] - temp_sum) / L2[i, i]
+
+ return a
diff --git a/lj_matrix/do_ml.py b/lj_matrix/do_ml.py
new file mode 100644
index 000000000..25a55e823
--- /dev/null
+++ b/lj_matrix/do_ml.py
@@ -0,0 +1,227 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+import time
+import numpy as np
+from multiprocessing import Process, Pipe
+from lj_matrix.misc import printc
+from lj_matrix.gauss_kernel import gauss_kernel
+from lj_matrix.cholesky_solve import cholesky_solve
+from lj_matrix.read_qm7_data import read_qm7_data
+from lj_matrix.parallel_create_matrices import parallel_create_matrices
+
+
+def ml(desc_data,
+ energy_data,
+ training_size,
+ desc_type=None,
+ pipe=None,
+ test_size=None,
+ sigma=1000.0,
+ show_msgs=True):
+ """
+ Does the ML methodology.
+ desc_data: descriptor (or representation) data.
+ energy_data: energy data associated with desc_data.
+ training_size: size of the training set to use.
+ desc_type: string with the name of the descriptor used.
+ pipe: for multiprocessing purposes. Sends the data calculated
+ through a pipe.
+ test_size: size of the test set to use. If no size is given,
+ the last remaining molecules are used.
+ sigma: depth of the kernel.
+ show_msgs: Show debug messages or not.
+ NOTE: desc_type is just a string and is only for identification purposes.
+ Also, training is done with the first part of the data and
+ testing with the ending part of the data.
+ """
+ tic = time.perf_counter()
+ # Initial calculations for later use.
+ d_len = len(desc_data)
+ e_len = len(energy_data)
+
+ if not desc_type:
+ desc_type = 'NOT SPECIFIED'
+
+ if d_len != e_len:
+ printc(''.join(['ERROR. Descriptor data size different ',
+ 'than energy data size.']), 'RED')
+ return None
+
+ if training_size >= d_len:
+ printc('ERROR. Training size greater or equal than data size.', 'RED')
+ return None
+
+ if not test_size:
+ test_size = d_len - training_size
+ if test_size > 1500:
+ test_size = 1500
+
+ if show_msgs:
+ printc('{} ML started.'.format(desc_type), 'GREEN')
+ printc('\tTraining size: {}'.format(training_size), 'CYAN')
+ printc('\tTest size: {}'.format(test_size), 'CYAN')
+ printc('\tSigma: {}'.format(sigma), 'CYAN')
+
+ X_training = desc_data[:training_size]
+ Y_training = energy_data[:training_size]
+ K_training = gauss_kernel(X_training, X_training, sigma)
+ alpha_ = cholesky_solve(K_training, Y_training)
+
+ X_test = desc_data[-test_size:]
+ Y_test = energy_data[-test_size:]
+ K_test = gauss_kernel(X_test, X_training, sigma)
+ Y_predicted = np.dot(K_test, alpha_)
+
+ mae = np.mean(np.abs(Y_predicted - Y_test))
+ if show_msgs:
+ printc('\tMAE for {}: {:.4f}'.format(desc_type, mae), 'GREEN')
+
+ toc = time.perf_counter()
+ tictoc = toc - tic
+ if show_msgs:
+ printc('\t{} ML took {:.4f} seconds.'.format(desc_type, tictoc),
+ 'GREEN')
+ printc('\t\tTraining size: {}'.format(training_size), 'CYAN')
+ printc('\t\tTest size: {}'.format(test_size), 'CYAN')
+ printc('\t\tSigma: {}'.format(sigma), 'CYAN')
+
+ if pipe:
+ pipe.send([desc_type, training_size, test_size, sigma, mae, tictoc])
+
+ return mae, tictoc
+
+
+def do_ml(min_training_size,
+ max_training_size=None,
+ training_increment_size=500,
+ test_size=None,
+ ljm_diag_value=None,
+ ljm_sigma=1.0,
+ ljm_epsilon=1.0,
+ r_seed=111,
+ save_benchmarks=False,
+ max_len=25,
+ as_eig=True,
+ bohr_radius_units=False,
+ sigma=1000.0,
+ show_msgs=True):
+ """
+ Main function that does the whole ML process.
+ min_training_size: minimum training size.
+ max_training_size: maximum training size.
+ training_increment_size: training increment size.
+ test_size: size of the test set to use. If no size is given,
+ the last remaining molecules are used.
+ ljm_diag_value: if a special diagonal value should be used in lj matrix.
+ ljm_sigma: sigma value for lj matrix.
+ ljm_epsilon: epsilon value for lj matrix.
+ r_seed: random seed to use for the shuffling.
+ save_benchmarks: if benchmarks should be saved.
+ max_len: maximum amount of atoms in molecule.
+ as_eig: if data should be returned as matrix or array of eigenvalues.
+ bohr_radius_units: if units should be in bohr's radius units.
+ sigma: depth of the kernel.
+ show_msgs: Show debug messages or not.
+ """
+ # Initialization time.
+ init_time = time.perf_counter()
+ if not max_training_size:
+ max_training_size = min_training_size + training_increment_size
+
+ # Data reading.
+ molecules, nuclear_charge, energy_pbe0, energy_delta =\
+ read_qm7_data(r_seed)
+
+ # Matrices calculation.
+ cm_data, ljm_data = parallel_create_matrices(molecules,
+ nuclear_charge,
+ ljm_diag_value,
+ ljm_sigma,
+ ljm_epsilon,
+ max_len,
+ as_eig,
+ bohr_radius_units)
+
+ # ML calculation.
+ procs = []
+ cm_pipes = []
+ ljm_pipes = []
+ for i in range(min_training_size,
+ max_training_size + 1,
+ training_increment_size):
+ cm_recv, cm_send = Pipe(False)
+ p1 = Process(target=ml,
+ args=(cm_data,
+ energy_pbe0,
+ i,
+ 'CM',
+ cm_send,
+ test_size,
+ sigma,
+ show_msgs))
+ procs.append(p1)
+ cm_pipes.append(cm_recv)
+ p1.start()
+
+ ljm_recv, ljm_send = Pipe(False)
+ p2 = Process(target=ml,
+ args=(ljm_data,
+ energy_pbe0,
+ i,
+ 'L-JM',
+ ljm_send,
+ test_size,
+ sigma,
+ show_msgs))
+ procs.append(p2)
+ ljm_pipes.append(ljm_recv)
+ p2.start()
+
+ cm_bench_results = []
+ ljm_bench_results = []
+ for cd_pipe, ljd_pipe in zip(cm_pipes, ljm_pipes):
+ cm_bench_results.append(cd_pipe.recv())
+ ljm_bench_results.append(ljd_pipe.recv())
+
+ for proc in procs:
+ proc.join()
+
+ if save_benchmarks:
+ with open('data\\benchmarks.csv', 'a') as save_file:
+ # save_file.write(''.join(['ml_type,tr_size,te_size,kernel_s,',
+ # 'mae,time,lj_s,lj_e,date_ran\n']))
+ ltime = time.localtime()[:3][::-1]
+ ljm_se = ',' + str(ljm_sigma) + ',' + str(ljm_epsilon) + ','
+ date = '/'.join([str(field) for field in ltime])
+ for cm, ljm, in zip(cm_bench_results, ljm_bench_results):
+ cm_text = ','.join([str(field) for field in cm])\
+ + ',' + date + '\n'
+ ljm_text = ','.join([str(field) for field in ljm])\
+ + ljm_se + date + '\n'
+ save_file.write(cm_text)
+ save_file.write(ljm_text)
+
+ # End of program
+ end_time = time.perf_counter()
+ printc('Program took {:.4f} seconds.'.format(end_time - init_time),
+ 'CYAN')
diff --git a/lj_matrix/frob_norm.py b/lj_matrix/frob_norm.py
new file mode 100644
index 000000000..4c3a2945d
--- /dev/null
+++ b/lj_matrix/frob_norm.py
@@ -0,0 +1,51 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+import math
+
+
+def frob_norm(array):
+ """
+ Calculates the frobenius norm of a given array or matrix.
+ array: array of data.
+ """
+
+ arr_sh_len = len(array.shape)
+ arr_range = range(len(array))
+ fn = 0.0
+
+ # If it is a 'vector'.
+ if arr_sh_len == 1:
+ for i in arr_range:
+ fn += array[i]*array[i]
+
+ return math.sqrt(fn)
+
+ # If it is a matrix.
+ elif arr_sh_len == 2:
+ for i in arr_range:
+ for j in arr_range:
+ fn += array[i, j]*array[i, j]
+
+ return math.sqrt(fn)
+ else:
+ print('Error. Array size greater than 2 ({}).'.format(arr_sh_len))
diff --git a/lj_matrix/gauss_kernel.py b/lj_matrix/gauss_kernel.py
new file mode 100644
index 000000000..5dd8e6406
--- /dev/null
+++ b/lj_matrix/gauss_kernel.py
@@ -0,0 +1,49 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+import math
+import numpy as np
+from lj_matrix.frob_norm import frob_norm
+
+
+def gauss_kernel(X_1, X_2, sigma):
+ """
+ Calculates the Gaussian Kernel.
+ X_1: first representations.
+ X_2: second representations.
+ sigma: kernel width.
+ """
+ x1_l = len(X_1)
+ x1_range = range(x1_l)
+ x2_l = len(X_2)
+ x2_range = range(x2_l)
+
+ inv_sigma = -0.5 / (sigma*sigma)
+
+ K = np.zeros((x1_l, x2_l))
+ for i in x1_range:
+ for j in x2_range:
+ f_norm = frob_norm(X_1[i] - X_2[j])
+ # print(f_norm)
+ K[i, j] = math.exp(inv_sigma * f_norm)
+
+ return K
diff --git a/lj_matrix/lj_matrix.py b/lj_matrix/lj_matrix.py
new file mode 100644
index 000000000..6739ae283
--- /dev/null
+++ b/lj_matrix/lj_matrix.py
@@ -0,0 +1,222 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+import time
+import math
+import numpy as np
+from numpy.linalg import eig
+from lj_matrix.misc import printc
+
+
+def lj_matrix(mol_data,
+ nc_data,
+ diag_value=None,
+ sigma=1.0,
+ epsilon=1.0,
+ max_len=25,
+ as_eig=True,
+ bohr_radius_units=False):
+ """
+ Creates the Lennard-Jones Matrix from the molecule data given.
+ mol_data: molecule data, matrix of atom coordinates.
+ nc_data: nuclear charge data, array of atom data.
+ diag_value: if special diagonal value is to be used.
+ sigma: sigma value.
+ epsilon: epsilon value.
+ max_len: maximum amount of atoms in molecule.
+ as_eig: if data should be returned as matrix or array of eigenvalues.
+ bohr_radius_units: if units should be in bohr's radius units.
+ """
+ if bohr_radius_units:
+ conversion_rate = 0.52917721067
+ else:
+ conversion_rate = 1
+
+ mol_n = len(mol_data)
+ mol_nr = range(mol_n)
+
+ if not mol_n == len(nc_data):
+ print(''.join(['Error. Molecule matrix dimension is different ',
+ 'than the nuclear charge array dimension.']))
+ else:
+ if max_len < mol_n:
+ print(''.join(['Error. Molecule matrix dimension (mol_n) is ',
+ 'greater than max_len. Using mol_n.']))
+ max_len = None
+
+ if max_len:
+ lj = np.zeros((max_len, max_len))
+ ml_r = range(max_len)
+
+ # Actual calculation of the coulomb matrix.
+ for i in ml_r:
+ if i < mol_n:
+ x_i = mol_data[i, 0]
+ y_i = mol_data[i, 1]
+ z_i = mol_data[i, 2]
+ Z_i = nc_data[i]
+ else:
+ break
+
+ for j in ml_r:
+ if j < mol_n:
+ x_j = mol_data[j, 0]
+ y_j = mol_data[j, 1]
+ z_j = mol_data[j, 2]
+
+ x = (x_i-x_j)**2
+ y = (y_i-y_j)**2
+ z = (z_i-z_j)**2
+
+ if i == j:
+ if diag_value is None:
+ lj[i, j] = (0.5*Z_i**2.4)
+ else:
+ lj[i, j] = diag_value
+ else:
+ # Calculations are done after i==j is checked
+ # so no division by zero is done.
+
+ # A little play with r exponents
+ # so no square root is calculated.
+ # Conversion factor is included in r^2.
+
+ # 1/r^2
+ r_2 = sigma**2/(conversion_rate**2*(x + y + z))
+
+ r_6 = math.pow(r_2, 3)
+ r_12 = math.pow(r_6, 2)
+ lj[i, j] = (4*epsilon*(r_12 - r_6))
+ else:
+ break
+
+ # Now the value will be returned.
+ if as_eig:
+ lj_sorted = np.sort(eig(lj)[0])[::-1]
+ # Thanks to SO for the following lines of code.
+ # https://stackoverflow.com/a/43011036
+
+ # Keep zeros at the end.
+ mask = lj_sorted != 0.
+ f_mask = mask.sum(0, keepdims=1) >\
+ np.arange(lj_sorted.shape[0]-1, -1, -1)
+
+ f_mask = f_mask[::-1]
+ lj_sorted[f_mask] = lj_sorted[mask]
+ lj_sorted[~f_mask] = 0.
+
+ return lj_sorted
+
+ else:
+ return lj
+
+ else:
+ lj_temp = []
+ # Actual calculation of the coulomb matrix.
+ for i in mol_nr:
+ x_i = mol_data[i, 0]
+ y_i = mol_data[i, 1]
+ z_i = mol_data[i, 2]
+ Z_i = nc_data[i]
+
+ lj_row = []
+ for j in mol_nr:
+ x_j = mol_data[j, 0]
+ y_j = mol_data[j, 1]
+ z_j = mol_data[j, 2]
+
+ x = (x_i-x_j)**2
+ y = (y_i-y_j)**2
+ z = (z_i-z_j)**2
+
+ if i == j:
+ if not diag_value:
+ lj_row.append(0.5*Z_i**2.4)
+ else:
+ lj_row.append(diag_value)
+ else:
+ # Calculations are done after i==j is checked
+ # so no division by zero is done.
+
+ # A little play with r exponents
+ # so no square root is calculated.
+ # Conversion factor is included in r^2.
+
+ # 1/r^2
+ r_2 = sigma**2/(conversion_rate**2*(x + y + z))
+
+ r_6 = math.pow(r_2, 3)
+ r_12 = math.pow(r_6, 2)
+ lj_row.append(4*epsilon*(r_12 - r_6))
+
+ lj_temp.append(np.array(lj_row))
+
+ lj = np.array(lj_temp)
+ # Now the value will be returned.
+ if as_eig:
+ return np.sort(eig(lj)[0])[::-1]
+ else:
+ return lj
+
+
+def lj_matrix_multiple(mol_data,
+ nc_data,
+ pipe=None,
+ diag_value=None,
+ sigma=1.0,
+ epsilon=1.0,
+ max_len=25,
+ as_eig=True,
+ bohr_radius_units=False):
+ """
+ Calculates the Lennard-Jones Matrix of multiple molecules.
+ mol_data: molecule data, matrix of atom coordinates.
+ nc_data: nuclear charge data, array of atom data.
+ pipe: for multiprocessing purposes. Sends the data calculated
+ through a pipe.
+ diag_value: if special diagonal value is to be used.
+ sigma: sigma value.
+ epsilon: epsilon value.
+ max_len: maximum amount of atoms in molecule.
+ as_eig: if data should be returned as matrix or array of eigenvalues.
+ bohr_radius_units: if units should be in bohr's radius units.
+ """
+ printc('L-J Matrices calculation started.', 'CYAN')
+ tic = time.perf_counter()
+
+ ljm_data = np.array([lj_matrix(mol,
+ nc,
+ diag_value,
+ sigma,
+ epsilon,
+ max_len,
+ as_eig,
+ bohr_radius_units)
+ for mol, nc in zip(mol_data, nc_data)])
+
+ toc = time.perf_counter()
+ printc('\tL-JM calculation took {:.4f} seconds.'.format(toc-tic), 'GREEN')
+
+ if pipe:
+ pipe.send(ljm_data)
+
+ return ljm_data
diff --git a/lj_matrix/misc.py b/lj_matrix/misc.py
new file mode 100644
index 000000000..e9142b05f
--- /dev/null
+++ b/lj_matrix/misc.py
@@ -0,0 +1,174 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+from colorama import init, Fore, Style
+import pandas as pd
+
+init()
+
+
+def printc(text, color):
+ """
+ Prints texts normaly, but in color. Using colorama.
+ text: string with the text to print.
+ color: color to be used, same as available in colorama.
+ """
+ color_dic = {'BLACK': Fore.BLACK,
+ 'RED': Fore.RED,
+ 'GREEN': Fore.GREEN,
+ 'YELLOW': Fore.YELLOW,
+ 'BLUE': Fore.BLUE,
+ 'MAGENTA': Fore.MAGENTA,
+ 'CYAN': Fore.CYAN,
+ 'WHITE': Fore.WHITE,
+ 'RESET': Fore.RESET}
+
+ color_dic_keys = color_dic.keys()
+ if color not in color_dic_keys:
+ print(Fore.RED
+ + '\'{}\' not found, using default color.'.format(color)
+ + Style.RESET_ALL)
+ actual_color = Fore.RESET
+ else:
+ actual_color = color_dic[color]
+
+ print(actual_color + text + Style.RESET_ALL)
+
+
+def plot_benchmarks():
+ """
+ For plotting the benchmarks.
+ """
+ # Original columns.
+ or_cols = ['ml_type',
+ 'tr_size',
+ 'te_size',
+ 'kernel_s',
+ 'mae',
+ 'time',
+ 'lj_s',
+ 'lj_e',
+ 'date_ran']
+ # Drop some original columns.
+ dor_cols = ['te_size',
+ 'kernel_s',
+ 'time',
+ 'date_ran']
+
+ # Read benchmarks data and drop some columns.
+ data_temp = pd.read_csv('data\\benchmarks.csv',)
+ data = pd.DataFrame(data_temp, columns=or_cols)
+ data = data.drop(columns=dor_cols)
+
+ # Get the data of the first benchmarks and drop unnecesary columns.
+ first_data = pd.DataFrame(data, index=range(0, 22))
+ first_data = first_data.drop(columns=['lj_s', 'lj_e'])
+
+ # Columns to keep temporarily.
+ fd_columns = ['ml_type',
+ 'tr_size',
+ 'mae']
+
+ # Create new dataframes for each matrix descriptor and fill them.
+ first_data_cm = pd.DataFrame(columns=fd_columns)
+ first_data_ljm = pd.DataFrame(columns=fd_columns)
+ for i in range(first_data.shape[0]):
+ temp_df = first_data.iloc[[i]]
+ if first_data.at[i, 'ml_type'] == 'CM':
+ first_data_cm = first_data_cm.append(temp_df)
+ else:
+ first_data_ljm = first_data_ljm.append(temp_df)
+
+ # Drop unnecesary column and rename 'mae' for later use.
+ first_data_cm = first_data_cm.drop(columns=['ml_type'])\
+ .rename(columns={'mae': 'cm_mae'})
+ first_data_ljm = first_data_ljm.drop(columns=['ml_type'])\
+ .rename(columns={'mae': 'ljm_mae'})
+ # print(first_data_cm)
+ # print(first_data_ljm)
+
+ # Get the cm data axis so it can be joined with the ljm data axis.
+ cm_axis = first_data_cm.plot(x='tr_size',
+ y='cm_mae',
+ kind='line')
+ # Get the ljm data axis and join it with the cm one.
+ plot_axis = first_data_ljm.plot(ax=cm_axis,
+ x='tr_size',
+ y='ljm_mae',
+ kind='line')
+ plot_axis.set_xlabel('tr_size')
+ plot_axis.set_ylabel('mae')
+ plot_axis.set_title('mae for different tr_sizes')
+ # Get the figure and save it.
+ # plot_axis.get_figure().savefig('.figs\\mae_diff_tr_sizes.pdf')
+
+ # Get the rest of the benchmark data and drop unnecesary column.
+ new_data = data.drop(index=range(0, 22))
+ new_data = new_data.drop(columns=['ml_type'])
+
+ # Get the first set and rename it.
+ nd_first = first_data_ljm.rename(columns={'ljm_mae': '1, 1'})
+ ndf_axis = nd_first.plot(x='tr_size',
+ y='1, 1',
+ kind='line')
+ last_axis = ndf_axis
+ for i in range(22, 99, 11):
+ lj_s = new_data['lj_s'][i]
+ lj_e = new_data['lj_e'][i]
+ new_mae = '{}, {}'.format(lj_s, lj_e)
+ nd_temp = pd.DataFrame(new_data, index=range(i, i + 11))\
+ .drop(columns=['lj_s', 'lj_e'])\
+ .rename(columns={'mae': new_mae})
+ last_axis = nd_temp.plot(ax=last_axis,
+ x='tr_size',
+ y=new_mae,
+ kind='line')
+ print(nd_temp)
+
+ last_axis.set_xlabel('tr_size')
+ last_axis.set_ylabel('mae')
+ last_axis.set_title('mae for different parameters of lj(s)')
+
+ last_axis.get_figure().savefig('.figs\\mae_diff_param_lj_s.pdf')
+
+ ndf_axis = nd_first.plot(x='tr_size',
+ y='1, 1',
+ kind='line')
+ last_axis = ndf_axis
+ for i in range(99, data.shape[0], 11):
+ lj_s = new_data['lj_s'][i]
+ lj_e = new_data['lj_e'][i]
+ new_mae = '{}, {}'.format(lj_s, lj_e)
+ nd_temp = pd.DataFrame(new_data, index=range(i, i + 11))\
+ .drop(columns=['lj_s', 'lj_e'])\
+ .rename(columns={'mae': new_mae})
+ last_axis = nd_temp.plot(ax=last_axis,
+ x='tr_size',
+ y=new_mae,
+ kind='line')
+ print(nd_temp)
+
+ last_axis.set_xlabel('tr_size')
+ last_axis.set_ylabel('mae')
+ last_axis.set_title('mae for different parameters of lj(e)')
+
+ last_axis.get_figure().savefig('.figs\\mae_diff_param_lj_e.pdf')
diff --git a/lj_matrix/parallel_create_matrices.py b/lj_matrix/parallel_create_matrices.py
new file mode 100644
index 000000000..cd5ef5c8e
--- /dev/null
+++ b/lj_matrix/parallel_create_matrices.py
@@ -0,0 +1,85 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+from multiprocessing import Process, Pipe
+from lj_matrix.c_matrix import c_matrix_multiple
+from lj_matrix.lj_matrix import lj_matrix_multiple
+
+
+def parallel_create_matrices(mol_data,
+ nc_data,
+ ljm_diag_value=None,
+ ljm_sigma=1.0,
+ ljm_epsilon=1.0,
+ max_len=25,
+ as_eig=True,
+ bohr_radius_units=False):
+ """
+ Creates the Coulomb and L-J matrices in parallel.
+ mol_data: molecule data, matrix of atom coordinates.
+ nc_data: nuclear charge data, array of atom data.
+ ljm_diag_value: if special diagonal value is to be used for lj matrix.
+ ljm_sigma: sigma value for lj matrix.
+ ljm_epsilon: psilon value for lj matrix.
+ max_len: maximum amount of atoms in molecule.
+ as_eig: if data should be returned as matrix or array of eigenvalues.
+ bohr_radius_units: if units should be in bohr's radius units.
+ """
+
+ # Matrices calculation.
+ procs = []
+ pipes = []
+
+ cm_recv, cm_send = Pipe(False)
+ p1 = Process(target=c_matrix_multiple,
+ args=(mol_data,
+ nc_data,
+ cm_send,
+ max_len,
+ as_eig,
+ bohr_radius_units))
+ procs.append(p1)
+ pipes.append(cm_recv)
+ p1.start()
+
+ ljm_recv, ljm_send = Pipe(False)
+ p2 = Process(target=lj_matrix_multiple,
+ args=(mol_data,
+ nc_data,
+ ljm_send,
+ ljm_diag_value,
+ ljm_sigma,
+ ljm_epsilon,
+ max_len,
+ as_eig,
+ bohr_radius_units))
+ procs.append(p2)
+ pipes.append(ljm_recv)
+ p2.start()
+
+ cm_data = pipes[0].recv()
+ ljm_data = pipes[1].recv()
+
+ for proc in procs:
+ proc.join()
+
+ return cm_data, ljm_data
diff --git a/lj_matrix/read_qm7_data.py b/lj_matrix/read_qm7_data.py
new file mode 100644
index 000000000..4401ca1c0
--- /dev/null
+++ b/lj_matrix/read_qm7_data.py
@@ -0,0 +1,145 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+import os
+import time
+import numpy as np
+import random
+from lj_matrix.misc import printc
+
+
+# 'periodic_table_of_elements.txt' retrieved from
+# https://gist.github.com/GoodmanSciences/c2dd862cd38f21b0ad36b8f96b4bf1ee
+def read_nc_data(data_path):
+ """
+ Reads nuclear charge data from file and returns a dictionary.
+ data_path: path to the data directory.
+ """
+ fname = 'periodic_table_of_elements.txt'
+ with open(''.join([data_path, '\\', fname]), 'r') as infile:
+ temp_lines = infile.readlines()
+
+ del temp_lines[0]
+
+ lines = []
+ for temp_line in temp_lines:
+ new_line = temp_line.split(sep=',')
+ lines.append(new_line)
+
+ # Dictionary of nuclear charge.
+ return {line[2]: int(line[0]) for line in lines}
+
+
+# 'hof_qm7.txt.txt' retrieved from
+# https://github.com/qmlcode/tutorial
+def read_db_data(zi_data,
+ data_path,
+ r_seed=111):
+ """
+ Reads molecule database and extracts
+ its contents as usable variables.
+ zi_data: dictionary containing nuclear charge data.
+ data_path: path to the data directory.
+ r_seed: random seed to use for the shuffling.
+ """
+ os.chdir(data_path)
+
+ fname = 'hof_qm7.txt'
+ with open(fname, 'r') as infile:
+ lines = infile.readlines()
+
+ # Temporary energy dictionary.
+ energy_temp = dict()
+
+ for line in lines:
+ xyz_data = line.split()
+
+ xyz_name = xyz_data[0]
+ hof = float(xyz_data[1])
+ dftb = float(xyz_data[2])
+ # print(xyz_name, hof, dftb)
+
+ energy_temp[xyz_name] = np.array([hof, hof - dftb])
+
+ # Use a random seed.
+ random.seed(r_seed)
+
+ et_keys = list(energy_temp.keys())
+ random.shuffle(et_keys)
+
+ # Temporary energy dictionary, shuffled.
+ energy_temp_shuffled = dict()
+ for key in et_keys:
+ energy_temp_shuffled.update({key: energy_temp[key]})
+
+ mol_data = []
+ mol_nc_data = []
+ # Actual reading of the xyz files.
+ for i, k in enumerate(energy_temp_shuffled.keys()):
+ with open(k, 'r') as xyz_file:
+ lines = xyz_file.readlines()
+
+ len_lines = len(lines)
+ mol_temp_data = []
+ mol_nc_temp_data = np.array(np.zeros(len_lines-2))
+ for j, line in enumerate(lines[2:len_lines]):
+ line_list = line.split()
+
+ mol_nc_temp_data[j] = float(zi_data[line_list[0]])
+ line_data = np.array(np.asarray(line_list[1:4], dtype=float))
+ mol_temp_data.append(line_data)
+
+ mol_data.append(mol_temp_data)
+ mol_nc_data.append(mol_nc_temp_data)
+
+ # Convert everything to a numpy array.
+ molecules = np.array([np.array(mol) for mol in mol_data])
+ nuclear_charge = np.array([nc_d for nc_d in mol_nc_data])
+ energy_pbe0 = np.array([energy_temp_shuffled[k][0]
+ for k in energy_temp_shuffled.keys()])
+ energy_delta = np.array([energy_temp_shuffled[k][1]
+ for k in energy_temp_shuffled.keys()])
+
+ return molecules, nuclear_charge, energy_pbe0, energy_delta
+
+
+def read_qm7_data(r_seed=111):
+ """
+ Reads all the qm7 data.
+ r_seed: random seed to use for the shuffling.
+ """
+ tic = time.perf_counter()
+ printc('Data reading started.', 'CYAN')
+
+ init_path = os.getcwd()
+ os.chdir('data')
+ data_path = os.getcwd()
+
+ zi_data = read_nc_data(data_path)
+ molecules, nuclear_charge, energy_pbe0, energy_delta = \
+ read_db_data(zi_data, data_path, r_seed)
+
+ os.chdir(init_path)
+ toc = time.perf_counter()
+ printc('\tData reading took {:.4f} seconds.'.format(toc-tic), 'GREEN')
+
+ return molecules, nuclear_charge, energy_pbe0, energy_delta
diff --git a/lj_matrix/version.py b/lj_matrix/version.py
new file mode 100644
index 000000000..fab58433d
--- /dev/null
+++ b/lj_matrix/version.py
@@ -0,0 +1,23 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+__version__ = '0.0.1'