Refactor files

author: David Luevano <55825613+luevano@users.noreply.github.com> 2019-12-18 07:21:35 -0700
committer: David Luevano <55825613+luevano@users.noreply.github.com> 2019-12-18 07:21:35 -0700
commit: 487bf8840846b5d4d694b38985268c308aadb36e (patch)
tree: ba3a3a742a503f925a5a7792e1bd16ee518066c9 /lj_matrix
parent: 96a3f2b2950451a478c951e642a4aa188219682b (diff)
10 files changed, 1115 insertions, 0 deletions
diff --git a/lj_matrix/__init__.py b/lj_matrix/__init__.py
new file mode 100644
index 000000000..48cd14913
--- /dev/null
+++ b/lj_matrix/__init__.py
@@ -0,0 +1,22 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
diff --git a/lj_matrix/__main__.py b/lj_matrix/__main__.py
new file mode 100644
index 000000000..4e13f4995
--- /dev/null
+++ b/lj_matrix/__main__.py
@@ -0,0 +1,238 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+import time
+from multiprocessing import Process, Pipe
+# import matplotlib.pyplot as plt
+import pandas as pd
+from lj_matrix.misc import printc
+from lj_matrix.read_qm7_data import read_qm7_data
+from lj_matrix.c_matrix import c_matrix_multiple
+from lj_matrix.lj_matrix import lj_matrix_multiple
+from lj_matrix.do_ml import do_ml
+
+
+# Test
+def ml():
+    """
+    Main function that does the whole ML process.
+
+    Reads the QM7 data, computes the Lennard-Jones matrices in a child
+    process, runs do_ml() in one child process per training size
+    (1500 to 6500 in steps of 500) and appends one CSV row per run to
+    the benchmarks file. Prints the total elapsed time at the end.
+    The Coulomb Matrix (CM) path is kept below as commented-out code.
+    """
+    # Initialization time.
+    init_time = time.perf_counter()
+
+    # Data reading.
+    zi_data, molecules, nuclear_charge, energy_pbe0, energy_delta =\
+        read_qm7_data()
+
+    # Matrices calculation.
+    procs = []
+    pipes = []
+
+    # cm_recv, cm_send = Pipe(False)
+    # p1 = Process(target=c_matrix_multiple,
+    #              args=(molecules, nuclear_charge, cm_send))
+    # procs.append(p1)
+    # pipes.append(cm_recv)
+    # p1.start()
+
+    # Pipe(False) is one-way: the first end only receives and the second
+    # end only sends.
+    ljm_recv, ljm_send = Pipe(False)
+    # The two trailing positional args are the sigma (1) and epsilon (0.25)
+    # parameters of lj_matrix_multiple.
+    p2 = Process(target=lj_matrix_multiple,
+                 args=(molecules, nuclear_charge, ljm_send, 1, 0.25))
+    procs.append(p2)
+    pipes.append(ljm_recv)
+    p2.start()
+
+    # The result is received before joining, presumably so the child is
+    # never left blocked on send() while the parent waits in join().
+    # cm_data = pipes[0].recv()
+    ljm_data = pipes[0].recv()
+
+    for proc in procs:
+        proc.join()
+
+    # ML calculation: one process (and one pipe) per training size.
+    procs = []
+    # cm_pipes = []
+    ljm_pipes = []
+    for i in range(1500, 6500 + 1, 500):
+        # cm_recv, cm_send = Pipe(False)
+        # p1 = Process(target=do_ml,
+        #              args=(cm_data, energy_pbe0, i, 'CM', cm_send))
+        # procs.append(p1)
+        # cm_pipes.append(cm_recv)
+        # p1.start()
+
+        ljm_recv, ljm_send = Pipe(False)
+        p2 = Process(target=do_ml,
+                     args=(ljm_data, energy_pbe0, i, 'L-JM', ljm_send))
+        procs.append(p2)
+        ljm_pipes.append(ljm_recv)
+        p2.start()
+
+    # Collect the results in the same order the processes were started.
+    # cm_bench_results = []
+    ljm_bench_results = []
+    for ljd_pipe in ljm_pipes:  # cd_pipe, ljd_pipe in zip(cm_pipes, ljm_pipes):
+        # cm_bench_results.append(cd_pipe.recv())
+        ljm_bench_results.append(ljd_pipe.recv())
+
+    for proc in procs:
+        proc.join()
+
+    # Results are appended, so the file accumulates the rows of every run.
+    with open('data\\benchmarks.csv', 'a') as save_file:
+        # save_file.write(''.join(['ml_type,tr_size,te_size,kernel_s,',
+        #                          'mae,time,lj_s,lj_e,date_ran\n']))
+        # Date as day/month/year (the first three localtime fields reversed).
+        date = '/'.join([str(field) for field in time.localtime()[:3][::-1]])
+        for ljm in ljm_bench_results:  # cm, ljm, in zip(cm_bench_results, ljm_bench_results):
+            # cm_text = ','.join([str(field) for field in cm])\
+            #     + ',' + date + '\n'
+            # The hard-coded ',1,0.25,' fills the lj_s and lj_e columns and
+            # has to match the values passed to lj_matrix_multiple above.
+            ljm_text = ','.join([str(field) for field in ljm])\
+                + ',1,0.25,' + date + '\n'
+            # save_file.write(cm_text)
+            save_file.write(ljm_text)
+
+    # End of program
+    end_time = time.perf_counter()
+    printc('Program took {:.4f} seconds.'.format(end_time - init_time),
+           'CYAN')
+
+
+def _plot_lj_sweep(baseline, new_data, starts, title, out_path):
+    """
+    Plots the baseline curve together with one 'mae' curve per block of
+    11 consecutive benchmark rows, and saves the figure.
+    baseline: dataframe with the columns 'tr_size' and '1, 1'.
+    new_data: benchmark dataframe with the columns 'tr_size', 'mae',
+        'lj_s' and 'lj_e'.
+    starts: index labels of the first row of each block of 11 rows.
+    title: title of the figure.
+    out_path: path where the figure is saved.
+    """
+    axis = baseline.plot(x='tr_size',
+                         y='1, 1',
+                         kind='line')
+    for start in starts:
+        # The parameters of the first row of the block label the curve.
+        lj_s = new_data['lj_s'][start]
+        lj_e = new_data['lj_e'][start]
+        new_mae = '{}, {}'.format(lj_s, lj_e)
+        nd_temp = pd.DataFrame(new_data, index=range(start, start + 11))\
+            .drop(columns=['lj_s', 'lj_e'])\
+            .rename(columns={'mae': new_mae})
+        axis = nd_temp.plot(ax=axis,
+                            x='tr_size',
+                            y=new_mae,
+                            kind='line')
+        print(nd_temp)
+
+    axis.set_xlabel('tr_size')
+    axis.set_ylabel('mae')
+    axis.set_title(title)
+
+    axis.get_figure().savefig(out_path)
+
+
+def pl():
+    """
+    Function for plotting the benchmarks.
+
+    Reads the benchmarks CSV and builds three figures:
+    - 'mae' against 'tr_size' for the CM and L-JM rows found in the
+      first 22 rows (this figure is not saved).
+    - the L-JM curve of the first 22 rows together with one curve per
+      block of 11 rows in rows 22 to 98 (titled as the lj(s) sweep).
+    - the same, for the blocks of 11 rows from row 99 onwards (titled
+      as the lj(e) sweep).
+    The last two figures are saved as PDF files.
+    """
+    # Original columns.
+    or_cols = ['ml_type',
+               'tr_size',
+               'te_size',
+               'kernel_s',
+               'mae',
+               'time',
+               'lj_s',
+               'lj_e',
+               'date_ran']
+    # Drop some original columns.
+    dor_cols = ['te_size',
+                'kernel_s',
+                'time',
+                'date_ran']
+
+    # Read benchmarks data and drop some columns.
+    data_temp = pd.read_csv('data\\benchmarks.csv',)
+    data = pd.DataFrame(data_temp, columns=or_cols)
+    data = data.drop(columns=dor_cols)
+
+    # Get the data of the first benchmarks and drop unnecessary columns.
+    first_data = pd.DataFrame(data, index=range(0, 22))
+    first_data = first_data.drop(columns=['lj_s', 'lj_e'])
+
+    # Split the rows by matrix descriptor with a boolean mask. This
+    # replaces a row-by-row DataFrame.append loop; DataFrame.append was
+    # removed in pandas 2.0.
+    is_cm = first_data['ml_type'] == 'CM'
+    first_data_cm = first_data[is_cm]
+    first_data_ljm = first_data[~is_cm]
+
+    # Drop unnecessary column and rename 'mae' for later use.
+    first_data_cm = first_data_cm.drop(columns=['ml_type'])\
+        .rename(columns={'mae': 'cm_mae'})
+    first_data_ljm = first_data_ljm.drop(columns=['ml_type'])\
+        .rename(columns={'mae': 'ljm_mae'})
+
+    # Get the cm data axis so it can be joined with the ljm data axis.
+    cm_axis = first_data_cm.plot(x='tr_size',
+                                 y='cm_mae',
+                                 kind='line')
+    # Get the ljm data axis and join it with the cm one.
+    plot_axis = first_data_ljm.plot(ax=cm_axis,
+                                    x='tr_size',
+                                    y='ljm_mae',
+                                    kind='line')
+    plot_axis.set_xlabel('tr_size')
+    plot_axis.set_ylabel('mae')
+    plot_axis.set_title('mae for different tr_sizes')
+    # Get the figure and save it.
+    # plot_axis.get_figure().savefig('.figs\\mae_diff_tr_sizes.pdf')
+
+    # Get the rest of the benchmark data and drop unnecessary column.
+    new_data = data.drop(index=range(0, 22))
+    new_data = new_data.drop(columns=['ml_type'])
+
+    # The first L-JM set is the baseline curve of both sweep figures.
+    nd_first = first_data_ljm.rename(columns={'ljm_mae': '1, 1'})
+
+    _plot_lj_sweep(nd_first,
+                   new_data,
+                   range(22, 99, 11),
+                   'mae for different parameters of lj(s)',
+                   '.figs\\mae_diff_param_lj_s.pdf')
+
+    _plot_lj_sweep(nd_first,
+                   new_data,
+                   range(99, data.shape[0], 11),
+                   'mae for different parameters of lj(e)',
+                   '.figs\\mae_diff_param_lj_e.pdf')
+
+
+if __name__ == '__main__':
+    # Only the plotting step runs by default. Uncomment ml() to run the
+    # benchmarks, which appends rows to the CSV file that pl() reads.
+    # ml()
+    pl()
diff --git a/lj_matrix/c_matrix.py b/lj_matrix/c_matrix.py
new file mode 100644
index 000000000..f40a18c68
--- /dev/null
+++ b/lj_matrix/c_matrix.py
@@ -0,0 +1,179 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+import time
+from lj_matrix.misc import printc
+import math
+import numpy as np
+from numpy.linalg import eig
+
+
+def c_matrix(mol_data,
+             nc_data,
+             max_len=25,
+             as_eig=True,
+             bohr_radius_units=False):
+    """
+    Builds the Coulomb Matrix of a single molecule.
+    mol_data: molecule data, matrix of atom coordinates (one row per
+        atom; columns 0, 1 and 2 are read as x, y and z).
+    nc_data: nuclear charge data, one value per atom.
+    max_len: size the matrix is zero-padded to. If it is smaller than
+        the number of atoms, an error is printed and no padding is done.
+    as_eig: if True, the eigenvalues sorted in descending order are
+        returned (with the zeros moved to the end when the matrix is
+        padded); otherwise the matrix itself is returned.
+    bohr_radius_units: if True, the off-diagonal terms are multiplied
+        by 0.52917721067.
+    Returns None, after printing an error, when mol_data and nc_data
+    have different lengths.
+    """
+    scale = 0.52917721067 if bohr_radius_units else 1
+    n_atoms = len(mol_data)
+
+    if n_atoms != len(nc_data):
+        print(''.join(['Error. Molecule matrix dimension is different ',
+                       'than the nuclear charge array dimension.']))
+        return None
+
+    if max_len < n_atoms:
+        print(''.join(['Error. Molecule matrix dimension (mol_n) is ',
+                       'greater than max_len. Using mol_n.']))
+        max_len = None
+
+    def element(i, j):
+        """Coulomb Matrix entry for the atoms i and j."""
+        if i == j:
+            return 0.5*nc_data[i]**2.4
+        # The diagonal is handled above, so the distance is only zero
+        # here if two different atoms share the same coordinates.
+        d_x = (mol_data[i, 0]-mol_data[j, 0])**2
+        d_y = (mol_data[i, 1]-mol_data[j, 1])**2
+        d_z = (mol_data[i, 2]-mol_data[j, 2])**2
+        return scale*nc_data[i]*nc_data[j]/math.sqrt(d_x + d_y + d_z)
+
+    atoms = range(n_atoms)
+    if max_len:
+        # Padded: only the top-left (n_atoms x n_atoms) corner is filled.
+        cm = np.zeros((max_len, max_len))
+        for i in atoms:
+            for j in atoms:
+                cm[i, j] = element(i, j)
+    else:
+        cm = np.array([np.array([element(i, j) for j in atoms])
+                       for i in atoms])
+
+    if not as_eig:
+        return cm
+
+    eigenvalues = np.sort(eig(cm)[0])[::-1]
+    if not max_len:
+        return eigenvalues
+
+    # Keep zeros at the end: the non-zero eigenvalues keep their
+    # descending order and everything after them is set to zero.
+    non_zero = eigenvalues[eigenvalues != 0.]
+    eigenvalues[:len(non_zero)] = non_zero
+    eigenvalues[len(non_zero):] = 0.
+
+    return eigenvalues
+
+
+def c_matrix_multiple(mol_data,
+                      nc_data,
+                      pipe=None,
+                      max_len=25,
+                      as_eig=True,
+                      bohr_radius_units=False):
+    """
+    Calculates the Coulomb Matrix of every molecule given.
+    mol_data: sequence with the coordinates matrix of each molecule.
+    nc_data: sequence with the nuclear charge array of each molecule.
+    pipe: for multiprocessing purposes. If given, the result is also
+        sent through it.
+    max_len: maximum amount of atoms in molecule.
+    as_eig: if data should be returned as matrix or array of eigenvalues.
+    bohr_radius_units: if units should be in bohr's radius units.
+    Returns an array with one c_matrix() result per molecule.
+    """
+    printc('Coulomb Matrices calculation started.', 'CYAN')
+    start = time.perf_counter()
+
+    # mol_data and nc_data are walked in lockstep, one molecule at a time.
+    results = []
+    for molecule, charges in zip(mol_data, nc_data):
+        results.append(c_matrix(molecule,
+                                charges,
+                                max_len,
+                                as_eig,
+                                bohr_radius_units))
+    cm_data = np.array(results)
+
+    elapsed = time.perf_counter() - start
+    printc('\tCM calculation took {:.4f} seconds.'.format(elapsed), 'GREEN')
+
+    if pipe:
+        pipe.send(cm_data)
+
+    return cm_data
diff --git a/lj_matrix/cholesky_solve.py b/lj_matrix/cholesky_solve.py
new file mode 100644
index 000000000..bc6a572a3
--- /dev/null
+++ b/lj_matrix/cholesky_solve.py
@@ -0,0 +1,64 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+import numpy as np
+from numpy.linalg import cholesky
+
+
+def cholesky_solve(K, y):
+    """
+    Applies Cholesky decomposition to obtain the 'alpha coefficients',
+    that is, solves (K + lambda*I)a = y with lambda = 1e-8.
+    K: kernel (square matrix). It is left unmodified.
+    y: known parameters.
+    Returns the array of alpha coefficients.
+    """
+    # The initial mathematical problem is to solve Ka=y.
+
+    # First, add a small lambda value. This is done on a float copy so
+    # the caller's kernel is not altered and integer kernels are accepted.
+    K_reg = np.array(K, dtype=float)
+    K_reg[np.diag_indices_from(K_reg)] += 1e-8
+
+    # Get the Cholesky decomposition of the kernel.
+    L = cholesky(K_reg)
+    size = len(L)
+
+    # Solve Lx=y for x (forward substitution).
+    x = np.zeros(size)
+    for i in range(size):
+        temp_sum = 0.0
+        for j in range(i):
+            temp_sum += L[i, j] * x[j]
+        x[i] = (y[i] - temp_sum) / L[i, i]
+
+    # Now, solve LTa=x for a (back substitution).
+    # Because of the form of L2 (upper triangular matrix), the rows are
+    # processed from the last one to the first one, and only the columns
+    # to the right of the diagonal (already solved) are accumulated.
+    L2 = L.T
+    a = np.zeros(size)
+    for i in reversed(range(size)):
+        temp_sum = 0.0
+        for j in reversed(range(i + 1, size)):
+            temp_sum += L2[i, j] * a[j]
+        a[i] = (x[i] - temp_sum) / L2[i, i]
+
+    return a
diff --git a/lj_matrix/do_ml.py b/lj_matrix/do_ml.py
new file mode 100644
index 000000000..acf5455f4
--- /dev/null
+++ b/lj_matrix/do_ml.py
@@ -0,0 +1,108 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+import time
+from misc import printc
+import numpy as np
+from lj_matrix.gauss_kernel import gauss_kernel
+from lj_matrix.cholesky_solve import cholesky_solve
+
+
+def do_ml(desc_data,
+          energy_data,
+          training_size,
+          desc_type=None,
+          pipe=None,
+          test_size=None,
+          sigma=1000.0,
+          show_msgs=True):
+    """
+    Trains with the first part of the data, predicts the last part of
+    it and measures the mean absolute error (MAE).
+    desc_data: descriptor (or representation) data.
+    energy_data: energy data associated with desc_data.
+    training_size: size of the training set to use.
+    desc_type: string with the name of the descriptor used. It is only
+        used for identification in messages and results.
+    pipe: for multiprocessing purposes. If given, the list
+        [desc_type, training_size, test_size, sigma, mae, time] is sent
+        through it.
+    test_size: size of the test set to use. If no size is given, the
+        remaining molecules are used, up to a maximum of 1500.
+    sigma: value passed to the kernel.
+    show_msgs: Show debug messages or not.
+    Returns the tuple (mae, elapsed seconds), or None after printing an
+    error if the sizes of the data are not valid.
+    """
+    # Initial calculations for later use.
+    n_desc = len(desc_data)
+    n_energy = len(energy_data)
+
+    if not desc_type:
+        desc_type = 'NOT SPECIFIED'
+
+    # Guard clauses for inconsistent sizes.
+    if n_desc != n_energy:
+        printc(''.join(['ERROR. Descriptor data size different ',
+                        'than energy data size.']), 'RED')
+        return None
+
+    if training_size >= n_desc:
+        printc('ERROR. Training size greater or equal than data size.', 'RED')
+        return None
+
+    if not test_size:
+        test_size = min(n_desc - training_size, 1500)
+
+    start = time.perf_counter()
+    if show_msgs:
+        printc('{} ML started.'.format(desc_type), 'GREEN')
+        printc('\tTraining size: {}'.format(training_size), 'CYAN')
+        printc('\tTest size: {}'.format(test_size), 'CYAN')
+        printc('\tSigma: {}'.format(sigma), 'CYAN')
+
+    # Training: kernel of the training set against itself.
+    train_x = desc_data[:training_size]
+    train_y = energy_data[:training_size]
+    alpha = cholesky_solve(gauss_kernel(train_x, train_x, sigma), train_y)
+
+    # Testing: kernel of the test set against the training set.
+    test_x = desc_data[-test_size:]
+    test_y = energy_data[-test_size:]
+    predicted = np.dot(gauss_kernel(test_x, train_x, sigma), alpha)
+
+    mae = np.mean(np.abs(predicted - test_y))
+    if show_msgs:
+        printc('\tMAE for {}: {:.4f}'.format(desc_type, mae), 'GREEN')
+
+    tictoc = time.perf_counter() - start
+    if show_msgs:
+        printc('\t{} ML took {:.4f} seconds.'.format(desc_type, tictoc),
+               'GREEN')
+        printc('\t\tTraining size: {}'.format(training_size), 'CYAN')
+        printc('\t\tTest size: {}'.format(test_size), 'CYAN')
+        printc('\t\tSigma: {}'.format(sigma), 'CYAN')
+
+    if pipe:
+        pipe.send([desc_type, training_size, test_size, sigma, mae, tictoc])
+
+    return mae, tictoc
diff --git a/lj_matrix/frob_norm.py b/lj_matrix/frob_norm.py
new file mode 100644
index 000000000..4c3a2945d
--- /dev/null
+++ b/lj_matrix/frob_norm.py
@@ -0,0 +1,51 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+import math
+
+
+def frob_norm(array):
+    """
+    Calculates the frobenius norm of a given array or matrix.
+    array: array of data, with one or two dimensions. Matrices do not
+        need to be square.
+    Returns the norm as a float, or None (after printing an error) if
+    the array has more than two dimensions.
+    """
+
+    arr_sh_len = len(array.shape)
+    n_rows = len(array)
+    fn = 0.0
+
+    # If it is a 'vector'.
+    if arr_sh_len == 1:
+        for i in range(n_rows):
+            fn += array[i]*array[i]
+
+        return math.sqrt(fn)
+
+    # If it is a matrix.
+    elif arr_sh_len == 2:
+        # The columns have their own range, so that rectangular matrices
+        # are fully covered and never indexed out of bounds.
+        n_cols = array.shape[1]
+        for i in range(n_rows):
+            for j in range(n_cols):
+                fn += array[i, j]*array[i, j]
+
+        return math.sqrt(fn)
+    else:
+        print('Error. Array size greater than 2 ({}).'.format(arr_sh_len))
+        return None
diff --git a/lj_matrix/gauss_kernel.py b/lj_matrix/gauss_kernel.py
new file mode 100644
index 000000000..5dd8e6406
--- /dev/null
+++ b/lj_matrix/gauss_kernel.py
@@ -0,0 +1,49 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+import math
+import numpy as np
+from lj_matrix.frob_norm import frob_norm
+
+
+def gauss_kernel(X_1, X_2, sigma):
+    """
+    Calculates the Gaussian Kernel,
+    K[i, j] = exp(-||X_1[i] - X_2[j]||^2 / (2*sigma^2)).
+    X_1: first representations.
+    X_2: second representations.
+    sigma: kernel width.
+    Returns a matrix with len(X_1) rows and len(X_2) columns.
+    """
+    x1_l = len(X_1)
+    x1_range = range(x1_l)
+    x2_l = len(X_2)
+    x2_range = range(x2_l)
+
+    inv_sigma = -0.5 / (sigma*sigma)
+
+    K = np.zeros((x1_l, x2_l))
+    for i in x1_range:
+        for j in x2_range:
+            f_norm = frob_norm(X_1[i] - X_2[j])
+            # The Gaussian kernel uses the squared norm; the plain norm
+            # was used here before, which is not a Gaussian kernel.
+            K[i, j] = math.exp(inv_sigma * f_norm * f_norm)
+
+    return K
diff --git a/lj_matrix/lj_matrix.py b/lj_matrix/lj_matrix.py
new file mode 100644
index 000000000..4f63e95ca
--- /dev/null
+++ b/lj_matrix/lj_matrix.py
@@ -0,0 +1,207 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+import time
+from lj_matrix.misc import printc
+import math
+import numpy as np
+from numpy.linalg import eig
+
+
+def lj_matrix(mol_data,
+              nc_data,
+              sigma=1.0,
+              epsilon=1.0,
+              max_len=25,
+              as_eig=True,
+              bohr_radius_units=False):
+    """
+    Builds the Lennard-Jones Matrix of a single molecule.
+    mol_data: molecule data, matrix of atom coordinates (one row per
+        atom; columns 0, 1 and 2 are read as x, y and z).
+    nc_data: nuclear charge data, one value per atom. It is only used
+        for the diagonal terms.
+    sigma: sigma parameter of the Lennard-Jones term.
+    epsilon: epsilon parameter of the Lennard-Jones term.
+    max_len: size the matrix is zero-padded to. If it is smaller than
+        the number of atoms, an error is printed and no padding is done.
+    as_eig: if True, the eigenvalues sorted in descending order are
+        returned (with the zeros moved to the end when the matrix is
+        padded); otherwise the matrix itself is returned.
+    bohr_radius_units: if True, the squared distances are multiplied by
+        0.52917721067**2.
+    Returns None, after printing an error, when mol_data and nc_data
+    have different lengths.
+    """
+    scale = 0.52917721067 if bohr_radius_units else 1
+    n_atoms = len(mol_data)
+
+    if n_atoms != len(nc_data):
+        print(''.join(['Error. Molecule matrix dimension is different ',
+                       'than the nuclear charge array dimension.']))
+        return None
+
+    if max_len < n_atoms:
+        print(''.join(['Error. Molecule matrix dimension (mol_n) is ',
+                       'greater than max_len. Using mol_n.']))
+        max_len = None
+
+    def element(i, j):
+        """Lennard-Jones Matrix entry for the atoms i and j."""
+        if i == j:
+            return 0.5*nc_data[i]**2.4
+        # The diagonal is handled above, so the distance is only zero
+        # here if two different atoms share the same coordinates.
+        d_x = (mol_data[i, 0]-mol_data[j, 0])**2
+        d_y = (mol_data[i, 1]-mol_data[j, 1])**2
+        d_z = (mol_data[i, 2]-mol_data[j, 2])**2
+
+        # Working with (sigma/r)^2 avoids calculating a square root.
+        # NOTE(review): the conversion factor divides here, while in
+        # c_matrix it multiplies the 1/r term; confirm which direction
+        # is the intended one.
+        inv_r_2 = sigma**2/(scale**2*(d_x + d_y + d_z))
+        inv_r_6 = math.pow(inv_r_2, 3)
+        inv_r_12 = math.pow(inv_r_6, 2)
+        return 4*epsilon*(inv_r_12 - inv_r_6)
+
+    atoms = range(n_atoms)
+    if max_len:
+        # Padded: only the top-left (n_atoms x n_atoms) corner is filled.
+        lj = np.zeros((max_len, max_len))
+        for i in atoms:
+            for j in atoms:
+                lj[i, j] = element(i, j)
+    else:
+        lj = np.array([np.array([element(i, j) for j in atoms])
+                       for i in atoms])
+
+    if not as_eig:
+        return lj
+
+    eigenvalues = np.sort(eig(lj)[0])[::-1]
+    if not max_len:
+        return eigenvalues
+
+    # Keep zeros at the end: the non-zero eigenvalues keep their
+    # descending order and everything after them is set to zero.
+    non_zero = eigenvalues[eigenvalues != 0.]
+    eigenvalues[:len(non_zero)] = non_zero
+    eigenvalues[len(non_zero):] = 0.
+
+    return eigenvalues
+
+
+def lj_matrix_multiple(mol_data,
+                       nc_data,
+                       pipe=None,
+                       sigma=1,
+                       epsilon=1,
+                       max_len=25,
+                       as_eig=True,
+                       bohr_radius_units=False):
+    """
+    Calculates the Lennard-Jones Matrix of every molecule given.
+    mol_data: sequence with the coordinates matrix of each molecule.
+    nc_data: sequence with the nuclear charge array of each molecule.
+    pipe: for multiprocessing purposes. If given, the result is also
+        sent through it.
+    sigma: sigma parameter passed to lj_matrix().
+    epsilon: epsilon parameter passed to lj_matrix().
+    max_len: maximum amount of atoms in molecule.
+    as_eig: if data should be returned as matrix or array of eigenvalues.
+    bohr_radius_units: if units should be in bohr's radius units.
+    Returns an array with one lj_matrix() result per molecule.
+    """
+    printc('L-J Matrices calculation started.', 'CYAN')
+    start = time.perf_counter()
+
+    # mol_data and nc_data are walked in lockstep, one molecule at a time.
+    results = []
+    for molecule, charges in zip(mol_data, nc_data):
+        results.append(lj_matrix(molecule,
+                                 charges,
+                                 sigma,
+                                 epsilon,
+                                 max_len,
+                                 as_eig,
+                                 bohr_radius_units))
+    ljm_data = np.array(results)
+
+    elapsed = time.perf_counter() - start
+    printc('\tL-JM calculation took {:.4f} seconds.'.format(elapsed), 'GREEN')
+
+    if pipe:
+        pipe.send(ljm_data)
+
+    return ljm_data
diff --git a/lj_matrix/misc.py b/lj_matrix/misc.py
new file mode 100644
index 000000000..c50653a5c
--- /dev/null
+++ b/lj_matrix/misc.py
@@ -0,0 +1,53 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+from colorama import init, Fore, Style
+
+init()
+
+
+def printc(text, color):
+    """
+    Prints text normally, but in color. Using colorama.
+    text: string with the text to print.
+    color: name of the color to be used. Supported names are BLACK,
+        RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE and RESET; for
+        any other value a warning is printed in red and the default
+        color is used instead.
+    """
+    # Foreground colors that can be requested by name.
+    color_names = ('BLACK', 'RED', 'GREEN',
+                   'YELLOW', 'BLUE', 'MAGENTA',
+                   'CYAN', 'WHITE', 'RESET')
+    palette = {name: getattr(Fore, name) for name in color_names}
+
+    try:
+        chosen = palette[color]
+    except KeyError:
+        # Unknown name: warn and fall back to the default color.
+        print(Fore.RED
+              + '\'{}\' not found, using default color.'.format(color)
+              + Style.RESET_ALL)
+        chosen = Fore.RESET
+
+    # The style is reset right after the text.
+    print(chosen + text + Style.RESET_ALL)
diff --git a/lj_matrix/read_qm7_data.py b/lj_matrix/read_qm7_data.py
new file mode 100644
index 000000000..b54691fb0
--- /dev/null
+++ b/lj_matrix/read_qm7_data.py
@@ -0,0 +1,144 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+import os
+import time
+import numpy as np
+import random
+from lj_matrix.misc import printc
+
+
+# 'periodic_table_of_elements.txt' retrieved from
+# https://gist.github.com/GoodmanSciences/c2dd862cd38f21b0ad36b8f96b4bf1ee
+def read_nc_data(data_path):
+    """
+    Reads nuclear charge data from file and returns a dictionary.
+    data_path: path to the data directory.
+    Returns a dictionary that maps the third comma-separated field of
+    each row (the element symbol, presumably) to the first one as int.
+    The first line (header) and blank lines are skipped.
+    """
+    fname = 'periodic_table_of_elements.txt'
+    # os.path.join uses the separator of the running platform; a
+    # hard-coded backslash only resolves on Windows.
+    with open(os.path.join(data_path, fname), 'r') as infile:
+        # Skip the header line.
+        next(infile, None)
+        rows = [line.split(',') for line in infile if line.strip()]
+
+    # Dictionary of nuclear charge.
+    return {row[2]: int(row[0]) for row in rows}
+
+
+# 'hof_qm7.txt' retrieved from
+# https://github.com/qmlcode/tutorial
+def _pack_arrays(arrays):
+    """
+    Packs a list of numpy arrays into a single numpy array.
+    arrays: list of numpy arrays.
+    Arrays of equal shape are stacked as usual; arrays of different
+    shapes are stored in a 1-D object array, because recent numpy
+    versions refuse to build ragged arrays implicitly.
+    """
+    if len({arr.shape for arr in arrays}) <= 1:
+        return np.array(arrays)
+
+    packed = np.empty(len(arrays), dtype=object)
+    for i, arr in enumerate(arrays):
+        packed[i] = arr
+    return packed
+
+
+def reas_db_data(zi_data,
+                 data_path,
+                 r_seed=111):
+    """
+    Reads molecule database and extracts
+    its contents as usable variables.
+    zi_data: dictionary containing nuclear charge data.
+    data_path: path to the data directory.
+    r_seed: random seed.
+    Returns molecules, nuclear_charge, energy_pbe0 and energy_delta,
+    all of them numpy arrays in the same shuffled molecule order.
+    Molecules with different atom counts end up in object arrays.
+    Files are opened relative to data_path, the working directory is
+    left untouched. Blank lines in the files are ignored.
+    """
+    # Each line: xyz file name followed by the hof and dftb values.
+    with open(os.path.join(data_path, 'hof_qm7.txt'), 'r') as infile:
+        entries = [line.split() for line in infile if line.strip()]
+
+    # Maps xyz file name -> (hof, hof - dftb).
+    energies = {}
+    for entry in entries:
+        hof, dftb = float(entry[1]), float(entry[2])
+        energies[entry[0]] = (hof, hof - dftb)
+
+    # Seeding the global generator keeps the shuffle reproducible.
+    random.seed(r_seed)
+    xyz_names = list(energies)
+    random.shuffle(xyz_names)
+
+    mol_data = []
+    mol_nc_data = []
+    # Actual reading of the xyz files.
+    for xyz_name in xyz_names:
+        with open(os.path.join(data_path, xyz_name), 'r') as xyz_file:
+            # The first two lines of the file hold no atom data.
+            atoms = [line.split() for line in xyz_file.readlines()[2:]]
+        atoms = [atom for atom in atoms if atom]
+
+        # First field: key into zi_data; next three: coordinates.
+        mol_nc_data.append(np.array([float(zi_data[atom[0]])
+                                     for atom in atoms]))
+        mol_data.append(np.array([[float(c) for c in atom[1:4]]
+                                  for atom in atoms]))
+
+    # Convert everything to a numpy array.
+    molecules = _pack_arrays(mol_data)
+    nuclear_charge = _pack_arrays(mol_nc_data)
+    energy_pbe0 = np.array([energies[name][0] for name in xyz_names])
+    energy_delta = np.array([energies[name][1] for name in xyz_names])
+
+    return molecules, nuclear_charge, energy_pbe0, energy_delta
+
+
+def read_qm7_data():
+    """
+    Reads all the qm7 data from the 'data' directory of the current path.
+    Returns zi_data, molecules, nuclear_charge, energy_pbe0, energy_delta.
+    """
+    tic = time.perf_counter()
+    printc('Data reading started.', 'CYAN')
+    init_path = os.getcwd()
+    os.chdir('data')
+    try:
+        data_path = os.getcwd()
+        zi_data = read_nc_data(data_path)
+        molecules, nuclear_charge, energy_pbe0, energy_delta = \
+            reas_db_data(zi_data, data_path)
+    finally:
+        os.chdir(init_path)  # restored even if the reading fails
+    toc = time.perf_counter()
+    printc('\tData reading took {:.4f} seconds.'.format(toc-tic), 'GREEN')
+
+    return zi_data, molecules, nuclear_charge, energy_pbe0, energy_delta
author	David Luevano <55825613+luevano@users.noreply.github.com>	2019-12-18 07:21:35 -0700
committer	David Luevano <55825613+luevano@users.noreply.github.com>	2019-12-18 07:21:35 -0700
commit	487bf8840846b5d4d694b38985268c308aadb36e (patch)
tree	ba3a3a742a503f925a5a7792e1bd16ee518066c9 /lj_matrix
parent	96a3f2b2950451a478c951e642a4aa188219682b (diff)