Merge pull request #1 from luevano/add_parallel

Add parallel
author: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> 2020-01-23 18:29:21 -0700
committer: GitHub <noreply@github.com> 2020-01-23 18:29:21 -0700
commit: 7f122fdb38cd34916820d6ff4fb0e3a49fde80fc (patch)
tree: 47efddb979957029945a473fde6ed2cde2c2b196
parent: bd4fb4d77919bc75d3d181e124c3c5752a74dff3 (diff)
parent: 4704314c9b4d1066383da5c3d6ca87bba9067c8d (diff)
18 files changed, 769 insertions, 183 deletions
diff --git a/.gitignore b/.gitignore
index a1bdb4dde..02ab56ded 100644
--- a/.gitignore
+++ b/.gitignore
@@ -114,3 +114,7 @@ venv.bak/
 
 # Original data.
 .original_data/
+
+# Benchmarks and figures
+benchmarks.csv
+.figs/
+\ No newline at end of file
diff --git a/do_ml.py b/do_ml.py
deleted file mode 100644
index 63a6fc671..000000000
--- a/do_ml.py
+++ /dev/null
@@ -1,97 +0,0 @@
-"""MIT License
-
-Copyright (c) 2019 David Luevano Alvarado
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-"""
-import time
-from misc import printc
-import numpy as np
-from gauss_kernel import gauss_kernel
-from cholesky_solve import cholesky_solve
-
-
-def do_ml(desc_data,
-          energy_data,
-          training_size,
-          test_size=None,
-          sigma=1000.0,
-          desc_type=None,
-          show_msgs=True):
-    """
-    Does the ML methodology.
-    desc_data: descriptor (or representation) data.
-    energy_data: energy data associated with desc_data.
-    training_size: size of the training set to use.
-    test_size: size of the test set to use. If no size is given,
-        the last remaining molecules are used.
-    sigma: depth of the kernel.
-    desc_type: string with the name of the descriptor used.
-    show_msgs: Show debug messages or not.
-    NOTE: desc_type is just a string and is only for identification purposes.
-    Also, training is done with the first part of the data and
-    testing with the ending part of the data.
-    """
-    # Initial calculations for later use.
-    d_len = len(desc_data)
-    e_len = len(energy_data)
-
-    if not desc_type:
-        desc_type = 'NOT SPECIFIED'
-
-    if d_len != e_len:
-        printc(''.join(['ERROR. Descriptor data size different ',
-                        'than energy data size.']), 'RED')
-        return None
-
-    if training_size >= d_len:
-        printc('ERROR. Training size greater or equal than data size.', 'RED')
-        return None
-
-    if not test_size:
-        test_size = d_len - training_size
-
-    tic = time.perf_counter()
-    if show_msgs:
-        printc('{} ML started, with parameters:'.format(desc_type), 'CYAN')
-        printc('\tTraining size: {}'.format(training_size), 'BLUE')
-        printc('\tTest size: {}'.format(test_size), 'BLUE')
-        printc('\tSigma: {}'.format(sigma), 'BLUE')
-
-    Xcm_training = desc_data[:training_size]
-    Ycm_training = energy_data[:training_size]
-    Kcm_training = gauss_kernel(Xcm_training, Xcm_training, sigma)
-    alpha_cm = cholesky_solve(Kcm_training, Ycm_training)
-
-    Xcm_test = desc_data[-test_size:]
-    Ycm_test = energy_data[-test_size:]
-    Kcm_test = gauss_kernel(Xcm_test, Xcm_training, sigma)
-    Ycm_predicted = np.dot(Kcm_test, alpha_cm)
-
-    mae = np.mean(np.abs(Ycm_predicted - Ycm_test))
-    if show_msgs:
-        print('\tMAE for {}: {:.4f}'.format(desc_type, mae))
-
-    toc = time.perf_counter()
-    tictoc = toc - tic
-    if show_msgs:
-        printc('\t{} ML took {:.4f} seconds.'.format(desc_type, tictoc),
-               'GREEN')
-
-    return mae, tictoc
diff --git a/lj_matrix/__init__.py b/lj_matrix/__init__.py
new file mode 100644
index 000000000..a430aac68
--- /dev/null
+++ b/lj_matrix/__init__.py
@@ -0,0 +1,48 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+from lj_matrix.read_qm7_data import read_nc_data, read_db_data, read_qm7_data
+from lj_matrix.c_matrix import c_matrix, c_matrix_multiple
+from lj_matrix.lj_matrix import lj_matrix, lj_matrix_multiple
+from lj_matrix.frob_norm import frob_norm
+from lj_matrix.gauss_kernel import gauss_kernel
+from lj_matrix.cholesky_solve import cholesky_solve
+from lj_matrix.do_ml import do_ml
+from lj_matrix.parallel_create_matrices import parallel_create_matrices
+from lj_matrix.misc import plot_benchmarks
+
+
+# If somebody does "from package import *", this is what they will
+# be able to access:
+__all__ = ['read_nc_data',
+           'read_db_data',
+           'read_qm7_data',
+           'c_matrix',
+           'c_matrix_multiple',
+           'lj_matrix',
+           'lj_matrix_multiple',
+           'frob_norm',
+           'gauss_kernel',
+           'cholesky_solve',
+           'do_ml',
+           'parallel_create_matrices',
+           'plot_benchmarks']
diff --git a/misc.py b/lj_matrix/__main__.py
index c50653a5c..688e5adcc 100644
--- a/misc.py
+++ b/lj_matrix/__main__.py
@@ -20,34 +20,19 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 """
-from colorama import init, Fore, Style
+from lj_matrix.do_ml import do_ml
+# from lj_matrix.misc import plot_benchmarks
 
-init()
-
-
-def printc(text, color):
-    """
-    Prints texts normaly, but in color. Using colorama.
-    text: string with the text to print.
-    color: color to be used, same as available in colorama.
-    """
-    color_dic = {'BLACK': Fore.BLACK,
-                 'RED': Fore.RED,
-                 'GREEN': Fore.GREEN,
-                 'YELLOW': Fore.YELLOW,
-                 'BLUE': Fore.BLUE,
-                 'MAGENTA': Fore.MAGENTA,
-                 'CYAN': Fore.CYAN,
-                 'WHITE': Fore.WHITE,
-                 'RESET': Fore.RESET}
-
-    color_dic_keys = color_dic.keys()
-    if color not in color_dic_keys:
-        print(Fore.RED
-              + '\'{}\' not found, using default color.'.format(color)
-              + Style.RESET_ALL)
-        actual_color = Fore.RESET
-    else:
-        actual_color = color_dic[color]
-
-    print(actual_color + text + Style.RESET_ALL)
+if __name__ == '__main__':
+    do_ml(min_training_size=1500,
+          max_training_size=2000,
+          training_increment_size=500,
+          test_size=None,
+          ljm_diag_value=None,
+          ljm_sigma=1.0,
+          ljm_epsilon=1.0,
+          r_seed=111,
+          save_benchmarks=False,
+          show_msgs=True)
+    # plot_benchmarks()
+    print('OK!')
diff --git a/c_matrix.py b/lj_matrix/c_matrix.py
index 2bc4d4c0c..f21ccfd8c 100644
--- a/c_matrix.py
+++ b/lj_matrix/c_matrix.py
@@ -21,16 +21,16 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 """
 import time
-from misc import printc
 import math
 import numpy as np
 from numpy.linalg import eig
+from lj_matrix.misc import printc
 
 
 def c_matrix(mol_data,
              nc_data,
              max_len=25,
-             as_eig=False,
+             as_eig=True,
              bohr_radius_units=False):
     """
     Creates the Coulomb Matrix from the molecule data given.
@@ -150,13 +150,16 @@ def c_matrix(mol_data,
 
 def c_matrix_multiple(mol_data,
                       nc_data,
+                      pipe=None,
                       max_len=25,
-                      as_eig=False,
+                      as_eig=True,
                       bohr_radius_units=False):
     """
     Calculates the Coulomb Matrix of multiple molecules.
     mol_data: molecule data, matrix of atom coordinates.
     nc_data: nuclear charge data, array of atom data.
+    pipe: for multiprocessing purposes. Sends the data calculated
+        through a pipe.
     max_len: maximum amount of atoms in molecule.
     as_eig: if data should be returned as matrix or array of eigenvalues.
     bohr_radius_units: if units should be in bohr's radius units.
@@ -170,4 +173,7 @@ def c_matrix_multiple(mol_data,
     toc = time.perf_counter()
     printc('\tCM calculation took {:.4f} seconds.'.format(toc - tic), 'GREEN')
 
+    if pipe:
+        pipe.send(cm_data)
+
     return cm_data
diff --git a/cholesky_solve.py b/lj_matrix/cholesky_solve.py
index bc6a572a3..bc6a572a3 100644
--- a/cholesky_solve.py
+++ b/lj_matrix/cholesky_solve.py
diff --git a/lj_matrix/do_ml.py b/lj_matrix/do_ml.py
new file mode 100644
index 000000000..25a55e823
--- /dev/null
+++ b/lj_matrix/do_ml.py
@@ -0,0 +1,227 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+import time
+import numpy as np
+from multiprocessing import Process, Pipe
+from lj_matrix.misc import printc
+from lj_matrix.gauss_kernel import gauss_kernel
+from lj_matrix.cholesky_solve import cholesky_solve
+from lj_matrix.read_qm7_data import read_qm7_data
+from lj_matrix.parallel_create_matrices import parallel_create_matrices
+
+
+def ml(desc_data,
+       energy_data,
+       training_size,
+       desc_type=None,
+       pipe=None,
+       test_size=None,
+       sigma=1000.0,
+       show_msgs=True):
+    """
+    Does the ML methodology.
+    desc_data: descriptor (or representation) data.
+    energy_data: energy data associated with desc_data.
+    training_size: size of the training set to use.
+    desc_type: string with the name of the descriptor used.
+    pipe: for multiprocessing purposes. Sends the data calculated
+        through a pipe.
+    test_size: size of the test set to use. If no size is given,
+        the last remaining molecules are used (capped at 1500).
+    sigma: depth of the kernel.
+    show_msgs: Show debug messages or not.
+    NOTE: desc_type is just a string and is only for identification purposes.
+    Also, training is done with the first part of the data and
+    testing with the ending part of the data.
+    """
+    tic = time.perf_counter()
+    # Initial calculations for later use.
+    d_len = len(desc_data)
+    e_len = len(energy_data)
+
+    if not desc_type:
+        desc_type = 'NOT SPECIFIED'
+
+    if d_len != e_len:
+        printc(''.join(['ERROR. Descriptor data size different ',
+                        'than energy data size.']), 'RED')
+        return None
+
+    if training_size >= d_len:
+        printc('ERROR. Training size greater or equal than data size.', 'RED')
+        return None
+
+    if not test_size:
+        test_size = d_len - training_size
+        if test_size > 1500:
+            test_size = 1500
+
+    if show_msgs:
+        printc('{} ML started.'.format(desc_type), 'GREEN')
+        printc('\tTraining size: {}'.format(training_size), 'CYAN')
+        printc('\tTest size: {}'.format(test_size), 'CYAN')
+        printc('\tSigma: {}'.format(sigma), 'CYAN')
+
+    X_training = desc_data[:training_size]
+    Y_training = energy_data[:training_size]
+    K_training = gauss_kernel(X_training, X_training, sigma)
+    alpha_ = cholesky_solve(K_training, Y_training)
+
+    X_test = desc_data[-test_size:]
+    Y_test = energy_data[-test_size:]
+    K_test = gauss_kernel(X_test, X_training, sigma)
+    Y_predicted = np.dot(K_test, alpha_)
+
+    mae = np.mean(np.abs(Y_predicted - Y_test))
+    if show_msgs:
+        printc('\tMAE for {}: {:.4f}'.format(desc_type, mae), 'GREEN')
+
+    toc = time.perf_counter()
+    tictoc = toc - tic
+    if show_msgs:
+        printc('\t{} ML took {:.4f} seconds.'.format(desc_type, tictoc),
+               'GREEN')
+        printc('\t\tTraining size: {}'.format(training_size), 'CYAN')
+        printc('\t\tTest size: {}'.format(test_size), 'CYAN')
+        printc('\t\tSigma: {}'.format(sigma), 'CYAN')
+
+    if pipe:
+        pipe.send([desc_type, training_size, test_size, sigma, mae, tictoc])
+
+    return mae, tictoc
+
+
+def do_ml(min_training_size,
+          max_training_size=None,
+          training_increment_size=500,
+          test_size=None,
+          ljm_diag_value=None,
+          ljm_sigma=1.0,
+          ljm_epsilon=1.0,
+          r_seed=111,
+          save_benchmarks=False,
+          max_len=25,
+          as_eig=True,
+          bohr_radius_units=False,
+          sigma=1000.0,
+          show_msgs=True):
+    """
+    Main function that does the whole ML process.
+    min_training_size: minimum training size.
+    max_training_size: maximum training size.
+    training_increment_size: training increment size.
+    test_size: size of the test set to use. If no size is given,
+        the last remaining molecules are used (capped at 1500 by ml).
+    ljm_diag_value: if a special diagonal value should be used in lj matrix.
+    ljm_sigma: sigma value for lj matrix.
+    ljm_epsilon: epsilon value for lj matrix.
+    r_seed: random seed to use for the shuffling.
+    save_benchmarks: if benchmarks should be saved.
+    max_len: maximum amount of atoms in molecule.
+    as_eig: if data should be returned as matrix or array of eigenvalues.
+    bohr_radius_units: if units should be in bohr's radius units.
+    sigma: depth of the kernel.
+    show_msgs: Show debug messages or not.
+    """
+    # Initialization time.
+    init_time = time.perf_counter()
+    if not max_training_size:
+        max_training_size = min_training_size + training_increment_size
+
+    # Data reading.
+    molecules, nuclear_charge, energy_pbe0, energy_delta =\
+        read_qm7_data(r_seed)
+
+    # Matrices calculation.
+    cm_data, ljm_data = parallel_create_matrices(molecules,
+                                                 nuclear_charge,
+                                                 ljm_diag_value,
+                                                 ljm_sigma,
+                                                 ljm_epsilon,
+                                                 max_len,
+                                                 as_eig,
+                                                 bohr_radius_units)
+
+    # ML calculation.
+    procs = []
+    cm_pipes = []
+    ljm_pipes = []
+    for i in range(min_training_size,
+                   max_training_size + 1,
+                   training_increment_size):
+        cm_recv, cm_send = Pipe(False)
+        p1 = Process(target=ml,
+                     args=(cm_data,
+                           energy_pbe0,
+                           i,
+                           'CM',
+                           cm_send,
+                           test_size,
+                           sigma,
+                           show_msgs))
+        procs.append(p1)
+        cm_pipes.append(cm_recv)
+        p1.start()
+
+        ljm_recv, ljm_send = Pipe(False)
+        p2 = Process(target=ml,
+                     args=(ljm_data,
+                           energy_pbe0,
+                           i,
+                           'L-JM',
+                           ljm_send,
+                           test_size,
+                           sigma,
+                           show_msgs))
+        procs.append(p2)
+        ljm_pipes.append(ljm_recv)
+        p2.start()
+
+    cm_bench_results = []
+    ljm_bench_results = []
+    for cd_pipe, ljd_pipe in zip(cm_pipes, ljm_pipes):
+        cm_bench_results.append(cd_pipe.recv())
+        ljm_bench_results.append(ljd_pipe.recv())
+
+    for proc in procs:
+        proc.join()
+
+    if save_benchmarks:
+        with open('data\\benchmarks.csv', 'a') as save_file:
+            # save_file.write(''.join(['ml_type,tr_size,te_size,kernel_s,',
+            #                          'mae,time,lj_s,lj_e,date_ran\n']))
+            ltime = time.localtime()[:3][::-1]
+            ljm_se = ',' + str(ljm_sigma) + ',' + str(ljm_epsilon) + ','
+            date = '/'.join([str(field) for field in ltime])
+            for cm, ljm, in zip(cm_bench_results, ljm_bench_results):
+                cm_text = ','.join([str(field) for field in cm])\
+                    + ',' + date + '\n'
+                ljm_text = ','.join([str(field) for field in ljm])\
+                    + ljm_se + date + '\n'
+                save_file.write(cm_text)
+                save_file.write(ljm_text)
+
+    # End of program
+    end_time = time.perf_counter()
+    printc('Program took {:.4f} seconds.'.format(end_time - init_time),
+           'CYAN')
diff --git a/frob_norm.py b/lj_matrix/frob_norm.py
index 4c3a2945d..4c3a2945d 100644
--- a/frob_norm.py
+++ b/lj_matrix/frob_norm.py
diff --git a/gauss_kernel.py b/lj_matrix/gauss_kernel.py
index 0dfc65d59..5dd8e6406 100644
--- a/gauss_kernel.py
+++ b/lj_matrix/gauss_kernel.py
@@ -22,7 +22,7 @@ SOFTWARE.
 """
 import math
 import numpy as np
-from frob_norm import frob_norm
+from lj_matrix.frob_norm import frob_norm
 
 
 def gauss_kernel(X_1, X_2, sigma):
diff --git a/lj_matrix.py b/lj_matrix/lj_matrix.py
index 6769bc0c3..6739ae283 100644
--- a/lj_matrix.py
+++ b/lj_matrix/lj_matrix.py
@@ -21,21 +21,27 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 """
 import time
-from misc import printc
 import math
 import numpy as np
 from numpy.linalg import eig
+from lj_matrix.misc import printc
 
 
 def lj_matrix(mol_data,
               nc_data,
+              diag_value=None,
+              sigma=1.0,
+              epsilon=1.0,
               max_len=25,
-              as_eig=False,
+              as_eig=True,
               bohr_radius_units=False):
     """
     Creates the Lennard-Jones Matrix from the molecule data given.
     mol_data: molecule data, matrix of atom coordinates.
     nc_data: nuclear charge data, array of atom data.
+    diag_value: if special diagonal value is to be used.
+    sigma: sigma value.
+    epsilon: epsilon value.
     max_len: maximum amount of atoms in molecule.
     as_eig: if data should be returned as matrix or array of eigenvalues.
     bohr_radius_units: if units should be in bohr's radius units.
@@ -82,7 +88,10 @@ def lj_matrix(mol_data,
                         z = (z_i-z_j)**2
 
                         if i == j:
-                            lj[i, j] = (0.5*Z_i**2.4)
+                            if diag_value is None:
+                                lj[i, j] = (0.5*Z_i**2.4)
+                            else:
+                                lj[i, j] = diag_value
                         else:
                             # Calculations are done after i==j is checked
                             # so no division by zero is done.
@@ -92,11 +101,11 @@ def lj_matrix(mol_data,
                             # Conversion factor is included in r^2.
 
                             # 1/r^2
-                            r_2 = 1/(conversion_rate**2*(x + y + z))
+                            r_2 = sigma**2/(conversion_rate**2*(x + y + z))
 
                             r_6 = math.pow(r_2, 3)
                             r_12 = math.pow(r_6, 2)
-                            lj[i, j] = (4*(r_12 - r_6))
+                            lj[i, j] = (4*epsilon*(r_12 - r_6))
                     else:
                         break
 
@@ -140,7 +149,10 @@ def lj_matrix(mol_data,
                     z = (z_i-z_j)**2
 
                     if i == j:
-                        lj_row.append(0.5*Z_i**2.4)
+                        if not diag_value:
+                            lj_row.append(0.5*Z_i**2.4)
+                        else:
+                            lj_row.append(diag_value)
                     else:
                         # Calculations are done after i==j is checked
                         # so no division by zero is done.
@@ -150,11 +162,11 @@ def lj_matrix(mol_data,
                         # Conversion factor is included in r^2.
 
                         # 1/r^2
-                        r_2 = 1/(conversion_rate**2*(x + y + z))
+                        r_2 = sigma**2/(conversion_rate**2*(x + y + z))
 
                         r_6 = math.pow(r_2, 3)
                         r_12 = math.pow(r_6, 2)
-                        lj_row.append(4*(r_12 - r_6))
+                        lj_row.append(4*epsilon*(r_12 - r_6))
 
                 lj_temp.append(np.array(lj_row))
 
@@ -168,13 +180,22 @@ def lj_matrix(mol_data,
 
 def lj_matrix_multiple(mol_data,
                        nc_data,
+                       pipe=None,
+                       diag_value=None,
+                       sigma=1.0,
+                       epsilon=1.0,
                        max_len=25,
-                       as_eig=False,
+                       as_eig=True,
                        bohr_radius_units=False):
     """
     Calculates the Lennard-Jones Matrix of multiple molecules.
     mol_data: molecule data, matrix of atom coordinates.
     nc_data: nuclear charge data, array of atom data.
+    pipe: for multiprocessing purposes. Sends the data calculated
+        through a pipe.
+    diag_value: if special diagonal value is to be used.
+    sigma: sigma value.
+    epsilon: epsilon value.
     max_len: maximum amount of atoms in molecule.
     as_eig: if data should be returned as matrix or array of eigenvalues.
     bohr_radius_units: if units should be in bohr's radius units.
@@ -182,10 +203,20 @@ def lj_matrix_multiple(mol_data,
     printc('L-J Matrices calculation started.', 'CYAN')
     tic = time.perf_counter()
 
-    ljm_data = np.array([lj_matrix(mol, nc, max_len, as_eig, bohr_radius_units)
+    ljm_data = np.array([lj_matrix(mol,
+                                   nc,
+                                   diag_value,
+                                   sigma,
+                                   epsilon,
+                                   max_len,
+                                   as_eig,
+                                   bohr_radius_units)
                         for mol, nc in zip(mol_data, nc_data)])
 
     toc = time.perf_counter()
     printc('\tL-JM calculation took {:.4f} seconds.'.format(toc-tic), 'GREEN')
 
+    if pipe:
+        pipe.send(ljm_data)
+
     return ljm_data
diff --git a/lj_matrix/misc.py b/lj_matrix/misc.py
new file mode 100644
index 000000000..e9142b05f
--- /dev/null
+++ b/lj_matrix/misc.py
@@ -0,0 +1,174 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+from colorama import init, Fore, Style
+import pandas as pd
+
+init()
+
+
+def printc(text, color):
+    """
+    Prints text normally, but in color, using colorama.
+    text: string with the text to print.
+    color: color to be used, same as available in colorama.
+    """
+    color_dic = {'BLACK': Fore.BLACK,
+                 'RED': Fore.RED,
+                 'GREEN': Fore.GREEN,
+                 'YELLOW': Fore.YELLOW,
+                 'BLUE': Fore.BLUE,
+                 'MAGENTA': Fore.MAGENTA,
+                 'CYAN': Fore.CYAN,
+                 'WHITE': Fore.WHITE,
+                 'RESET': Fore.RESET}
+
+    color_dic_keys = color_dic.keys()
+    if color not in color_dic_keys:
+        print(Fore.RED
+              + '\'{}\' not found, using default color.'.format(color)
+              + Style.RESET_ALL)
+        actual_color = Fore.RESET
+    else:
+        actual_color = color_dic[color]
+
+    print(actual_color + text + Style.RESET_ALL)
+
+
+def plot_benchmarks():
+    """
+    For plotting the benchmarks.
+    """
+    # Original columns.
+    or_cols = ['ml_type',
+               'tr_size',
+               'te_size',
+               'kernel_s',
+               'mae',
+               'time',
+               'lj_s',
+               'lj_e',
+               'date_ran']
+    # Drop some original columns.
+    dor_cols = ['te_size',
+                'kernel_s',
+                'time',
+                'date_ran']
+
+    # Read benchmarks data and drop some columns.
+    data_temp = pd.read_csv('data\\benchmarks.csv',)
+    data = pd.DataFrame(data_temp, columns=or_cols)
+    data = data.drop(columns=dor_cols)
+
+    # Get the data of the first benchmarks and drop unnecessary columns.
+    first_data = pd.DataFrame(data, index=range(0, 22))
+    first_data = first_data.drop(columns=['lj_s', 'lj_e'])
+
+    # Columns to keep temporarily.
+    fd_columns = ['ml_type',
+                  'tr_size',
+                  'mae']
+
+    # Create new dataframes for each matrix descriptor and fill them.
+    first_data_cm = pd.DataFrame(columns=fd_columns)
+    first_data_ljm = pd.DataFrame(columns=fd_columns)
+    for i in range(first_data.shape[0]):
+        temp_df = first_data.iloc[[i]]
+        if first_data.at[i, 'ml_type'] == 'CM':
+            first_data_cm = first_data_cm.append(temp_df)
+        else:
+            first_data_ljm = first_data_ljm.append(temp_df)
+
+    # Drop unnecessary column and rename 'mae' for later use.
+    first_data_cm = first_data_cm.drop(columns=['ml_type'])\
+        .rename(columns={'mae': 'cm_mae'})
+    first_data_ljm = first_data_ljm.drop(columns=['ml_type'])\
+        .rename(columns={'mae': 'ljm_mae'})
+    # print(first_data_cm)
+    # print(first_data_ljm)
+
+    # Get the cm data axis so it can be joined with the ljm data axis.
+    cm_axis = first_data_cm.plot(x='tr_size',
+                                 y='cm_mae',
+                                 kind='line')
+    # Get the ljm data axis and join it with the cm one.
+    plot_axis = first_data_ljm.plot(ax=cm_axis,
+                                    x='tr_size',
+                                    y='ljm_mae',
+                                    kind='line')
+    plot_axis.set_xlabel('tr_size')
+    plot_axis.set_ylabel('mae')
+    plot_axis.set_title('mae for different tr_sizes')
+    # Get the figure and save it.
+    # plot_axis.get_figure().savefig('.figs\\mae_diff_tr_sizes.pdf')
+
+    # Get the rest of the benchmark data and drop unnecessary column.
+    new_data = data.drop(index=range(0, 22))
+    new_data = new_data.drop(columns=['ml_type'])
+
+    # Get the first set and rename it.
+    nd_first = first_data_ljm.rename(columns={'ljm_mae': '1, 1'})
+    ndf_axis = nd_first.plot(x='tr_size',
+                             y='1, 1',
+                             kind='line')
+    last_axis = ndf_axis
+    for i in range(22, 99, 11):
+        lj_s = new_data['lj_s'][i]
+        lj_e = new_data['lj_e'][i]
+        new_mae = '{}, {}'.format(lj_s, lj_e)
+        nd_temp = pd.DataFrame(new_data, index=range(i, i + 11))\
+            .drop(columns=['lj_s', 'lj_e'])\
+            .rename(columns={'mae': new_mae})
+        last_axis = nd_temp.plot(ax=last_axis,
+                                 x='tr_size',
+                                 y=new_mae,
+                                 kind='line')
+        print(nd_temp)
+
+    last_axis.set_xlabel('tr_size')
+    last_axis.set_ylabel('mae')
+    last_axis.set_title('mae for different parameters of lj(s)')
+
+    last_axis.get_figure().savefig('.figs\\mae_diff_param_lj_s.pdf')
+
+    ndf_axis = nd_first.plot(x='tr_size',
+                             y='1, 1',
+                             kind='line')
+    last_axis = ndf_axis
+    for i in range(99, data.shape[0], 11):
+        lj_s = new_data['lj_s'][i]
+        lj_e = new_data['lj_e'][i]
+        new_mae = '{}, {}'.format(lj_s, lj_e)
+        nd_temp = pd.DataFrame(new_data, index=range(i, i + 11))\
+            .drop(columns=['lj_s', 'lj_e'])\
+            .rename(columns={'mae': new_mae})
+        last_axis = nd_temp.plot(ax=last_axis,
+                                 x='tr_size',
+                                 y=new_mae,
+                                 kind='line')
+        print(nd_temp)
+
+    last_axis.set_xlabel('tr_size')
+    last_axis.set_ylabel('mae')
+    last_axis.set_title('mae for different parameters of lj(e)')
+
+    last_axis.get_figure().savefig('.figs\\mae_diff_param_lj_e.pdf')
diff --git a/lj_matrix/parallel_create_matrices.py b/lj_matrix/parallel_create_matrices.py
new file mode 100644
index 000000000..cd5ef5c8e
--- /dev/null
+++ b/lj_matrix/parallel_create_matrices.py
@@ -0,0 +1,85 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+from multiprocessing import Process, Pipe
+from lj_matrix.c_matrix import c_matrix_multiple
+from lj_matrix.lj_matrix import lj_matrix_multiple
+
+
+def parallel_create_matrices(mol_data,
+                             nc_data,
+                             ljm_diag_value=None,
+                             ljm_sigma=1.0,
+                             ljm_epsilon=1.0,
+                             max_len=25,
+                             as_eig=True,
+                             bohr_radius_units=False):
+    """
+    Creates the Coulomb and L-J matrices in parallel.
+    mol_data: molecule data, matrix of atom coordinates.
+    nc_data: nuclear charge data, array of atom data.
+    ljm_diag_value: if special diagonal value is to be used for lj matrix.
+    ljm_sigma: sigma value for lj matrix.
+    ljm_epsilon: epsilon value for lj matrix.
+    max_len: maximum amount of atoms in molecule.
+    as_eig: if data should be returned as matrix or array of eigenvalues.
+    bohr_radius_units: if units should be in Bohr radius units.
+    """
+
+    # Matrices calculation.
+    procs = []
+    pipes = []
+
+    cm_recv, cm_send = Pipe(False)
+    p1 = Process(target=c_matrix_multiple,
+                 args=(mol_data,
+                       nc_data,
+                       cm_send,
+                       max_len,
+                       as_eig,
+                       bohr_radius_units))
+    procs.append(p1)
+    pipes.append(cm_recv)
+    p1.start()
+
+    ljm_recv, ljm_send = Pipe(False)
+    p2 = Process(target=lj_matrix_multiple,
+                 args=(mol_data,
+                       nc_data,
+                       ljm_send,
+                       ljm_diag_value,
+                       ljm_sigma,
+                       ljm_epsilon,
+                       max_len,
+                       as_eig,
+                       bohr_radius_units))
+    procs.append(p2)
+    pipes.append(ljm_recv)
+    p2.start()
+
+    cm_data = pipes[0].recv()
+    ljm_data = pipes[1].recv()
+
+    for proc in procs:
+        proc.join()
+
+    return cm_data, ljm_data
diff --git a/read_qm7_data.py b/lj_matrix/read_qm7_data.py
index 068ea1a42..4401ca1c0 100644
--- a/read_qm7_data.py
+++ b/lj_matrix/read_qm7_data.py
@@ -24,7 +24,7 @@ import os
 import time
 import numpy as np
 import random
-from misc import printc
+from lj_matrix.misc import printc
 
 
 # 'periodic_table_of_elements.txt' retrieved from
@@ -51,7 +51,7 @@ def read_nc_data(data_path):
 
 # 'hof_qm7.txt.txt' retrieved from
 # https://github.com/qmlcode/tutorial
-def reas_db_data(zi_data,
+def read_db_data(zi_data,
                  data_path,
                  r_seed=111):
     """
@@ -59,7 +59,7 @@ def reas_db_data(zi_data,
     its contents as usable variables.
     zi_data: dictionary containing nuclear charge data.
     data_path: path to the data directory.
-    r_seed: random seed.
+    r_seed: random seed to use for the shuffling.
     """
     os.chdir(data_path)
 
@@ -122,9 +122,10 @@ def reas_db_data(zi_data,
     return molecules, nuclear_charge, energy_pbe0, energy_delta
 
 
-def read_qm7_data():
+def read_qm7_data(r_seed=111):
     """
     Reads all the qm7 data.
+    r_seed: random seed to use for the shuffling.
     """
     tic = time.perf_counter()
     printc('Data reading started.', 'CYAN')
@@ -135,10 +136,10 @@ def read_qm7_data():
 
     zi_data = read_nc_data(data_path)
     molecules, nuclear_charge, energy_pbe0, energy_delta = \
-        reas_db_data(zi_data, data_path)
+        read_db_data(zi_data, data_path, r_seed)
 
     os.chdir(init_path)
     toc = time.perf_counter()
     printc('\tData reading took {:.4f} seconds.'.format(toc-tic), 'GREEN')
 
-    return zi_data, molecules, nuclear_charge, energy_pbe0, energy_delta
+    return molecules, nuclear_charge, energy_pbe0, energy_delta
diff --git a/lj_matrix/version.py b/lj_matrix/version.py
new file mode 100644
index 000000000..fab58433d
--- /dev/null
+++ b/lj_matrix/version.py
@@ -0,0 +1,23 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+__version__ = '0.0.1'
diff --git a/requirements.txt b/requirements.txt
index f91fd71c2..28b557ddb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,4 @@
-colorama==0.4.1
-numpy==1.17.4
+colorama==0.4.3
+numpy==1.18.0
+pandas==0.25.3
+matplotlib==3.1.2
+\ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 000000000..719ef3ce0
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,102 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+# This setup.py template was obtained from
+# https://github.com/navdeep-G/setup.py/blob/master/setup.py
+# ----------------------------------------------------------------------
+# Note: To use the 'upload' functionality of this file, you must:
+#   $ pipenv install twine --dev
+
+import io
+import os
+
+from setuptools import find_packages, setup
+
+from lj_matrix.version import __version__
+
+# Package meta-data.
+NAME = 'lj_matrix'
+DESCRIPTION = 'A Lennard Jones matrix exploration.'
+URL = 'https://github.com/luevano/lj_matrix'
+EMAIL = 'a301436@uach.mx'
+AUTHOR = 'David Luevano Alvarado'
+REQUIRES_PYTHON = '>=3.7'
+VERSION = __version__
+# VERSION = '0.0.1'
+
+# What packages are required for this module to be executed?
+REQUIRED = [
+    # 'requests', 'maya', 'records',
+]
+
+# What packages are optional?
+EXTRAS = {
+    # 'fancy feature': ['django'],
+}
+
+# The rest you shouldn't have to touch too much :)
+# ------------------------------------------------
+# Except, perhaps the License and Trove Classifiers!
+# If you do change the License, remember to change
+# the Trove Classifier for that!
+
+here = os.path.abspath(os.path.dirname(__file__))
+
+# Import the README and use it as the long-description.
+# Note: this will only work if 'README.md'
+# is present in your MANIFEST.in file!
+try:
+    with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
+        long_description = '\n' + f.read()
+except FileNotFoundError:
+    long_description = DESCRIPTION
+
+# Where the magic happens:
+setup(
+    name=NAME,
+    version=VERSION,
+    description=DESCRIPTION,
+    long_description=long_description,
+    long_description_content_type='text/markdown',
+    author=AUTHOR,
+    author_email=EMAIL,
+    python_requires=REQUIRES_PYTHON,
+    url=URL,
+    packages=find_packages(exclude=["tests",
+                                    "*.tests",
+                                    "*.tests.*",
+                                    "tests.*"]),
+    # If your package is a single module, use this instead of 'packages':
+    # py_modules=['mypackage'],
+    install_requires=REQUIRED,
+    extras_require=EXTRAS,
+    include_package_data=True,
+    license='MIT',
+    classifiers=[
+        # Trove classifiers
+        # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
+        'License :: OSI Approved :: MIT License',
+        'Programming Language :: Python',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.7'
+    ]
+)
diff --git a/test/__init__.py b/test/__init__.py
new file mode 100644
index 000000000..48cd14913
--- /dev/null
+++ b/test/__init__.py
@@ -0,0 +1,22 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
diff --git a/main.py b/test/test_c_matrix.py
index 734069920..a8bb5ae34 100644
--- a/main.py
+++ b/test/test_c_matrix.py
@@ -20,41 +20,14 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 """
-import time
-from misc import printc
-# import matplotlib.pyplot as plt
-from read_qm7_data import read_qm7_data
-from c_matrix import c_matrix_multiple
-from lj_matrix import lj_matrix_multiple
-from do_ml import do_ml
+import unittest
+from lj_matrix.c_matrix import c_matrix
 
 
-# Initialization time.
-init_time = time.perf_counter()
+class TestCMatrix(unittest.TestCase):
+    def test_c_matrix(self):
+        self.assertAlmostEqual(1, 1)
 
-# Data reading.
-zi_data, molecules, nuclear_charge, energy_pbe0, energy_delta =\
-    read_qm7_data()
 
-# Matrices calculation.
-cm_data = c_matrix_multiple(molecules, nuclear_charge, as_eig=True)
-ljm_data = lj_matrix_multiple(molecules, nuclear_charge, as_eig=True)
-
-# ML calculation.
-do_ml(cm_data,
-      energy_pbe0,
-      1000,
-      test_size=100,
-      sigma=1000.0,
-      desc_type='CM')
-do_ml(ljm_data,
-      energy_pbe0,
-      1000,
-      test_size=100,
-      sigma=1000.0,
-      desc_type='L-JM')
-
-# End of program
-end_time = time.perf_counter()
-printc('Program took {:.4f} seconds of runtime.'.format(end_time - init_time),
-       'CYAN')
+if __name__ == '__main__':
+    unittest.main()
author	David Luevano Alvarado <55825613+luevano@users.noreply.github.com>	2020-01-23 18:29:21 -0700
committer	GitHub <noreply@github.com>	2020-01-23 18:29:21 -0700
commit	7f122fdb38cd34916820d6ff4fb0e3a49fde80fc (patch)
tree	47efddb979957029945a473fde6ed2cde2c2b196
parent	bd4fb4d77919bc75d3d181e124c3c5752a74dff3 (diff)
parent	4704314c9b4d1066383da5c3d6ca87bba9067c8d (diff)