Add simple ml methodology, fix bugs

author: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> 2020-02-27 16:50:57 -0700
committer: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> 2020-02-27 16:50:57 -0700
commit: c4d30a346d114185ab93eb5e88e241b922b5a538 (patch)
tree: c2bff1ba9802078904c57783e8bd397c36d030eb
parent: b937df41c1c5e996be94a3a690908ea989e281dc (diff)
4 files changed, 56 insertions, 22 deletions
diff --git a/ml_exp/__init__.py b/ml_exp/__init__.py
index c62141a58..48ece56c8 100644
--- a/ml_exp/__init__.py
+++ b/ml_exp/__init__.py
@@ -25,6 +25,7 @@ from ml_exp.representations import coulomb_matrix, lennard_jones_matrix,\
     first_neighbor_matrix, adjacency_matrix, check_bond, bag_of_stuff
 from ml_exp.math import cholesky_solve
 from ml_exp.qm7db import qm7db
+from ml_exp.do_ml import simple_ml, do_ml
 
 __all__ = ['Compound',
            'coulomb_matrix',
@@ -34,4 +35,6 @@ __all__ = ['Compound',
            'check_bond',
            'bag_of_stuff',
            'cholesky_solve',
-           'qm7db']
+           'qm7db',
+           'simple_ml',
+           'do_ml']
diff --git a/ml_exp/compound.py b/ml_exp/compound.py
index 9c7102860..1598fa482 100644
--- a/ml_exp/compound.py
+++ b/ml_exp/compound.py
@@ -93,8 +93,8 @@ class Compound:
                                         bohr_ru=bohr_ru)
 
     def gen_am(self,
-               size=23,
                use_forces=False,
+               size=23,
                bohr_ru=False):
         """
         Generate the Adjacency Matrix for the compund.
diff --git a/ml_exp/do_ml.py b/ml_exp/do_ml.py
index f32480367..40243d413 100644
--- a/ml_exp/do_ml.py
+++ b/ml_exp/do_ml.py
@@ -31,19 +31,19 @@ from ml_exp.qm7db import qm7db
 def simple_ml(descriptors,
               energies,
               training_size,
-              identifier=None,
               test_size=None,
               sigma=1000.0,
+              identifier=None,
               show_msgs=True):
     """
     Basic ML methodology for a single descriptor type.
     descriptors: array of descriptors.
     energies: array of energies.
     training_size: size of the training set to use.
-    identifier: string with the name of the descriptor used.
     test_size: size of the test set to use. If no size is given,
         the last remaining molecules are used.
     sigma: depth of the kernel.
+    identifier: string with the name of the descriptor used.
     show_msgs: if debug messages should be shown.
     NOTE: identifier is just a string and is only for identification purposes.
     Also, training is done with the first part of the data and
@@ -86,7 +86,7 @@ def simple_ml(descriptors,
 
     mae = np.mean(np.abs(Y_predicted - Y_test))
     if show_msgs:
-        printc('\tMAE for {identifier}: {mae:.4f}', 'GREEN')
+        printc(f'\tMAE for {identifier}: {mae:.4f}', 'GREEN')
 
     toc = time.perf_counter()
     tictoc = toc - tic
@@ -102,20 +102,27 @@ def simple_ml(descriptors,
 def do_ml(db_path='data',
           is_shuffled=True,
           r_seed=111,
+          diag_value=None,
+          lj_sigma=1.0,
+          lj_epsilon=1.0,
+          use_forces=False,
+          size=23,
+          as_eig=True,
+          bohr_ru=False,
+          sigma=1000.0,
           show_msgs=True):
     """
     Main function that does the whole ML process.
-    training_size: minimum training size.
-    test_size: size of the test set to use. If no size is given,
-        the last remaining molecules are used.
-    ljm_diag_value: if a special diagonal value should be used in lj matrix.
-    ljm_sigma: sigma value for lj matrix.
-    ljm_epsilon: epsilon value for lj matrix.
+    db_path: path to the database directory.
+    is_shuffled: if the resulting list of compounds should be shuffled.
     r_seed: random seed to use for the shuffling.
-    save_benchmarks: if benchmarks should be saved.
-    size: maximum amount of atoms in molecule.
-    as_eig: if data should be returned as matrix or array of eigenvalues.
-    bohr_radius_units: if units should be in bohr's radius units.
+    diag_value: if a special diagonal value is to be used in the lj matrix.
+    lj_sigma: sigma value for the lj matrix.
+    lj_epsilon: epsilon value for the lj matrix.
+    use_forces: if forces should be used instead of k_cx.
+    size: maximum amount of atoms in a compound.
+    as_eig: if the representation should be as the eigenvalues.
+    bohr_ru: if radius units should be in bohr's radius units.
     sigma: depth of the kernel.
     show_msgs: if debug messages should be shown.
     """
@@ -134,16 +141,40 @@ def do_ml(db_path='data',
     # Matrices calculation.
     tic = time.perf_counter()
     for compound in compounds:
-        compound.gen_cm()
-        compound.gen_ljm()
-        compound.gen_am()
+        compound.gen_cm(size=size,
+                        as_eig=as_eig,
+                        bohr_ru=bohr_ru)
+        compound.gen_ljm(diag_value=diag_value,
+                         sigma=lj_sigma,
+                         epsilon=lj_epsilon,
+                         size=size,
+                         as_eig=as_eig,
+                         bohr_ru=bohr_ru)
+        compound.gen_am(use_forces=use_forces,
+                        size=size,
+                        bohr_ru=bohr_ru)
+
+    # Create a numpy array for the descriptors.
+    cm_data = np.array([compound.cm for compound in compounds], dtype=float)
+    ljm_data = np.array([compound.ljm for compound in compounds], dtype=float)
+    am_data = np.array([compound.cm for compound in compounds], dtype=float)
+    print(cm_data.shape, ljm_data.shape, am_data.shape)
+
     toc = time.perf_counter()
     tictoc = toc - tic
     if show_msgs:
         printc(f'Matrices calculation took {tictoc:.4f} seconds.', 'CYAN')
 
     # ML calculation.
-    # PLHLDR
+    # CM
+    cm_mae, cm_tictoc = simple_ml(cm_data,
+                                  energy_pbe0,
+                                  training_size=5000,
+                                  test_size=1500,
+                                  sigma=1000.0,
+                                  identifier='CM',
+                                  show_msgs=show_msgs)
+    print(cm_mae, cm_tictoc)
 
     # End of program
     end_time = time.perf_counter()
diff --git a/ml_exp/kernels.py b/ml_exp/kernels.py
index 7a61a1e1e..3914ffc20 100644
--- a/ml_exp/kernels.py
+++ b/ml_exp/kernels.py
@@ -36,9 +36,9 @@ def gaussian_kernel(X1,
     inv_sigma = -0.5 / (sigma*sigma)
 
     K = np.zeros((X1.shape[0], X2.shape[0]), dtype=float)
-    for i in X1:
-        for j in X2:
-            f_norm = np.linalg.norm(i - j)
+    for i, x1 in enumerate(X1):
+        for j, x2 in enumerate(X2):
+            f_norm = np.linalg.norm(x1 - x2)
             # print(f_norm)
             K[i, j] = math.exp(inv_sigma * f_norm)
author	David Luevano Alvarado <55825613+luevano@users.noreply.github.com>	2020-02-27 16:50:57 -0700
committer	David Luevano Alvarado <55825613+luevano@users.noreply.github.com>	2020-02-27 16:50:57 -0700
commit	c4d30a346d114185ab93eb5e88e241b922b5a538 (patch)
tree	c2bff1ba9802078904c57783e8bd397c36d030eb
parent	b937df41c1c5e996be94a3a690908ea989e281dc (diff)