From da6b1428cda40534b820642d8e9be398d7fee8be Mon Sep 17 00:00:00 2001
From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com>
Date: Sat, 29 Feb 2020 09:06:54 -0700
Subject: Rewrite kernel, gotta fix adj mat

---
 ml_exp/compound.py |  1 -
 ml_exp/do_ml.py    | 29 +++++++++++++++++++++++------
 ml_exp/kernels.py  | 25 ++++++++++++++++++-------
 3 files changed, 41 insertions(+), 14 deletions(-)

diff --git a/ml_exp/compound.py b/ml_exp/compound.py
index ec5085797..f603ddd4e 100644
--- a/ml_exp/compound.py
+++ b/ml_exp/compound.py
@@ -126,7 +126,6 @@ class Compound:
         Generate the Adjacency Matrix for the compund.
         use_forces: if the use of forces instead of k_cx should be used.
         size: compound size.
-        bohr_ru: if radius units should be in bohr's radius units.
         """
         self.am = adjacency_matrix(self.fnm,
                                    self.bonds,
diff --git a/ml_exp/do_ml.py b/ml_exp/do_ml.py
index 3cb67675f..758bf01ae 100644
--- a/ml_exp/do_ml.py
+++ b/ml_exp/do_ml.py
@@ -33,6 +33,7 @@ def simple_ml(descriptors,
               training_size,
               test_size=None,
               sigma=1000.0,
+              opt=True,
               identifier=None,
               show_msgs=True):
     """
@@ -43,6 +44,7 @@ def simple_ml(descriptors,
     test_size: size of the test set to use. If no size is given,
         the last remaining molecules are used.
     sigma: depth of the kernel.
+    opt: if the optimized algorithm should be used. For benchmarking purposes.
     identifier: string with the name of the descriptor used.
     show_msgs: if debug messages should be shown.
     NOTE: identifier is just a string and is only for identification purposes.
@@ -76,13 +78,21 @@ def simple_ml(descriptors,
 
     X_training = descriptors[:training_size]
     Y_training = energies[:training_size]
-    K_training = gaussian_kernel(X_training, X_training, sigma)
-    alpha = cholesky_solve(K_training, Y_training)
+    K_training = gaussian_kernel(X_training,
+                                 X_training,
+                                 sigma,
+                                 opt=opt)
+    alpha = cholesky_solve(K_training,
+                           Y_training)
 
     X_test = descriptors[-test_size:]
     Y_test = energies[-test_size:]
-    K_test = gaussian_kernel(X_test, X_training, sigma)
-    Y_predicted = np.dot(K_test, alpha)
+    K_test = gaussian_kernel(X_test,
+                             X_training,
+                             sigma,
+                             opt=opt)
+    Y_predicted = np.dot(K_test,
+                         alpha)
 
     mae = np.mean(np.abs(Y_predicted - Y_test))
     if show_msgs:
@@ -113,6 +123,7 @@ def do_ml(db_path='data',
           training_size=1500,
           test_size=None,
           sigma=1000.0,
+          opt=True,
           identifiers=["CM"],
           show_msgs=True):
     """
@@ -132,6 +143,7 @@ def do_ml(db_path='data',
     test_size: size of the test set to use. If no size is given,
         the last remaining molecules are used.
     sigma: depth of the kernel.
+    opt: if the optimized algorithm should be used. For benchmarking purposes.
     identifiers: list of names (strings) of descriptors to use.
     show_msgs: if debug messages should be shown.
     """
@@ -165,9 +177,10 @@ def do_ml(db_path='data',
                              as_eig=as_eig,
                              bohr_ru=bohr_ru)
         if 'AM' in identifiers:
-            compound.gen_am(use_forces=use_forces,
-                            size=size,
+            compound.gen_hd(size=size,
                             bohr_ru=bohr_ru)
+            compound.gen_am(use_forces=use_forces,
+                            size=size)
         if 'BOB' in identifiers:
             compound.gen_bob(size=size)
 
@@ -193,6 +206,7 @@ def do_ml(db_path='data',
                                       training_size=training_size,
                                       test_size=test_size,
                                       sigma=sigma,
+                                      opt=opt,
                                       identifier='CM',
                                       show_msgs=show_msgs)
     if 'LJM' in identifiers:
@@ -201,6 +215,7 @@ def do_ml(db_path='data',
                                         training_size=training_size,
                                         test_size=test_size,
                                         sigma=sigma,
+                                        opt=opt,
                                         identifier='LJM',
                                         show_msgs=show_msgs)
     if 'AM' in identifiers:
@@ -209,6 +224,7 @@ def do_ml(db_path='data',
                                       training_size=training_size,
                                       test_size=test_size,
                                       sigma=sigma,
+                                      opt=opt,
-                                      identifier='CM',
+                                      identifier='AM',
                                       show_msgs=show_msgs)
     if 'BOB' in identifiers:
@@ -217,6 +233,7 @@ def do_ml(db_path='data',
                                         training_size=training_size,
                                         test_size=test_size,
                                         sigma=sigma,
+                                        opt=opt,
-                                        identifier='CM',
+                                        identifier='BOB',
                                         show_msgs=show_msgs)
 
diff --git a/ml_exp/kernels.py b/ml_exp/kernels.py
index 3914ffc20..e6855f97e 100644
--- a/ml_exp/kernels.py
+++ b/ml_exp/kernels.py
@@ -26,20 +26,31 @@ import numpy as np
 
 def gaussian_kernel(X1,
                     X2,
-                    sigma):
+                    sigma,
+                    opt=True):
     """
     Calculates the Gaussian Kernel.
     X1: first representations.
     X2: second representations.
     sigma: kernel width.
+    opt: if the optimized algorithm should be used. For benchmarking purposes.
     """
-    inv_sigma = -0.5 / (sigma*sigma)
+    i_sigma = -0.5 / (sigma*sigma)
 
     K = np.zeros((X1.shape[0], X2.shape[0]), dtype=float)
-    for i, x1 in enumerate(X1):
-        for j, x2 in enumerate(X2):
-            f_norm = np.linalg.norm(x1 - x2)
-            # print(f_norm)
-            K[i, j] = math.exp(inv_sigma * f_norm)
+    if opt:
+        # Faster way of calculating the kernel.
+        for i, x1 in enumerate(X1):
+            if X2.ndim == 3:
+                norm = np.linalg.norm(X2 - x1, axis=(1, 2))
+            else:
+                norm = np.linalg.norm(X2 - x1, axis=-1)
+            K[i, :] = np.exp(i_sigma * np.square(norm))
+    else:
+        for i, x1 in enumerate(X1):
+            for j, x2 in enumerate(X2):
+                f_norm = np.linalg.norm(x2 - x1)
+                # print(f_norm)
+                K[i, j] = math.exp(i_sigma * f_norm**2)
 
     return K
-- 
cgit v1.2.3-70-g09d2