From cf88764c4571cdb6dec2c69fb14288adf377c0e3 Mon Sep 17 00:00:00 2001
From: David Luevano <55825613+luevano@users.noreply.github.com>
Date: Sat, 1 Feb 2020 13:55:06 -0700
Subject: Add bag of k_cx and fix bug on adj_matrix.py

---
 ml_exp/__main__.py   | 11 +++++++-
 ml_exp/adj_matrix.py | 21 ++++++++++----
 ml_exp/bostuff.py    | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 103 insertions(+), 6 deletions(-)

diff --git a/ml_exp/__main__.py b/ml_exp/__main__.py
index 9b4a18cc3..a1f2f0030 100644
--- a/ml_exp/__main__.py
+++ b/ml_exp/__main__.py
@@ -23,9 +23,10 @@ SOFTWARE.
 # from ml_exp.do_ml import do_ml
 # from ml_exp.misc import plot_benchmarks
 from ml_exp.read_qm7_data import read_qm7_data
-# from ml_exp.adj_matrix import fneig_matrix, adj_matrix
+from ml_exp.adj_matrix import fneig_matrix, adj_matrix
 from ml_exp.c_matrix import c_matrix
 from ml_exp.bob import bob
+from ml_exp.bostuff import bok_cx
 
 if __name__ == '__main__':
     """
@@ -58,5 +59,13 @@ if __name__ == '__main__':
     for i in range(1):
         cm = c_matrix(xyz[i], nc[i], as_eig=False)
         bob = bob(cm, atoms[i])
+
         print(f'{i} coulomb matrix\n{cm}')
         print(f'{i} bag of bonds\n{bob}')
+
+        fnm, bonds = fneig_matrix(atoms[i], xyz[i], nc[i], False)
+        am = adj_matrix(fnm, bonds, None)
+        bok_cx = bok_cx(am, atoms[i])
+
+        print(f'{i} adjacency matrix\n{am}')
+        print(f'{i} bag of k_cx\n{bok_cx}')
diff --git a/ml_exp/adj_matrix.py b/ml_exp/adj_matrix.py
index 67eab79cd..de484b5be 100644
--- a/ml_exp/adj_matrix.py
+++ b/ml_exp/adj_matrix.py
@@ -33,6 +33,8 @@ def fneig_matrix(atoms,
     Creates the first neighbor matrix of the given molecule data.
     atoms: list of atoms.
     xyz: matrix of atomic coords.
+    nc: nuclear charge info.
+    use_forces: if the use of forces instead of k_cx should be used.
     NOTE: Bond distance of carbon to other elements
         are (for atoms present in the qm7 dataset):
             C: 1.20 - 1.53 A
@@ -71,7 +73,7 @@ def fneig_matrix(atoms,
                         if use_forces:
                             forces.append(rv*nc[i]*nc[j]/r**3)
                 elif (ch_bond == bond) and (r >= 1.06 and r <= 1.12):
-                    fnm[i, j] = 0.5
+                    fnm[i, j] = 1.0
                     if j > i:
                         bonds.append((i, j))
                         if use_forces:
@@ -99,15 +101,24 @@ def fneig_matrix(atoms,
     return fnm, bonds
 
 
-def adj_matrix(bonds,
-               forces=None):
+def adj_matrix(fneig_matrix,
+               bonds,
+               forces=None,
+               max_len=25):
     """
     Calculates the adjacency matrix given the bond list.
     bonds: list of bonds (tuple of indexes).
+    max_len: maximum amount of atoms in molecule.
     forces: list of forces.
     """
     n = len(bonds)
-    am = array(zeros((n, n)))
+
+    if max_len < n:
+        print(''.join(['Error. Molecule matrix dimension (mol_n) is ',
+                       'greater than max_len. Using mol_n.']))
+        max_len = n
+
+    am = array(zeros((max_len, max_len)))
     for i, bond_i in enumerate(bonds):
         for j, bond_j in enumerate(bonds):
             # Ignore the diagonal.
@@ -116,5 +127,5 @@ def adj_matrix(bonds,
                     if forces:
                         am[i, j] = dot(forces[i], forces[j])
                     else:
-                        am[i, j] = 1
+                        am[i, j] = fneig_matrix[bond_i[0], bond_i[1]]
     return am
diff --git a/ml_exp/bostuff.py b/ml_exp/bostuff.py
index 48cd14913..3e3f65e66 100644
--- a/ml_exp/bostuff.py
+++ b/ml_exp/bostuff.py
@@ -20,3 +20,80 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 """
+from numpy import array, zeros
+from collections import Counter
+from ml_exp.bob import check_bond
+
+
+def bok_cx(adj_matrix,
+           atoms,
+           max_n=25,
+           max_bond_len=325):
+    """
+    Creates the bag of k_cx using the adjacency matrix data.
+    adj_matrix: adjacency matrix.
+    max_n: maximum amount of atoms.
+    max_bond_len: maximum amount of bonds in molecule.
+    """
+    n = len(atoms)
+    bond_n = (n * n - n) / 2 + n
+    n_r = range(n)
+
+    if max_n < n:
+        print(''.join(['Error. Molecule matrix dimension (mol_n) is ',
+                       'greater than max_len. Using mol_n.']))
+        max_n = n
+
+    if max_bond_len < bond_n:
+        print(''.join(['Error. Molecule bond lenght (bond_n) is ',
+                       'greater than max_bond_len. Using bond_n.']))
+        max_bond_len = bond_n
+
+    # List where each bag data is stored.
+    bags = []
+    for i in n_r:
+        for j in n_r:
+            # Work only in the upper triangle of the coulomb matrix.
+            if j >= i:
+                # Get the string of the current bond.
+                if i == j:
+                    current_bond = atoms[i]
+                else:
+                    current_bond = ''.join(sorted([atoms[i], atoms[j]]))
+
+                # Check if that bond is already in a bag.
+                checker = check_bond(bags, current_bond)
+                # Either create a new bag or add values to an existing one.
+                if not checker[0]:
+                    bags.append([current_bond, adj_matrix[i, j]])
+                else:
+                    bags[checker[1]].append(adj_matrix[i, j])
+
+    # Create the actual bond list ordered.
+    atom_counter = Counter(atoms)
+    atom_list = sorted(list(set(atoms)))
+    bonds = []
+    for i, a_i in enumerate(atom_list):
+        if atom_counter[a_i] > 1:
+            for a_j in atom_list[i:]:
+                bonds.append(''.join(sorted([a_i, a_j])))
+    bonds = atom_list + bonds
+
+    # Create the final vector for the bok_cx.
+    bok_cx = array(zeros(max_bond_len), dtype=float)
+    c_i = 0
+    for i, bond in enumerate(bonds):
+        checker = check_bond(bags, bond)
+        if checker[0]:
+            for j, num in enumerate(sorted(bags[checker[1]][1:])[::-1]):
+                # Use c_i as the index for bok_cx if the zero padding should
+                # be at the end of the vector instead of between each bond.
+                bok_cx[i*max_n + j] = num
+                c_i += 1
+        # This is set to false because this was a debugging measure.
+        else:
+            print(''.join([f'Error. Bond {bond} from bond list coudn\'t',
+                           ' be found in the bags list. This could be',
+                           ' a case where the atom is only present once',
+                           ' in the molecule.']))
+    return bok_cx
-- 
cgit v1.2.3-70-g09d2