From c911e871d0644829122b260572a5fe06823c2142 Mon Sep 17 00:00:00 2001
From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com>
Date: Thu, 26 Mar 2020 19:49:49 -0700
Subject: Rewrite bag of bonds (bob) representation

---
 ml_exp/__init__.py        |   3 +-
 ml_exp/compound.py        |  10 ++--
 ml_exp/representations.py | 116 +++++++++++++++++-----------------------------
 3 files changed, 50 insertions(+), 79 deletions(-)

diff --git a/ml_exp/__init__.py b/ml_exp/__init__.py
index cb072d673..6eb47425f 100644
--- a/ml_exp/__init__.py
+++ b/ml_exp/__init__.py
@@ -22,7 +22,7 @@ SOFTWARE.
 """
 from ml_exp.compound import Compound
 from ml_exp.representations import coulomb_matrix, lennard_jones_matrix,\
-        get_helping_data, adjacency_matrix, epsilon_index, check_bond, bag_of_bonds
+        get_helping_data, adjacency_matrix, epsilon_index, bag_of_bonds
 from ml_exp.readdb import qm7db, qm9db
 from ml_exp.data import NUCLEAR_CHARGE, POSSIBLE_BONDS
 from ml_exp.kernels import gaussian_kernel, laplacian_kernel, wasserstein_kernel
@@ -34,7 +34,6 @@ __all__ = ['Compound',
            'get_helping_data',
            'adjacency_matrix',
            'epsilon_index',
-           'check_bond',
            'bag_of_bonds',
            'qm7db',
            'qm9db',
diff --git a/ml_exp/compound.py b/ml_exp/compound.py
index a85daefa4..6ace7738a 100644
--- a/ml_exp/compound.py
+++ b/ml_exp/compound.py
@@ -176,14 +176,18 @@ class Compound:
                                 size=size)
 
     def gen_bob(self,
-                size=23):
+                sort=False,
+                acount={'C':7, 'H':16, 'N':3, 'O':3, 'S':1}):
         """
         Generate the Bag of Bonds for the compound.
-        size: compound size.
+        sort: whether the values in each bag are sorted (descending).
+        acount: atoms per element; sets the (zero-padded) size of each bag.
+        NOTE: 'cm' must not be row-norm sorted, as 'atoms' keeps its order.
         """
         self.bob = bag_of_bonds(self.cm,
                                 self.atoms,
-                                size=size)
+                                sort=sort,
+                                acount=acount)
 
     def read_xyz(self,
                  filename,
diff --git a/ml_exp/representations.py b/ml_exp/representations.py
index cd91fdd1b..2943c39c4 100644
--- a/ml_exp/representations.py
+++ b/ml_exp/representations.py
@@ -307,96 +307,64 @@ the current compound.')
     return ei
 
 
-def check_bond(bags,
-               bond):
-    """
-    Checks if a bond is in a bag.
-    bags: list of bags, containing a bag per entry, which in turn
-        contains a list of bond-values.
-    bond: bond to check.
-    """
-    if bags == []:
-        return False, None
-
-    for i, bag in enumerate(bags):
-        if bag[0] == bond:
-            return True, i
-
-    return False, None
-
-
 def bag_of_bonds(cm,
                  atoms,
-                 size=23):
+                 sort=False,
+                 acount={'C':7, 'H':16, 'N':3, 'O':3, 'S':1}):
     """
     Creates the Bag of Bonds using the Coulomb Matrix.
     cm: coulomb matrix.
     atoms: list of atoms.
-    size: compound size.
+    sort: whether the values in each bag are sorted (descending).
+    acount: atoms per element; sets the (zero-padded) size of each bag.
+    NOTE: 'cm' must not be row-norm sorted, as 'atoms' keeps its order.
     """
     if cm is None:
         raise ValueError('Coulomb Matrix hasn\'t been initialized for the \
 current compound.')
 
-    if cm.ndim == 1:
-        raise ValueError('Coulomb Matrix (CM) dimension is 1. Maybe it was \
-generated as the vector of eigenvalues, try (re-)generating the CM.')
-
-    n = len(atoms)
-
-    if size < n:
-        print('Error. Compound size (n) is greater than (size). Using (n)',
-              'instead of (size).')
-        size = n
-
-    # Bond max length, calculated using only the upper triangular matrix.
-    bond_size = np.int32((size * size - size)/2 + size)
-
-    # List where each bag data is stored.
-    bags = []
+    if cm.ndim == 1 and cm.shape[0] < 30:
+        raise ValueError('CM was generated as the vector of eigenvalues. \
+Use non-eigenvalue representation.')
+
+    # Base bags: one [size, values] entry per atom and per atom pair.
+    ackeys = list(acount.keys())
+    bags = dict()
+    for i, atom_i in enumerate(ackeys):
+        for j, atom_j in enumerate(ackeys[i:]):
+            # Add current bond to bags.
+            if j == 0:
+                bags[atom_i] = [acount[atom_i], []]
+                if acount[atom_i] > 1:
+                    bsize = np.int32((acount[atom_i]**2 - acount[atom_i])/2)
+                    bags[''.join(sorted([atom_i, atom_j]))] = [bsize, []]
+            else:
+                bags[''.join(sorted([atom_i, atom_j]))] = [acount[atom_i] *
+                                                           acount[atom_j], []]
+
+    bond_size = 0
+    for b in bags.keys():
+        bond_size += bags[b][0]
+
+    # Adding actual values to the bags.
     for i, atom_i in enumerate(atoms):
         for j, atom_j in enumerate(atoms):
-            # Work only in the upper triangle of the coulomb matrix.
+            # Use only the upper triangle (j >= i); get the current bond's bag.
             if j >= i:
-                # Get the string of the current bond.
-                if i == j:
-                    current_bond = atom_i
+                if j == i:
+                    bag = atom_i
                 else:
-                    current_bond = ''.join(sorted([atom_i, atom_j]))
+                    bag = ''.join(sorted([atom_i, atom_j]))
 
-                # Check if that bond is already in a bag.
-                checker = check_bond(bags, current_bond)
-                # Either create a new bag or add values to an existing one.
-                if not checker[0]:
-                    bags.append([current_bond, cm[i, j]])
-                else:
-                    bags[checker[1]].append(cm[i, j])
+                bags[bag][1].append(cm[i, j])
 
-    # Create the actual bond list ordered.
-    atom_counter = Counter(atoms)
-    atom_list = sorted(list(set(atoms)))
-    bonds = []
-    for i, a_i in enumerate(atom_list):
-        if atom_counter[a_i] > 1:
-            for a_j in atom_list[i:]:
-                bonds.append(''.join(sorted([a_i, a_j])))
-    bonds = atom_list + bonds
-
-    # Create the final vector for the bob.
-    bob = np.zeros(bond_size, dtype=np.float64)
-    c_i = 0
-    for i, bond in enumerate(bonds):
-        checker = check_bond(bags, bond)
-        if checker[0]:
-            for j, num in enumerate(sorted(bags[checker[1]][1:])[::-1]):
-                # Use c_i as the index for bob if the zero padding should
-                # be at the end of the vector instead of between each bond.
-                # bob[i*size + j] = num
-                bob[c_i] = num
-                c_i += 1
+    # Convert each bag to a numpy array and zero-pad it to its size.
+    for bag in bags.keys():
+        if sort:
+            b = np.sort(np.array(bags[bag][1]))[::-1]
         else:
-            print(f'Error. Bond {bond} from bond list coudn\'t be found',
-                  'in the bags list. This could be a case where the atom',
-                  'is only present oncce in the molecule.')
+            b = np.array(bags[bag][1])
+        b = np.pad(b, (0, bags[bag][0] - b.shape[0]), 'constant')
+        bags[bag][1] = b
 
-    return bob
+    return np.concatenate([bags[bag][1] for bag in bags.keys()])
-- 
cgit v1.2.3-70-g09d2