From c911e871d0644829122b260572a5fe06823c2142 Mon Sep 17 00:00:00 2001 From: David Luevano Alvarado <55825613+luevano@users.noreply.github.com> Date: Thu, 26 Mar 2020 19:49:49 -0700 Subject: Rewrite bob --- ml_exp/__init__.py | 3 +- ml_exp/compound.py | 10 ++-- ml_exp/representations.py | 116 +++++++++++++++++----------------------------- 3 files changed, 50 insertions(+), 79 deletions(-) diff --git a/ml_exp/__init__.py b/ml_exp/__init__.py index cb072d673..6eb47425f 100644 --- a/ml_exp/__init__.py +++ b/ml_exp/__init__.py @@ -22,7 +22,7 @@ SOFTWARE. """ from ml_exp.compound import Compound from ml_exp.representations import coulomb_matrix, lennard_jones_matrix,\ - get_helping_data, adjacency_matrix, epsilon_index, check_bond, bag_of_bonds + get_helping_data, adjacency_matrix, epsilon_index, bag_of_bonds from ml_exp.readdb import qm7db, qm9db from ml_exp.data import NUCLEAR_CHARGE, POSSIBLE_BONDS from ml_exp.kernels import gaussian_kernel, laplacian_kernel, wasserstein_kernel @@ -34,7 +34,6 @@ __all__ = ['Compound', 'get_helping_data', 'adjacency_matrix', 'epsilon_index', - 'check_bond', 'bag_of_bonds', 'qm7db', 'qm9db', diff --git a/ml_exp/compound.py b/ml_exp/compound.py index a85daefa4..6ace7738a 100644 --- a/ml_exp/compound.py +++ b/ml_exp/compound.py @@ -176,14 +176,18 @@ class Compound: size=size) def gen_bob(self, - size=23): + sort=False, + acount={'C':7, 'H':16, 'N':3, 'O':3, 'S':1}): """ Generate the Bag of Bonds for the compound. - size: compound size. + sort: if the representation should be sorted bag-wise. + acount: atom count for the compound. + NOTE: 'cm' shouldn't be sorted by row-norm since 'atoms' isn't (sorted). """ self.bob = bag_of_bonds(self.cm, self.atoms, - size=size) + sort=sort, + acount=acount) def read_xyz(self, filename, diff --git a/ml_exp/representations.py b/ml_exp/representations.py index cd91fdd1b..2943c39c4 100644 --- a/ml_exp/representations.py +++ b/ml_exp/representations.py @@ -307,96 +307,64 @@ the current compound.') return ei -def check_bond(bags, - bond): - """ - Checks if a bond is in a bag. - bags: list of bags, containing a bag per entry, which in turn - contains a list of bond-values. - bond: bond to check. - """ - if bags == []: - return False, None - - for i, bag in enumerate(bags): - if bag[0] == bond: - return True, i - - return False, None - - def bag_of_bonds(cm, atoms, - size=23): + sort=False, + acount={'C':7, 'H':16, 'N':3, 'O':3, 'S':1}): """ Creates the Bag of Bonds using the Coulomb Matrix. cm: coulomb matrix. atoms: list of atoms. - size: compound size. + sort: if the representation should be sorted bag-wise. + acount: atom count for the compound. + NOTE: 'cm' shouldn't be sorted by row-norm since 'atoms' isn't (sorted). """ if cm is None: raise ValueError('Coulomb Matrix hasn\'t been initialized for the \ current compound.') - if cm.ndim == 1: - raise ValueError('Coulomb Matrix (CM) dimension is 1. Maybe it was \ -generated as the vector of eigenvalues, try (re-)generating the CM.') - - n = len(atoms) - - if size < n: - print('Error. Compound size (n) is greater than (size). Using (n)', - 'instead of (size).') - size = n - - # Bond max length, calculated using only the upper triangular matrix. - bond_size = np.int32((size * size - size)/2 + size) - - # List where each bag data is stored. - bags = [] + if cm.ndim == 1 and cm.shape[0] < 30: + raise ValueError('CM was generated as the vector of eigenvalues. \ +Use non-eigenvalue representation.') + + # Base bags. + ackeys = list(acount.keys()) + bags = dict() + for i, atom_i in enumerate(ackeys): + for j, atom_j in enumerate(ackeys[i:]): + # Add current bond to bags. + if j == 0: + bags[atom_i] = [acount[atom_i], []] + if acount[atom_i] > 1: + bsize = np.int32((acount[atom_i]**2 - acount[atom_i])/2) + bags[''.join(sorted([atom_i, atom_j]))] = [bsize, []] + else: + bags[''.join(sorted([atom_i, atom_j]))] = [acount[atom_i] * + acount[atom_j], []] + + bond_size = 0 + for b in bags.keys(): + bond_size += bags[b][0] + + # Adding actual values to the bags. for i, atom_i in enumerate(atoms): for j, atom_j in enumerate(atoms): - # Work only in the upper triangle of the coulomb matrix. + # Operate on the upper triangular matrix and get the current bond. if j >= i: - # Get the string of the current bond. - if i == j: - current_bond = atom_i + if j == i: + bag = atom_i else: - current_bond = ''.join(sorted([atom_i, atom_j])) + bag = ''.join(sorted([atom_i, atom_j])) - # Check if that bond is already in a bag. - checker = check_bond(bags, current_bond) - # Either create a new bag or add values to an existing one. - if not checker[0]: - bags.append([current_bond, cm[i, j]]) - else: - bags[checker[1]].append(cm[i, j]) + bags[bag][1].append(cm[i, j]) - # Create the actual bond list ordered. - atom_counter = Counter(atoms) - atom_list = sorted(list(set(atoms))) - bonds = [] - for i, a_i in enumerate(atom_list): - if atom_counter[a_i] > 1: - for a_j in atom_list[i:]: - bonds.append(''.join(sorted([a_i, a_j]))) - bonds = atom_list + bonds - - # Create the final vector for the bob. - bob = np.zeros(bond_size, dtype=np.float64) - c_i = 0 - for i, bond in enumerate(bonds): - checker = check_bond(bags, bond) - if checker[0]: - for j, num in enumerate(sorted(bags[checker[1]][1:])[::-1]): - # Use c_i as the index for bob if the zero padding should - # be at the end of the vector instead of between each bond. - # bob[i*size + j] = num - bob[c_i] = num - c_i += 1 + # Change to a numpy array and add padding. + for bag in bags.keys(): + if sort: + b = np.sort(np.array(bags[bag][1]))[::-1] else: - print(f'Error. Bond {bond} from bond list coudn\'t be found', - 'in the bags list. This could be a case where the atom', - 'is only present oncce in the molecule.') + b = np.array(bags[bag][1]) + b = np.pad(b, (0, bags[bag][0] - b.shape[0]), 'constant') + bags[bag][1] = b - return bob + return np.concatenate([bags[bag][1] for bag in bags.keys()]) -- cgit v1.2.3