summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author David Luevano Alvarado <55825613+luevano@users.noreply.github.com> 2020-03-26 19:49:49 -0700
committer David Luevano Alvarado <55825613+luevano@users.noreply.github.com> 2020-03-26 19:49:49 -0700
commit c911e871d0644829122b260572a5fe06823c2142 (patch)
tree 0193eddb8567369b2f9eda6c15626e4064bd2cf9
parent c09a8dcecbb11de020788be87a9890c25d1b1861 (diff)
Rewrite bob
-rw-r--r-- ml_exp/__init__.py 3
-rw-r--r-- ml_exp/compound.py 10
-rw-r--r-- ml_exp/representations.py 116
3 files changed, 50 insertions, 79 deletions
diff --git a/ml_exp/__init__.py b/ml_exp/__init__.py
index cb072d673..6eb47425f 100644
--- a/ml_exp/__init__.py
+++ b/ml_exp/__init__.py
@@ -22,7 +22,7 @@ SOFTWARE.
"""
from ml_exp.compound import Compound
from ml_exp.representations import coulomb_matrix, lennard_jones_matrix,\
- get_helping_data, adjacency_matrix, epsilon_index, check_bond, bag_of_bonds
+ get_helping_data, adjacency_matrix, epsilon_index, bag_of_bonds
from ml_exp.readdb import qm7db, qm9db
from ml_exp.data import NUCLEAR_CHARGE, POSSIBLE_BONDS
from ml_exp.kernels import gaussian_kernel, laplacian_kernel, wasserstein_kernel
@@ -34,7 +34,6 @@ __all__ = ['Compound',
'get_helping_data',
'adjacency_matrix',
'epsilon_index',
- 'check_bond',
'bag_of_bonds',
'qm7db',
'qm9db',
diff --git a/ml_exp/compound.py b/ml_exp/compound.py
index a85daefa4..6ace7738a 100644
--- a/ml_exp/compound.py
+++ b/ml_exp/compound.py
@@ -176,14 +176,18 @@ class Compound:
size=size)
def gen_bob(self,
- size=23):
+ sort=False,
+ acount={'C':7, 'H':16, 'N':3, 'O':3, 'S':1}):
"""
Generate the Bag of Bonds for the compound.
- size: compound size.
+ sort: if the representation should be sorted bag-wise.
+ acount: atom count for the compound.
+ NOTE: 'cm' shouldn't be sorted by row-norm since 'atoms' isn't (sorted).
"""
self.bob = bag_of_bonds(self.cm,
self.atoms,
- size=size)
+ sort=sort,
+ acount=acount)
def read_xyz(self,
filename,
diff --git a/ml_exp/representations.py b/ml_exp/representations.py
index cd91fdd1b..2943c39c4 100644
--- a/ml_exp/representations.py
+++ b/ml_exp/representations.py
@@ -307,96 +307,64 @@ the current compound.')
return ei
-def check_bond(bags,
- bond):
- """
- Checks if a bond is in a bag.
- bags: list of bags, containing a bag per entry, which in turn
- contains a list of bond-values.
- bond: bond to check.
- """
- if bags == []:
- return False, None
-
- for i, bag in enumerate(bags):
- if bag[0] == bond:
- return True, i
-
- return False, None
-
-
def bag_of_bonds(cm,
atoms,
- size=23):
+ sort=False,
+ acount={'C':7, 'H':16, 'N':3, 'O':3, 'S':1}):
"""
Creates the Bag of Bonds using the Coulomb Matrix.
cm: coulomb matrix.
atoms: list of atoms.
- size: compound size.
+ sort: if the representation should be sorted bag-wise.
+ acount: atom count for the compound.
+ NOTE: 'cm' shouldn't be sorted by row-norm since 'atoms' isn't (sorted).
"""
if cm is None:
raise ValueError('Coulomb Matrix hasn\'t been initialized for the \
current compound.')
- if cm.ndim == 1:
- raise ValueError('Coulomb Matrix (CM) dimension is 1. Maybe it was \
-generated as the vector of eigenvalues, try (re-)generating the CM.')
-
- n = len(atoms)
-
- if size < n:
- print('Error. Compound size (n) is greater than (size). Using (n)',
- 'instead of (size).')
- size = n
-
- # Bond max length, calculated using only the upper triangular matrix.
- bond_size = np.int32((size * size - size)/2 + size)
-
- # List where each bag data is stored.
- bags = []
+ if cm.ndim == 1 and cm.shape[0] < 30:
+ raise ValueError('CM was generated as the vector of eigenvalues. \
+Use non-eigenvalue representation.')
+
+ # Base bags.
+ ackeys = list(acount.keys())
+ bags = dict()
+ for i, atom_i in enumerate(ackeys):
+ for j, atom_j in enumerate(ackeys[i:]):
+ # Add current bond to bags.
+ if j == 0:
+ bags[atom_i] = [acount[atom_i], []]
+ if acount[atom_i] > 1:
+ bsize = np.int32((acount[atom_i]**2 - acount[atom_i])/2)
+ bags[''.join(sorted([atom_i, atom_j]))] = [bsize, []]
+ else:
+ bags[''.join(sorted([atom_i, atom_j]))] = [acount[atom_i] *
+ acount[atom_j], []]
+
+ bond_size = 0
+ for b in bags.keys():
+ bond_size += bags[b][0]
+
+ # Adding actual values to the bags.
for i, atom_i in enumerate(atoms):
for j, atom_j in enumerate(atoms):
- # Work only in the upper triangle of the coulomb matrix.
+ # Operate on the upper triangular matrix and get the current bond.
if j >= i:
- # Get the string of the current bond.
- if i == j:
- current_bond = atom_i
+ if j == i:
+ bag = atom_i
else:
- current_bond = ''.join(sorted([atom_i, atom_j]))
+ bag = ''.join(sorted([atom_i, atom_j]))
- # Check if that bond is already in a bag.
- checker = check_bond(bags, current_bond)
- # Either create a new bag or add values to an existing one.
- if not checker[0]:
- bags.append([current_bond, cm[i, j]])
- else:
- bags[checker[1]].append(cm[i, j])
+ bags[bag][1].append(cm[i, j])
- # Create the actual bond list ordered.
- atom_counter = Counter(atoms)
- atom_list = sorted(list(set(atoms)))
- bonds = []
- for i, a_i in enumerate(atom_list):
- if atom_counter[a_i] > 1:
- for a_j in atom_list[i:]:
- bonds.append(''.join(sorted([a_i, a_j])))
- bonds = atom_list + bonds
-
- # Create the final vector for the bob.
- bob = np.zeros(bond_size, dtype=np.float64)
- c_i = 0
- for i, bond in enumerate(bonds):
- checker = check_bond(bags, bond)
- if checker[0]:
- for j, num in enumerate(sorted(bags[checker[1]][1:])[::-1]):
- # Use c_i as the index for bob if the zero padding should
- # be at the end of the vector instead of between each bond.
- # bob[i*size + j] = num
- bob[c_i] = num
- c_i += 1
+ # Change to a numpy array and add padding.
+ for bag in bags.keys():
+ if sort:
+ b = np.sort(np.array(bags[bag][1]))[::-1]
else:
- print(f'Error. Bond {bond} from bond list coudn\'t be found',
- 'in the bags list. This could be a case where the atom',
- 'is only present oncce in the molecule.')
+ b = np.array(bags[bag][1])
+ b = np.pad(b, (0, bags[bag][0] - b.shape[0]), 'constant')
+ bags[bag][1] = b
- return bob
+ return np.concatenate([bags[bag][1] for bag in bags.keys()])