summaryrefslogtreecommitdiff
path: root/ml_exp/representations.py
diff options
context:
space:
mode:
authorDavid Luevano Alvarado <55825613+luevano@users.noreply.github.com>2020-02-28 06:48:42 -0700
committerGitHub <noreply@github.com>2020-02-28 06:48:42 -0700
commit86faabeac5aed75c1dfc883fe757c21eba94b645 (patch)
tree463515502ff326afa8b0dfeeaa15e3ba00ee08b2 /ml_exp/representations.py
parent7920c4b8471741cb9b5a9aa06da436c1b678f62a (diff)
parent6ad25bb4cd5822fa84d1c4cec54952c6fd8df2f6 (diff)
Merge pull request #2 from luevano/rewrite
Rewrite
Diffstat (limited to 'ml_exp/representations.py')
-rw-r--r--ml_exp/representations.py384
1 files changed, 384 insertions, 0 deletions
diff --git a/ml_exp/representations.py b/ml_exp/representations.py
new file mode 100644
index 000000000..ea079595e
--- /dev/null
+++ b/ml_exp/representations.py
@@ -0,0 +1,384 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+import numpy as np
+from collections import Counter
+
+
+def coulomb_matrix(coords,
+ nc,
+ size=23,
+ as_eig=True,
+ bohr_ru=False):
+ """
+ Creates the Coulomb Matrix from the molecule data given.
+ coords: compound coordinates.
+ nc: nuclear charge data.
+ size: compound size.
+ as_eig: if the representation should be as the eigenvalues.
+ bohr_ru: if radius units should be in bohr's radius units.
+ """
+ if bohr_ru:
+ cr = 0.52917721067
+ else:
+ cr = 1.0
+
+ n = coords.shape[0]
+
+ if not n == nc.shape[0]:
+ raise ValueError('Compound size is different than the nuclear charge \
+size. Arrays are not of the right shape.')
+
+ if size < n:
+ print('Error. Compound size (n) is greater han (size). Using (n)',
+ 'instead of (size).')
+ size = n
+
+ cm = np.zeros((size, size), dtype=float)
+
+ # Actual calculation of the coulomb matrix.
+ for i, xyz_i in enumerate(coords):
+ for j, xyz_j in enumerate(coords):
+ rv = xyz_i - xyz_j
+ r = np.linalg.norm(rv)/cr
+ if i == j:
+ cm[i, j] = (0.5*nc[i]**2.4)
+ else:
+ cm[i, j] = nc[i]*nc[j]/r
+
+ # Now the value will be returned.
+ if as_eig:
+ cm_sorted = np.sort(np.linalg.eig(cm)[0])[::-1]
+ # Thanks to SO for the following lines of code.
+ # https://stackoverflow.com/a/43011036
+
+ # Keep zeros at the end.
+ mask = cm_sorted != 0.
+ f_mask = mask.sum(0, keepdims=1) >\
+ np.arange(cm_sorted.shape[0]-1, -1, -1)
+
+ f_mask = f_mask[::-1]
+ cm_sorted[f_mask] = cm_sorted[mask]
+ cm_sorted[~f_mask] = 0.
+
+ return cm_sorted
+ else:
+ return cm
+
+
+def lennard_jones_matrix(coords,
+ nc,
+ diag_value=None,
+ sigma=1.0,
+ epsilon=1.0,
+ size=23,
+ as_eig=True,
+ bohr_ru=False):
+ """
+ Creates the Lennard-Jones Matrix from the molecule data given.
+ coords: compound coordinates.
+ nc: nuclear charge data.
+ diag_value: if special diagonal value is to be used.
+ sigma: sigma value.
+ epsilon: epsilon value.
+ size: compound size.
+ as_eig: if the representation should be as the eigenvalues.
+ bohr_ru: if radius units should be in bohr's radius units.
+ """
+ if bohr_ru:
+ cr = 0.52917721067
+ else:
+ cr = 1.0
+
+ n = coords.shape[0]
+
+ if not n == nc.shape[0]:
+ raise ValueError('Compound size is different than the nuclear charge \
+size. Arrays are not of the right shape.')
+
+ if size < n:
+ print('Error. Compound size (n) is greater han (size). Using (n)',
+ 'instead of (size).')
+ size = n
+
+ lj = np.zeros((size, size), dtype=float)
+
+ # Actual calculation of the lennard-jones matrix.
+ for i, xyz_i in enumerate(coords):
+ for j, xyz_j in enumerate(coords):
+ if i == j:
+ if diag_value is None:
+ lj[i, j] = (0.5*nc[i]**2.4)
+ else:
+ lj[i, j] = diag_value
+ else:
+ # Calculations are done after i==j is checked
+ # so no division by zero is done.
+
+ # A little play with r exponents
+ # so no square root is calculated.
+ # Conversion factor is included in r^2.
+ rv = xyz_i - xyz_j
+ r = np.linalg.norm(rv)/cr
+
+ # 1/r^n
+ r_2 = sigma**2/r**2
+ r_6 = r_2**3
+ r_12 = r_6**2
+ lj[i, j] = (4*epsilon*(r_12 - r_6))
+
+ # Now the value will be returned.
+ if as_eig:
+ lj_sorted = np.sort(np.linalg.eig(lj)[0])[::-1]
+ # Thanks to SO for the following lines of code.
+ # https://stackoverflow.com/a/43011036
+
+ # Keep zeros at the end.
+ mask = lj_sorted != 0.
+ f_mask = mask.sum(0, keepdims=1) >\
+ np.arange(lj_sorted.shape[0]-1, -1, -1)
+
+ f_mask = f_mask[::-1]
+ lj_sorted[f_mask] = lj_sorted[mask]
+ lj_sorted[~f_mask] = 0.
+
+ return lj_sorted
+ else:
+ return lj
+
+
+def first_neighbor_matrix(coords,
+ nc,
+ atoms,
+ size=23,
+ use_forces=False,
+ bohr_ru=False):
+ """
+ Creates the First Neighbor Matrix from the molecule data given.
+ coords: compound coordinates.
+ nc: nuclear charge data.
+ atoms: list of atoms.
+ use_forces: if the use of forces instead of k_cx should be used.
+ bohr_ru: if radius units should be in bohr's radius units.
+ NOTE: Bond distance of carbon to other elements
+ are (for atoms present in the qm7 dataset):
+ C: 1.20 - 1.54 A (Edited to 1.19 - 1.54 A)
+ H: 1.06 - 1.12 A
+ O: 1.43 - 2.15 A
+ N: 1.47 - 2.10 A
+ S: 1.81 - 2.55 A
+ """
+ if bohr_ru:
+ cr = 0.52917721067
+ else:
+ cr = 1.0
+
+ n = coords.shape[0]
+
+ if not n == nc.shape[0]:
+ raise ValueError('Compound size is different than the nuclear charge \
+size. Arrays are not of the right shape.')
+
+ if size < n:
+ print('Error. Compound size (n) is greater han (size). Using (n)',
+ 'instead of (size).')
+ size = n
+
+ # Possible bonds.
+ cc_bond = sorted(['C', 'C'])
+ ch_bond = sorted(['C', 'H'])
+ co_bond = sorted(['C', 'O'])
+ cn_bond = sorted(['C', 'N'])
+ cs_bond = sorted(['C', 'S'])
+
+ fnm = np.zeros((size, size), dtype=float)
+
+ bonds = []
+ forces = []
+ for i, xyz_i in enumerate(coords):
+ for j, xyz_j in enumerate(coords):
+ # Ignore the diagonal.
+ if i != j:
+ bond = sorted([atoms[i], atoms[j]])
+ rv = xyz_i - xyz_j
+ r = np.linalg.norm(rv)/cr
+
+ # Check for each type of bond.
+ if (cc_bond == bond) and (r >= 1.19 and r <= 1.54):
+ fnm[i, j] = 1.0
+ if j > i:
+ bonds.append((i, j))
+ if use_forces:
+ forces.append(rv*nc[i]*nc[j]/r**3)
+ elif (ch_bond == bond) and (r >= 1.06 and r <= 1.12):
+ fnm[i, j] = 1.0
+ if j > i:
+ bonds.append((i, j))
+ if use_forces:
+ forces.append(rv*nc[i]*nc[j]/r**3)
+ elif (co_bond == bond) and (r >= 1.43 and r <= 2.15):
+ fnm[i, j] = 0.8
+ if j > i:
+ bonds.append((i, j))
+ if use_forces:
+ forces.append(rv*nc[i]*nc[j]/r**3)
+ elif (cn_bond == bond) and (r >= 1.47 and r <= 2.10):
+ fnm[i, j] = 1.0
+ if j > i:
+ bonds.append((i, j))
+ if use_forces:
+ forces.append(rv*nc[i]*nc[j]/r**3)
+ elif (cs_bond == bond) and (r >= 1.81 and r <= 2.55):
+ fnm[i, j] = 0.7
+ if j > i:
+ bonds.append((i, j))
+ if use_forces:
+ forces.append(rv*nc[i]*nc[j]/r**3)
+
+ return fnm, bonds, forces
+
+
+def adjacency_matrix(fnm,
+ bonds,
+ forces,
+ size=22):
+ """
+ Calculates the adjacency matrix given the bond list.
+ fnm: first neighbour matrix.
+ bonds: list of bonds (tuple of indexes).
+ forces: list of forces.
+ size: compund size.
+ """
+ n = len(bonds)
+
+ if size < n:
+ print('Error. Compound size (n) is greater han (size). Using (n)\
+ instead of (size).')
+ size = n
+
+ am = np.zeros((size, size), dtype=float)
+
+ for i, bond_i in enumerate(bonds):
+ for j, bond_j in enumerate(bonds):
+ # Ignore the diagonal.
+ if i != j:
+ if (bond_i[0] in bond_j) or (bond_i[1] in bond_j):
+ if forces:
+ am[i, j] = np.dot(forces[i], forces[j])
+ else:
+ am[i, j] = fnm[bond_i[0], bond_i[1]]
+
+ return am
+
+
+def check_bond(bags,
+ bond):
+ """
+ Checks if a bond is in a bag.
+ bags: list of bags, containing a bag per entry, which in turn
+ contains a list of bond-values.
+ bond: bond to check.
+ """
+ if bags == []:
+ return False, None
+
+ for i, bag in enumerate(bags):
+ if bag[0] == bond:
+ return True, i
+
+ return False, None
+
+
+def bag_of_stuff(cm,
+ atoms,
+ size=23,
+ stuff='bonds'):
+ """
+ Creates the Bag of Bonds using the Coulomb Matrix.
+ cm: coulomb matrix.
+ atoms: list of atoms.
+ size: compound size.
+ """
+ if cm is None:
+ raise ValueError('Coulomb Matrix hasn\'t been initialized for the \
+current compound.')
+
+ if cm.ndim == 1:
+ raise ValueError('Coulomb Matrix (CM) dimension is 1. Maybe it was \
+generated as the vector of eigenvalues, try (re-)generating the CM.')
+
+ n = len(atoms)
+
+ if size < n:
+ print('Error. Compound size (n) is greater han (size). Using (n)',
+ 'instead of (size).')
+ size = n
+
+ # Bond max length, calculated using only the upper triangular matrix.
+ bond_size = int((size * size - size)/2 + size)
+
+ # List where each bag data is stored.
+ bags = []
+ for i, atom_i in enumerate(atoms):
+ for j, atom_j in enumerate(atoms):
+ # Work only in the upper triangle of the coulomb matrix.
+ if j >= i:
+ # Get the string of the current bond.
+ if i == j:
+ current_bond = atom_i
+ else:
+ current_bond = ''.join(sorted([atom_i, atom_j]))
+
+ # Check if that bond is already in a bag.
+ checker = check_bond(bags, current_bond)
+ # Either create a new bag or add values to an existing one.
+ if not checker[0]:
+ bags.append([current_bond, cm[i, j]])
+ else:
+ bags[checker[1]].append(cm[i, j])
+
+ # Create the actual bond list ordered.
+ atom_counter = Counter(atoms)
+ atom_list = sorted(list(set(atoms)))
+ bonds = []
+ for i, a_i in enumerate(atom_list):
+ if atom_counter[a_i] > 1:
+ for a_j in atom_list[i:]:
+ bonds.append(''.join(sorted([a_i, a_j])))
+ bonds = atom_list + bonds
+
+ # Create the final vector for the bos.
+ bos = np.zeros(bond_size, dtype=float)
+ c_i = 0
+ for i, bond in enumerate(bonds):
+ checker = check_bond(bags, bond)
+ if checker[0]:
+ for j, num in enumerate(sorted(bags[checker[1]][1:])[::-1]):
+ # Use c_i as the index for bos if the zero padding should
+ # be at the end of the vector instead of between each bond.
+ bos[i*size + j] = num
+ c_i += 1
+ else:
+ print(f'Error. Bond {bond} from bond list coudn\'t be found',
+ 'in the bags list. This could be a case where the atom',
+ 'is only present oncce in the molecule.')
+ return bos