author    David Luevano Alvarado <55825613+luevano@users.noreply.github.com>  2020-03-02 12:02:52 -0700
committer GitHub <noreply@github.com>  2020-03-02 12:02:52 -0700
commit    1628e8c079c7c6c68c47a29347d2cfabd5abaa9f (patch)
tree      dd42fddfb03f1ff201d6f770bb92c3f32cae9329
parent    86faabeac5aed75c1dfc883fe757c21eba94b645 (diff)
parent    62cef5f1782caf4ff379cd021744b98be4be6c8b (diff)

Merge pull request #3 from luevano/addbags

Addbags
-rw-r--r--  ml_exp/__init__.py         |   6
-rw-r--r--  ml_exp/compound.py         |  82
-rw-r--r--  ml_exp/do_ml.py            |  60
-rw-r--r--  ml_exp/kernels.py          |  28
-rw-r--r--  ml_exp/math.py             |   4
-rw-r--r--  ml_exp/qm7db.py            |   8
-rw-r--r--  ml_exp/representations.py  | 163
7 files changed, 193 insertions, 158 deletions
diff --git a/ml_exp/__init__.py b/ml_exp/__init__.py
index 48ece56c8..4d672efd7 100644
--- a/ml_exp/__init__.py
+++ b/ml_exp/__init__.py
@@ -22,7 +22,7 @@ SOFTWARE.
"""
from ml_exp.compound import Compound
from ml_exp.representations import coulomb_matrix, lennard_jones_matrix,\
- first_neighbor_matrix, adjacency_matrix, check_bond, bag_of_stuff
+ get_helping_data, adjacency_matrix, check_bond, bag_of_bonds
from ml_exp.math import cholesky_solve
from ml_exp.qm7db import qm7db
from ml_exp.do_ml import simple_ml, do_ml
@@ -30,10 +30,10 @@ from ml_exp.do_ml import simple_ml, do_ml
__all__ = ['Compound',
'coulomb_matrix',
'lennard_jones_matrix',
- 'first_neighbor_matrix',
+ 'get_helping_data',
'adjacency_matrix',
'check_bond',
- 'bag_of_stuff',
+ 'bag_of_bonds',
'cholesky_solve',
'qm7db',
'simple_ml',
diff --git a/ml_exp/compound.py b/ml_exp/compound.py
index 1598fa482..6c8525195 100644
--- a/ml_exp/compound.py
+++ b/ml_exp/compound.py
@@ -23,7 +23,7 @@ SOFTWARE.
import numpy as np
from ml_exp.data import NUCLEAR_CHARGE
from ml_exp.representations import coulomb_matrix, lennard_jones_matrix,\
- first_neighbor_matrix, adjacency_matrix, bag_of_stuff
+ get_helping_data, adjacency_matrix, bag_of_bonds
class Compound:
@@ -33,20 +33,31 @@ class Compound:
Initialization of the Compound.
xyz: (path to) the xyz file.
"""
+ # General compound data.
self.name = None
-
self.n = None
- self.extra = None # In case a comment is in the compound file.
+ self.extra = None
self.atoms = None
- self.atoms_nc = None
+ self.nc = None
self.coordinates = None
self.pbe0 = None
self.delta = None
+ # Computed data.
self.cm = None
self.ljm = None
self.am = None
- self.bos = None
+ self.bob = None
+ self.bo_atoms = None
+ self.bok_cx = None
+ self.bof = None
+
+ # Helping data.
+ self.fnm = None
+ self.bonds = None
+ self.bonds_i = None
+ self.bonds_k = None
+ self.bonds_f = None
if xyz is not None:
self.read_xyz(xyz)
@@ -62,7 +73,7 @@ class Compound:
bohr_ru: if radius units should be in bohr's radius units.
"""
self.cm = coulomb_matrix(self.coordinates,
- self.atoms_nc,
+ self.nc,
size=size,
as_eig=as_eig,
bohr_ru=bohr_ru)
@@ -84,7 +95,7 @@ class Compound:
bohr_ru: if radius units should be in bohr's radius units.
"""
self.ljm = lennard_jones_matrix(self.coordinates,
- self.atoms_nc,
+ self.nc,
diag_value=diag_value,
sigma=sigma,
epsilon=epsilon,
@@ -92,42 +103,45 @@ class Compound:
as_eig=as_eig,
bohr_ru=bohr_ru)
- def gen_am(self,
- use_forces=False,
+ def gen_hd(self,
size=23,
bohr_ru=False):
"""
+ Generate the helping data for use in Adjacency Matrix, for example.
+ size: compound size.
+ bohr_ru: if radius units should be in bohr's radius units.
+ """
+ hd = get_helping_data(self.coordinates,
+ self.atoms,
+ self.nc,
+ size=size,
+ bohr_ru=bohr_ru)
+
+ self.fnm, self.bonds, self.bonds_i, self.bonds_k, self.bonds_f = hd
+
+ def gen_am(self,
+ use_forces=False,
+ size=23):
+ """
Generate the Adjacency Matrix for the compound.
use_forces: if the use of forces instead of k_cx should be used.
size: compound size.
- bohr_ru: if radius units should be in bohr's radius units.
"""
- # First, generate the first neighor matrix.
- fnm, bonds, forces = first_neighbor_matrix(self.coordinates,
- self.atoms_nc,
- self.atoms,
- size=size,
- use_forces=use_forces,
- bohr_ru=bohr_ru)
-
- # Now, generate the adjacency matrix.
- self.am = adjacency_matrix(fnm,
- bonds,
- forces,
+ self.am = adjacency_matrix(self.bonds_i,
+ self.bonds_k,
+ self.bonds_f,
+ use_forces=use_forces,
size=size)
- def gen_bos(self,
- size=23,
- stuff='bonds'):
+ def gen_bob(self,
+ size=23):
"""
Generate the Bag of Stuff for the compound.
size: compound size.
- stuff: elements of the bag, by default the known bag of bonds.
"""
- self.bos = bag_of_stuff(self.cm,
+ self.bob = bag_of_bonds(self.cm,
self.atoms,
- size=size,
- stuff=stuff)
+ size=size)
def read_xyz(self,
filename):
@@ -139,15 +153,15 @@ class Compound:
lines = f.readlines()
self.name = filename.split('/')[-1]
- self.n = int(lines[0])
+ self.n = np.int32(lines[0])
self.extra = lines[1]
self.atoms = []
- self.atoms_nc = np.empty(self.n, dtype=int)
- self.coordinates = np.empty((self.n, 3), dtype=float)
+ self.nc = np.empty(self.n, dtype=np.int64)
+ self.coordinates = np.empty((self.n, 3), dtype=np.float64)
for i, atom in enumerate(lines[2:self.n + 2]):
atom_d = atom.split()
self.atoms.append(atom_d[0])
- self.atoms_nc[i] = NUCLEAR_CHARGE[atom_d[0]]
- self.coordinates[i] = np.asarray(atom_d[1:4], dtype=float)
+ self.nc[i] = NUCLEAR_CHARGE[atom_d[0]]
+ self.coordinates[i] = np.asarray(atom_d[1:4], dtype=np.float64)
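
For reference, a minimal usage sketch of the reworked Compound flow after this change. The xyz path and the gen_cm method name are assumptions (gen_cm is inferred from the gen_hd/gen_am/gen_bob naming and the coulomb_matrix call above); note that gen_hd must now be called before gen_am, since the helping data is no longer computed inside gen_am.

from ml_exp.compound import Compound

# Hypothetical xyz path; any file in the QM7 xyz layout should work.
mol = Compound('data/molecule.xyz')

# Coulomb matrix first; bag_of_bonds expects the full matrix rather than
# the eigenvalue vector, so as_eig is left off here.
mol.gen_cm(size=23, as_eig=False)

# Helping data (fnm, bonds, bonds_i, bonds_k, bonds_f), then the
# adjacency matrix built from it.
mol.gen_hd(size=23, bohr_ru=False)
mol.gen_am(use_forces=False, size=23)

# Bag of Bonds from the Coulomb matrix and the atom list.
mol.gen_bob(size=23)
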
diff --git a/ml_exp/do_ml.py b/ml_exp/do_ml.py
index d8ee415bf..5efd13690 100644
--- a/ml_exp/do_ml.py
+++ b/ml_exp/do_ml.py
@@ -33,6 +33,7 @@ def simple_ml(descriptors,
training_size,
test_size=None,
sigma=1000.0,
+ opt=True,
identifier=None,
show_msgs=True):
"""
@@ -43,6 +44,7 @@ def simple_ml(descriptors,
test_size: size of the test set to use. If no size is given,
the last remaining molecules are used.
sigma: depth of the kernel.
+ opt: if the optimized algorithm should be used. For benchmarking purposes.
identifier: string with the name of the descriptor used.
show_msgs: if debug messages should be shown.
NOTE: identifier is just a string and is only for identification purposes.
@@ -76,13 +78,21 @@ def simple_ml(descriptors,
X_training = descriptors[:training_size]
Y_training = energies[:training_size]
- K_training = gaussian_kernel(X_training, X_training, sigma)
- alpha = cholesky_solve(K_training, Y_training)
+ K_training = gaussian_kernel(X_training,
+ X_training,
+ sigma,
+ opt=opt)
+ alpha = cholesky_solve(K_training,
+ Y_training)
X_test = descriptors[-test_size:]
Y_test = energies[-test_size:]
- K_test = gaussian_kernel(X_test, X_training, sigma)
- Y_predicted = np.dot(K_test, alpha)
+ K_test = gaussian_kernel(X_test,
+ X_training,
+ sigma,
+ opt=opt)
+ Y_predicted = np.dot(K_test,
+ alpha)
mae = np.mean(np.abs(Y_predicted - Y_test))
if show_msgs:
@@ -113,7 +123,8 @@ def do_ml(db_path='data',
training_size=1500,
test_size=None,
sigma=1000.0,
- identifiers=["CM"],
+ opt=True,
+ identifiers=['CM'],
show_msgs=True):
"""
Main function that does the whole ML process.
@@ -132,6 +143,7 @@ def do_ml(db_path='data',
test_size: size of the test set to use. If no size is given,
the last remaining molecules are used.
sigma: depth of the kernel.
+ opt: if the optimized algorithm should be used. For benchmarking purposes.
identifiers: list of names (strings) of descriptors to use.
show_msgs: if debug messages should be shown.
"""
@@ -164,23 +176,27 @@ def do_ml(db_path='data',
size=size,
as_eig=as_eig,
bohr_ru=bohr_ru)
+ """
if 'AM' in identifiers:
- compound.gen_am(use_forces=use_forces,
- size=size,
+ compound.gen_hd(size=size,
bohr_ru=bohr_ru)
- if 'BOS' in identifiers:
- compound.gen_bos(size=size,
- stuff=stuff)
+ compound.gen_am(use_forces=use_forces,
+ size=size)
+ """
+ if 'BOB' in identifiers:
+ compound.gen_bob(size=size)
# Create a numpy array for the descriptors.
if 'CM' in identifiers:
- cm_data = np.array([comp.cm for comp in compounds], dtype=float)
+ cm_data = np.array([comp.cm for comp in compounds], dtype=np.float64)
if 'LJM' in identifiers:
- ljm_data = np.array([comp.ljm for comp in compounds], dtype=float)
+ ljm_data = np.array([comp.ljm for comp in compounds], dtype=np.float64)
+ """
if 'AM' in identifiers:
- am_data = np.array([comp.cm for comp in compounds], dtype=float)
- if 'BOS' in identifiers:
- bos_data = np.array([comp.bos for comp in compounds], dtype=float)
+ am_data = np.array([comp.cm for comp in compounds], dtype=np.float64)
+ """
+ if 'BOB' in identifiers:
+ bob_data = np.array([comp.bob for comp in compounds], dtype=np.float64)
toc = time.perf_counter()
tictoc = toc - tic
@@ -194,6 +210,7 @@ def do_ml(db_path='data',
training_size=training_size,
test_size=test_size,
sigma=sigma,
+ opt=opt,
identifier='CM',
show_msgs=show_msgs)
if 'LJM' in identifiers:
@@ -202,23 +219,28 @@ def do_ml(db_path='data',
training_size=training_size,
test_size=test_size,
sigma=sigma,
+ opt=opt,
identifier='LJM',
show_msgs=show_msgs)
+ """
if 'AM' in identifiers:
am_mae, am_tictoc = simple_ml(am_data,
energy_pbe0,
training_size=training_size,
test_size=test_size,
sigma=sigma,
- identifier='CM',
+ opt=opt,
+ identifier='AM',
show_msgs=show_msgs)
- if 'BOS' in identifiers:
- bos_mae, bos_tictoc = simple_ml(bos_data,
+ """
+ if 'BOB' in identifiers:
+ bob_mae, bob_tictoc = simple_ml(bob_data,
energy_pbe0,
training_size=training_size,
test_size=test_size,
sigma=sigma,
- identifier='CM',
+ opt=opt,
+ identifier='BOB',
show_msgs=show_msgs)
# End of program
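
A minimal sketch of calling the updated pipeline with the new opt flag and the renamed BOB identifier (AM stays commented out in this commit); db_path='data' and the other values are simply the defaults visible in the signature above.

from ml_exp.do_ml import do_ml

# Runs CM, LJM and BOB end to end: load qm7db, build descriptors,
# train with the Gaussian kernel + Cholesky solve, report MAE.
do_ml(db_path='data',
      training_size=1500,
      test_size=None,
      sigma=1000.0,
      opt=True,
      identifiers=['CM', 'LJM', 'BOB'],
      show_msgs=True)
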
diff --git a/ml_exp/kernels.py b/ml_exp/kernels.py
index 3914ffc20..feaf9a990 100644
--- a/ml_exp/kernels.py
+++ b/ml_exp/kernels.py
@@ -26,20 +26,30 @@ import numpy as np
def gaussian_kernel(X1,
X2,
- sigma):
+ sigma,
+ opt=True):
"""
Calculates the Gaussian Kernel.
X1: first representations.
X2: second representations.
sigma: kernel width.
+ opt: if the optimized algorithm should be used. For benchmarking purposes.
"""
- inv_sigma = -0.5 / (sigma*sigma)
-
- K = np.zeros((X1.shape[0], X2.shape[0]), dtype=float)
- for i, x1 in enumerate(X1):
- for j, x2 in enumerate(X2):
- f_norm = np.linalg.norm(x1 - x2)
- # print(f_norm)
- K[i, j] = math.exp(inv_sigma * f_norm)
+ i_sigma = -0.5 / (sigma*sigma)
+
+ K = np.zeros((X1.shape[0], X2.shape[0]), dtype=np.float64)
+ if opt:
+ # Faster way of calculating the kernel (no numba support).
+ for i, x1 in enumerate(X1):
+ if X2.ndim == 3:
+ norm = np.linalg.norm(X2 - x1, axis=(1, 2))
+ else:
+ norm = np.linalg.norm(X2 - x1, axis=-1)
+ K[i, :] = np.exp(i_sigma * np.square(norm))
+ else:
+ for i, x1 in enumerate(X1):
+ for j, x2 in enumerate(X2):
+ f_norm = np.linalg.norm(x2 - x1)
+ K[i, j] = math.exp(i_sigma * f_norm**2)
return K
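
The opt=True branch replaces the inner loop with one vectorized norm per row of the kernel. A standalone sketch of that trick (not the module's exact code), checked against the naive double loop:

import numpy as np

def gaussian_kernel_rowwise(X1, X2, sigma):
    # For each x1, compute the norms against the whole X2 block at once;
    # axis=(1, 2) handles matrix descriptors, axis=-1 handles vectors.
    i_sigma = -0.5 / (sigma*sigma)
    K = np.zeros((X1.shape[0], X2.shape[0]), dtype=np.float64)
    axes = (1, 2) if X2.ndim == 3 else -1
    for i, x1 in enumerate(X1):
        norm = np.linalg.norm(X2 - x1, axis=axes)
        K[i, :] = np.exp(i_sigma * np.square(norm))
    return K

# Quick equivalence check against the slow path.
rng = np.random.default_rng(0)
X = rng.normal(size=(5, 23, 23))
K_fast = gaussian_kernel_rowwise(X, X, sigma=1000.0)
K_slow = np.array([[np.exp(-0.5/1000.0**2 * np.linalg.norm(xj - xi)**2)
                    for xj in X] for xi in X])
assert np.allclose(K_fast, K_slow)
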
diff --git a/ml_exp/math.py b/ml_exp/math.py
index da1fdf08f..253abe5d2 100644
--- a/ml_exp/math.py
+++ b/ml_exp/math.py
@@ -39,7 +39,7 @@ def cholesky_solve(K,
size = K.shape[0]
# Solve Lx=y for x.
- x = np.zeros(size, dtype=float)
+ x = np.zeros(size, dtype=np.float64)
x[0] = y[0] / L[0, 0]
for i in range(1, size):
temp_sum = 0.0
@@ -49,7 +49,7 @@ def cholesky_solve(K,
# Now, solve LTa=x for a.
L2 = L.T
- a = np.zeros(size, dtype=float)
+ a = np.zeros(size, dtype=np.float64)
a[size - 1] = x[size - 1] / L2[size - 1, size - 1]
for i in range(0, size - 1)[::-1]:
temp_sum = 0.0
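
Only the float64 change is visible in these hunks, but as a sanity check, a small sketch of how cholesky_solve is expected to behave (assuming it factorizes the symmetric positive-definite K internally, as the substitution steps above suggest):

import numpy as np
from ml_exp.math import cholesky_solve

# Small symmetric positive-definite test system (random data, not QM7).
rng = np.random.default_rng(1)
A = rng.normal(size=(50, 50))
K = A @ A.T + 50.0*np.eye(50)
y = rng.normal(size=50)

alpha = cholesky_solve(K, y)
# Should agree with a generic dense solver up to roundoff.
assert np.allclose(alpha, np.linalg.solve(K, y))
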
diff --git a/ml_exp/qm7db.py b/ml_exp/qm7db.py
index 3fd46b6bc..3ba2c5814 100644
--- a/ml_exp/qm7db.py
+++ b/ml_exp/qm7db.py
@@ -42,14 +42,14 @@ def qm7db(db_path='data',
for i, line in enumerate(lines):
line = line.split()
compounds.append(Compound(f'{db_path}/{line[0]}'))
- compounds[i].pbe0 = float(line[1])
- compounds[i].delta = float(line[1]) - float(line[2])
+ compounds[i].pbe0 = np.float64(line[1])
+ compounds[i].delta = np.float64(line[1]) - np.float64(line[2])
if is_shuffled:
random.seed(r_seed)
random.shuffle(compounds)
- e_pbe0 = np.array([compound.pbe0 for compound in compounds], dtype=float)
- e_delta = np.array([compound.delta for compound in compounds], dtype=float)
+ e_pbe0 = np.array([comp.pbe0 for comp in compounds], dtype=np.float64)
+ e_delta = np.array([comp.delta for comp in compounds], dtype=np.float64)
return compounds, e_pbe0, e_delta
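
A minimal sketch of loading the database with the updated qm7db (the 'data' layout, an index file of energies plus the xyz files it references, is assumed to match what the loop above parses):

from ml_exp.qm7db import qm7db

compounds, e_pbe0, e_delta = qm7db(db_path='data')
# Both energy arrays are now np.float64 after this change.
print(len(compounds), e_pbe0.dtype, e_delta.dtype)
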
diff --git a/ml_exp/representations.py b/ml_exp/representations.py
index ea079595e..be567d67a 100644
--- a/ml_exp/representations.py
+++ b/ml_exp/representations.py
@@ -53,7 +53,7 @@ size. Arrays are not of the right shape.')
'instead of (size).')
size = n
- cm = np.zeros((size, size), dtype=float)
+ cm = np.zeros((size, size), dtype=np.float64)
# Actual calculation of the coulomb matrix.
for i, xyz_i in enumerate(coords):
@@ -120,7 +120,7 @@ size. Arrays are not of the right shape.')
'instead of (size).')
size = n
- lj = np.zeros((size, size), dtype=float)
+ lj = np.zeros((size, size), dtype=np.float64)
# Actual calculation of the lennard-jones matrix.
for i, xyz_i in enumerate(coords):
@@ -166,26 +166,25 @@ size. Arrays are not of the right shape.')
return lj
-def first_neighbor_matrix(coords,
- nc,
- atoms,
- size=23,
- use_forces=False,
- bohr_ru=False):
+def get_helping_data(coords,
+ atoms,
+ nc,
+ size=23,
+ bohr_ru=False):
"""
- Creates the First Neighbor Matrix from the molecule data given.
+ Creates helping data such as the First Neighbor Matrix for the compound.
coords: compound coordinates.
- nc: nuclear charge data.
atoms: list of atoms.
- use_forces: if the use of forces instead of k_cx should be used.
+ nc: nuclear charge data.
+ size: compound size.
bohr_ru: if radius units should be in bohr's radius units.
NOTE: Bond distance of carbon to other elements
are (for atoms present in the qm7 dataset):
- C: 1.20 - 1.54 A (Edited to 1.19 - 1.54 A)
- H: 1.06 - 1.12 A
- O: 1.43 - 2.15 A
- N: 1.47 - 2.10 A
- S: 1.81 - 2.55 A
+ C: 1.19 - 1.54 A, 1.0
+ H: 1.06 - 1.12 A, 1.0
+ O: 1.43 - 2.15 A, 0.8
+ N: 1.47 - 2.10 A, 1.0
+ S: 1.81 - 2.55 A, 0.7
"""
if bohr_ru:
cr = 0.52917721067
@@ -204,88 +203,77 @@ size. Arrays are not of the right shape.')
size = n
# Possible bonds.
- cc_bond = sorted(['C', 'C'])
- ch_bond = sorted(['C', 'H'])
- co_bond = sorted(['C', 'O'])
- cn_bond = sorted(['C', 'N'])
- cs_bond = sorted(['C', 'S'])
-
- fnm = np.zeros((size, size), dtype=float)
-
+ cc_bond = ''.join(sorted(['C', 'C']))
+ ch_bond = ''.join(sorted(['C', 'H']))
+ co_bond = ''.join(sorted(['C', 'O']))
+ cn_bond = ''.join(sorted(['C', 'N']))
+ cs_bond = ''.join(sorted(['C', 'S']))
+ pos_bonds = {cc_bond: (1.19, 1.54, 1.0), ch_bond: (1.06, 1.12, 1.0),
+ co_bond: (1.43, 2.15, 0.8), cn_bond: (1.47, 2.10, 1.0),
+ cs_bond: (1.81, 2.55, 0.7)}
+
+ fnm = np.zeros((size, size), dtype=bool)
bonds = []
- forces = []
+ bonds_i = []
+ bonds_k = []
+ bonds_f = []
for i, xyz_i in enumerate(coords):
for j, xyz_j in enumerate(coords):
# Ignore the diagonal.
if i != j:
- bond = sorted([atoms[i], atoms[j]])
- rv = xyz_i - xyz_j
- r = np.linalg.norm(rv)/cr
-
- # Check for each type of bond.
- if (cc_bond == bond) and (r >= 1.19 and r <= 1.54):
- fnm[i, j] = 1.0
- if j > i:
- bonds.append((i, j))
- if use_forces:
- forces.append(rv*nc[i]*nc[j]/r**3)
- elif (ch_bond == bond) and (r >= 1.06 and r <= 1.12):
- fnm[i, j] = 1.0
- if j > i:
- bonds.append((i, j))
- if use_forces:
- forces.append(rv*nc[i]*nc[j]/r**3)
- elif (co_bond == bond) and (r >= 1.43 and r <= 2.15):
- fnm[i, j] = 0.8
- if j > i:
- bonds.append((i, j))
- if use_forces:
- forces.append(rv*nc[i]*nc[j]/r**3)
- elif (cn_bond == bond) and (r >= 1.47 and r <= 2.10):
- fnm[i, j] = 1.0
- if j > i:
- bonds.append((i, j))
- if use_forces:
- forces.append(rv*nc[i]*nc[j]/r**3)
- elif (cs_bond == bond) and (r >= 1.81 and r <= 2.55):
- fnm[i, j] = 0.7
- if j > i:
- bonds.append((i, j))
- if use_forces:
- forces.append(rv*nc[i]*nc[j]/r**3)
-
- return fnm, bonds, forces
-
-
-def adjacency_matrix(fnm,
- bonds,
- forces,
- size=22):
+ bond = ''.join(sorted([atoms[i], atoms[j]]))
+ if bond in pos_bonds.keys():
+ r_min = pos_bonds[bond][0]
+ r_max = pos_bonds[bond][1]
+ rv = xyz_i - xyz_j
+ r = np.linalg.norm(rv)/cr
+ if r >= r_min and r <= r_max:
+ fnm[i, j] = True
+ # Only add to the list if in the upper triangle.
+ if j > i:
+ bonds.append(bond)
+ bonds_i.append((i, j))
+ bonds_k.append(pos_bonds[bond][2])
+ bonds_f.append(rv*nc[i]*nc[j]/r**3)
+
+ return fnm, bonds, bonds_i, bonds_k, bonds_f
+
+
+def adjacency_matrix(bonds_i,
+ bonds_k,
+ bonds_f,
+ use_forces=False,
+ size=23):
"""
Calculates the adjacency matrix given the bond list.
- fnm: first neighbour matrix.
- bonds: list of bonds (tuple of indexes).
- forces: list of forces.
+ bonds: list of bond names.
+ bonds_i: list of bond indexes (tuple of indexes).
+ bonds_k: list of k_cx values.
+ bonds_f: list of force values.
+ use_forces: if the use of forces instead of k_cx should be used.
size: compound size.
"""
- n = len(bonds)
+ if bonds_i is None:
+ raise ValueError('The helping data hasn\'t been initialized for\
+the current compound.')
+ n = len(bonds_i)
if size < n:
print('Error. Compound size (n) is greater than (size). Using (n)\
instead of (size).')
size = n
- am = np.zeros((size, size), dtype=float)
+ am = np.zeros((size, size), dtype=np.float64)
- for i, bond_i in enumerate(bonds):
- for j, bond_j in enumerate(bonds):
+ for i, bond_i in enumerate(bonds_i):
+ for j, bond_j in enumerate(bonds_i):
# Ignore the diagonal.
if i != j:
if (bond_i[0] in bond_j) or (bond_i[1] in bond_j):
- if forces:
- am[i, j] = np.dot(forces[i], forces[j])
+ if use_forces:
+ am[i, j] = np.dot(bonds_f[i], bonds_f[j])
else:
- am[i, j] = fnm[bond_i[0], bond_i[1]]
+ am[i, j] = bonds_k[i]
return am
@@ -308,10 +296,9 @@ def check_bond(bags,
return False, None
-def bag_of_stuff(cm,
+def bag_of_bonds(cm,
atoms,
- size=23,
- stuff='bonds'):
+ size=23):
"""
Creates the Bag of Bonds using the Coulomb Matrix.
cm: coulomb matrix.
@@ -334,7 +321,7 @@ generated as the vector of eigenvalues, try (re-)generating the CM.')
size = n
# Bond max length, calculated using only the upper triangular matrix.
- bond_size = int((size * size - size)/2 + size)
+ bond_size = np.int32((size * size - size)/2 + size)
# List where each bag data is stored.
bags = []
@@ -366,19 +353,21 @@ generated as the vector of eigenvalues, try (re-)generating the CM.')
bonds.append(''.join(sorted([a_i, a_j])))
bonds = atom_list + bonds
- # Create the final vector for the bos.
- bos = np.zeros(bond_size, dtype=float)
+ # Create the final vector for the bob.
+ bob = np.zeros(bond_size, dtype=np.float64)
c_i = 0
for i, bond in enumerate(bonds):
checker = check_bond(bags, bond)
if checker[0]:
for j, num in enumerate(sorted(bags[checker[1]][1:])[::-1]):
- # Use c_i as the index for bos if the zero padding should
+ # Use c_i as the index for bob if the zero padding should
# be at the end of the vector instead of between each bond.
- bos[i*size + j] = num
+ # bob[i*size + j] = num
+ bob[c_i] = num
c_i += 1
else:
print(f'Error. Bond {bond} from bond list couldn\'t be found',
'in the bags list. This could be a case where the atom',
'is only present once in the molecule.')
- return bos
+
+ return bob
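
A minimal sketch of the bond-key scheme that get_helping_data and bag_of_bonds now share: the two atom symbols are sorted and joined, so ('H', 'C') and ('C', 'H') map to the same key, and each pos_bonds entry carries (r_min, r_max, k_cx) as listed in the docstring above.

# Standalone illustration; values mirror the pos_bonds dict above.
pos_bonds = {'CC': (1.19, 1.54, 1.0), 'CH': (1.06, 1.12, 1.0),
             'CO': (1.43, 2.15, 0.8), 'CN': (1.47, 2.10, 1.0),
             'CS': (1.81, 2.55, 0.7)}

def bond_key(a_i, a_j):
    return ''.join(sorted([a_i, a_j]))

key = bond_key('H', 'C')                  # 'CH'
if key in pos_bonds:
    r_min, r_max, k_cx = pos_bonds[key]   # (1.06, 1.12, 1.0)
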