summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid Luevano Alvarado <55825613+luevano@users.noreply.github.com>2020-02-28 06:48:42 -0700
committerGitHub <noreply@github.com>2020-02-28 06:48:42 -0700
commit86faabeac5aed75c1dfc883fe757c21eba94b645 (patch)
tree463515502ff326afa8b0dfeeaa15e3ba00ee08b2
parent7920c4b8471741cb9b5a9aa06da436c1b678f62a (diff)
parent6ad25bb4cd5822fa84d1c4cec54952c6fd8df2f6 (diff)
Merge pull request #2 from luevano/rewrite
Rewrite
-rw-r--r--ml_exp/__init__.py40
-rw-r--r--ml_exp/__main__.py49
-rw-r--r--ml_exp/adj_matrix.py131
-rw-r--r--ml_exp/bob.py117
-rw-r--r--ml_exp/bostuff.py99
-rw-r--r--ml_exp/c_matrix.py179
-rw-r--r--ml_exp/compound.py153
-rw-r--r--ml_exp/data.py141
-rw-r--r--ml_exp/do_ml.py302
-rw-r--r--ml_exp/kernels.py (renamed from ml_exp/gauss_kernel.py)22
-rw-r--r--ml_exp/lj_matrix.py222
-rw-r--r--ml_exp/math.py (renamed from ml_exp/cholesky_solve.py)26
-rw-r--r--ml_exp/misc.py38
-rw-r--r--ml_exp/parallel_create_matrices.py85
-rw-r--r--ml_exp/qm7db.py (renamed from ml_exp/frob_norm.py)46
-rw-r--r--ml_exp/read_qm7_data.py162
-rw-r--r--ml_exp/representations.py384
-rw-r--r--requirements.txt2
18 files changed, 909 insertions, 1289 deletions
diff --git a/ml_exp/__init__.py b/ml_exp/__init__.py
index 06425f6a2..48ece56c8 100644
--- a/ml_exp/__init__.py
+++ b/ml_exp/__init__.py
@@ -20,29 +20,21 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
-from ml_exp.read_qm7_data import read_nc_data, read_db_data, read_qm7_data
-from ml_exp.c_matrix import c_matrix, c_matrix_multiple
-from ml_exp.lj_matrix import lj_matrix, lj_matrix_multiple
-from ml_exp.frob_norm import frob_norm
-from ml_exp.gauss_kernel import gauss_kernel
-from ml_exp.cholesky_solve import cholesky_solve
-from ml_exp.do_ml import do_ml
-from ml_exp.parallel_create_matrices import parallel_create_matrices
-from ml_exp.misc import plot_benchmarks
+from ml_exp.compound import Compound
+from ml_exp.representations import coulomb_matrix, lennard_jones_matrix,\
+ first_neighbor_matrix, adjacency_matrix, check_bond, bag_of_stuff
+from ml_exp.math import cholesky_solve
+from ml_exp.qm7db import qm7db
+from ml_exp.do_ml import simple_ml, do_ml
-
-# If somebody does "from package import *", this is what they will
-# be able to access:
-__all__ = ['read_nc_data',
- 'read_db_data',
- 'read_qm7_data',
- 'c_matrix',
- 'c_matrix_multiple',
- 'lj_matrix',
- 'lj_matrix_multiple',
- 'frob_norm',
- 'gauss_kernel',
+__all__ = ['Compound',
+ 'coulomb_matrix',
+ 'lennard_jones_matrix',
+ 'first_neighbor_matrix',
+ 'adjacency_matrix',
+ 'check_bond',
+ 'bag_of_stuff',
'cholesky_solve',
- 'do_ml',
- 'parallel_create_matrices',
- 'plot_benchmarks']
+ 'qm7db',
+ 'simple_ml',
+ 'do_ml']
diff --git a/ml_exp/__main__.py b/ml_exp/__main__.py
index aaeb18714..0d9b87bad 100644
--- a/ml_exp/__main__.py
+++ b/ml_exp/__main__.py
@@ -20,52 +20,5 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
-# from ml_exp.do_ml import do_ml
-# from ml_exp.misc import plot_benchmarks
-from ml_exp.read_qm7_data import read_qm7_data
-from ml_exp.adj_matrix import fneig_matrix, adj_matrix
-from ml_exp.c_matrix import c_matrix
-from ml_exp.bob import bob
-from ml_exp.bostuff import bok_cx
-
if __name__ == '__main__':
- """
- do_ml(min_training_size=1500,
- max_training_size=2000,
- training_increment_size=500,
- test_size=None,
- ljm_diag_value=None,
- ljm_sigma=1.0,
- ljm_epsilon=1.0,
- r_seed=111,
- save_benchmarks=False,
- show_msgs=True)
- """
- # plot_benchmarks()
- """
- xyz, nc, pbe0, delta, atoms = read_qm7_data(return_atoms=True)
- for i in range(1):
- fnm, bonds, forces = fneig_matrix(atoms[i], xyz[i], nc[i], True)
- am = adj_matrix(bonds, forces)
-
- print(f'{i} first neighbor matrix\n{fnm}')
- print(f'{i} bond list\n{bonds}')
- print(f'{i} force list\n{forces}')
- print(f'{i} adjacency matrix\n{am}')
- print('-'*30)
- print('OK!')
- """
- xyz, nc, pbe0, delta, atoms = read_qm7_data(return_atoms=True)
- for i in range(1):
- cm = c_matrix(xyz[i], nc[i], as_eig=False)
- bob_i = bob(cm, atoms[i])
-
- print(f'{i} coulomb matrix\n{cm}')
- print(f'{i} bag of bonds\n{bob}')
-
- fnm, bonds = fneig_matrix(atoms[i], xyz[i], nc[i], False)
- am = adj_matrix(fnm, bonds, None)
- bok_cx_i = bok_cx(am, atoms[i])
-
- print(f'{i} adjacency matrix\n{am}')
- print(f'{i} bag of k_cx\n{bok_cx_i}')
+ print('Everything OK!\n\tThis is a placeholder.')
diff --git a/ml_exp/adj_matrix.py b/ml_exp/adj_matrix.py
deleted file mode 100644
index de484b5be..000000000
--- a/ml_exp/adj_matrix.py
+++ /dev/null
@@ -1,131 +0,0 @@
-"""MIT License
-
-Copyright (c) 2019 David Luevano Alvarado
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-"""
-from numpy import array, zeros, dot
-from numpy.linalg import norm
-from ml_exp.misc import printc
-
-
-def fneig_matrix(atoms,
- xyz,
- nc=None,
- use_forces=False):
- """
- Creates the first neighbor matrix of the given molecule data.
- atoms: list of atoms.
- xyz: matrix of atomic coords.
- nc: nuclear charge info.
- use_forces: if the use of forces instead of k_cx should be used.
- NOTE: Bond distance of carbon to other elements
- are (for atoms present in the qm7 dataset):
- C: 1.20 - 1.53 A
- H: 1.06 - 1.12 A
- O: 1.43 - 2.15 A
- N: 1.47 - 2.10 A
- S: 1.81 - 2.55 A
- """
- if use_forces and nc is None:
- printc(f'Error. use_forces={use_forces} but nc={nc}', 'RED')
- return None
- # Possible bonds.
- cc_bond = sorted(['C', 'C'])
- ch_bond = sorted(['C', 'H'])
- co_bond = sorted(['C', 'O'])
- cn_bond = sorted(['C', 'N'])
- cs_bond = sorted(['C', 'S'])
-
- # Number of atoms, empty matrix and bond list.
- n = len(atoms)
- fnm = array(zeros((n, n)), dtype=float)
- bonds = []
- forces = []
- for i, xyz_i in enumerate(xyz):
- for j, xyz_j in enumerate(xyz):
- # Ignore the diagonal.
- if i != j:
- bond = sorted([atoms[i], atoms[j]])
- rv = xyz_i - xyz_j
- r = norm(rv)
- # Check for each type of bond.
- if (cc_bond == bond) and (r >= 1.20 and r <= 1.53):
- fnm[i, j] = 1.0
- if j > i:
- bonds.append((i, j))
- if use_forces:
- forces.append(rv*nc[i]*nc[j]/r**3)
- elif (ch_bond == bond) and (r >= 1.06 and r <= 1.12):
- fnm[i, j] = 1.0
- if j > i:
- bonds.append((i, j))
- if use_forces:
- forces.append(rv*nc[i]*nc[j]/r**3)
- elif (co_bond == bond) and (r >= 1.43 and r <= 2.15):
- fnm[i, j] = 0.8
- if j > i:
- bonds.append((i, j))
- if use_forces:
- forces.append(rv*nc[i]*nc[j]/r**3)
- elif (cn_bond == bond) and (r >= 1.47 and r <= 2.10):
- fnm[i, j] = 1.0
- if j > i:
- bonds.append((i, j))
- if use_forces:
- forces.append(rv*nc[i]*nc[j]/r**3)
- elif (cs_bond == bond) and (r >= 1.81 and r <= 2.55):
- fnm[i, j] = 0.7
- if j > i:
- bonds.append((i, j))
- if use_forces:
- forces.append(rv*nc[i]*nc[j]/r**3)
- if use_forces:
- return fnm, bonds, forces
- return fnm, bonds
-
-
-def adj_matrix(fneig_matrix,
- bonds,
- forces=None,
- max_len=25):
- """
- Calculates the adjacency matrix given the bond list.
- bonds: list of bonds (tuple of indexes).
- max_len: maximum amount of atoms in molecule.
- forces: list of forces.
- """
- n = len(bonds)
-
- if max_len < n:
- print(''.join(['Error. Molecule matrix dimension (mol_n) is ',
- 'greater than max_len. Using mol_n.']))
- max_len = n
-
- am = array(zeros((max_len, max_len)))
- for i, bond_i in enumerate(bonds):
- for j, bond_j in enumerate(bonds):
- # Ignore the diagonal.
- if i != j:
- if (bond_i[0] in bond_j) or (bond_i[1] in bond_j):
- if forces:
- am[i, j] = dot(forces[i], forces[j])
- else:
- am[i, j] = fneig_matrix[bond_i[0], bond_i[1]]
- return am
diff --git a/ml_exp/bob.py b/ml_exp/bob.py
deleted file mode 100644
index 86efecdb4..000000000
--- a/ml_exp/bob.py
+++ /dev/null
@@ -1,117 +0,0 @@
-"""MIT License
-
-Copyright (c) 2019 David Luevano Alvarado
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-"""
-from numpy import array, zeros
-from collections import Counter
-
-
-def check_bond(bags,
- bond):
- """
- Checks if a bond is in a bag.
- bags: list of bags, containing a bag per entry, which in turn
- contains a list of bond-values.
- bond: bond to check.
- """
- if bags == []:
- return False, None
-
- for i, bag in enumerate(bags):
- if bag[0] == bond:
- return True, i
-
- return False, None
-
-
-def bob(c_matrix,
- atoms,
- max_n=25,
- max_bond_len=325):
- """
- Creates the bag of bond using the coulomb matrix data.
- c_matrix: coulomb matrix.
- atoms: list of atoms.
- max_n: maximum amount of atoms.
- max_bond_len: maximum amount of bonds in molecule.
- """
- n = len(atoms)
- bond_n = (n * n - n) / 2 + n
- n_r = range(n)
-
- if max_n < n:
- print(''.join(['Error. Molecule matrix dimension (mol_n) is ',
- 'greater than max_len. Using mol_n.']))
- max_n = n
-
- if max_bond_len < bond_n:
- print(''.join(['Error. Molecule bond lenght (bond_n) is ',
- 'greater than max_bond_len. Using bond_n.']))
- max_bond_len = bond_n
-
- # List where each bag data is stored.
- bags = []
- for i in n_r:
- for j in n_r:
- # Work only in the upper triangle of the coulomb matrix.
- if j >= i:
- # Get the string of the current bond.
- if i == j:
- current_bond = atoms[i]
- else:
- current_bond = ''.join(sorted([atoms[i], atoms[j]]))
-
- # Check if that bond is already in a bag.
- checker = check_bond(bags, current_bond)
- # Either create a new bag or add values to an existing one.
- if not checker[0]:
- bags.append([current_bond, c_matrix[i, j]])
- else:
- bags[checker[1]].append(c_matrix[i, j])
-
- # Create the actual bond list ordered.
- atom_counter = Counter(atoms)
- atom_list = sorted(list(set(atoms)))
- bonds = []
- for i, a_i in enumerate(atom_list):
- if atom_counter[a_i] > 1:
- for a_j in atom_list[i:]:
- bonds.append(''.join(sorted([a_i, a_j])))
- bonds = atom_list + bonds
-
- # Create the final vector for the bob.
- bob = array(zeros(max_bond_len), dtype=float)
- c_i = 0
- for i, bond in enumerate(bonds):
- checker = check_bond(bags, bond)
- if checker[0]:
- for j, num in enumerate(sorted(bags[checker[1]][1:])[::-1]):
- # Use c_i as the index for bob if the zero padding should
- # be at the end of the vector instead of between each bond.
- bob[i*max_n + j] = num
- c_i += 1
- # This is set to false because this was a debugging measure.
- else:
- print(''.join([f'Error. Bond {bond} from bond list coudn\'t',
- ' be found in the bags list. This could be',
- ' a case where the atom is only present once',
- ' in the molecule.']))
- return bob
diff --git a/ml_exp/bostuff.py b/ml_exp/bostuff.py
deleted file mode 100644
index 1741e26cd..000000000
--- a/ml_exp/bostuff.py
+++ /dev/null
@@ -1,99 +0,0 @@
-"""MIT License
-
-Copyright (c) 2019 David Luevano Alvarado
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-"""
-from numpy import array, zeros
-from collections import Counter
-from ml_exp.bob import check_bond
-
-
-def bok_cx(adj_matrix,
- atoms,
- max_n=25,
- max_bond_len=325):
- """
- Creates the bag of k_cx using the adjacency matrix data.
- adj_matrix: adjacency matrix.
- max_n: maximum amount of atoms.
- max_bond_len: maximum amount of bonds in molecule.
- """
- n = len(atoms)
- bond_n = (n * n - n) / 2 + n
- n_r = range(n)
-
- if max_n < n:
- print(''.join(['Error. Molecule matrix dimension (mol_n) is ',
- 'greater than max_len. Using mol_n.']))
- max_n = n
-
- if max_bond_len < bond_n:
- print(''.join(['Error. Molecule bond length (bond_n) is ',
- 'greater than max_bond_len. Using bond_n.']))
- max_bond_len = bond_n
-
- # List where each bag data is stored.
- bags = []
- for i in n_r:
- for j in n_r:
- # Work only in the upper triangle of the coulomb matrix.
- if j >= i:
- # Get the string of the current bond.
- if i == j:
- current_bond = atoms[i]
- else:
- current_bond = ''.join(sorted([atoms[i], atoms[j]]))
-
- # Check if that bond is already in a bag.
- checker = check_bond(bags, current_bond)
- # Either create a new bag or add values to an existing one.
- if not checker[0]:
- bags.append([current_bond, adj_matrix[i, j]])
- else:
- bags[checker[1]].append(adj_matrix[i, j])
-
- # Create the actual bond list ordered.
- atom_counter = Counter(atoms)
- atom_list = sorted(list(set(atoms)))
- bonds = []
- for i, a_i in enumerate(atom_list):
- if atom_counter[a_i] > 1:
- for a_j in atom_list[i:]:
- bonds.append(''.join(sorted([a_i, a_j])))
- bonds = atom_list + bonds
-
- # Create the final vector for the bok_cx.
- bok_cx = array(zeros(max_bond_len), dtype=float)
- c_i = 0
- for i, bond in enumerate(bonds):
- checker = check_bond(bags, bond)
- if checker[0]:
- for j, num in enumerate(sorted(bags[checker[1]][1:])[::-1]):
- # Use c_i as the index for bok_cx if the zero padding should
- # be at the end of the vector instead of between each bond.
- bok_cx[i*max_n + j] = num
- c_i += 1
- # This is set to false because this was a debugging measure.
- else:
- print(''.join([f'Error. Bond {bond} from bond list coudn\'t',
- ' be found in the bags list. This could be',
- ' a case where the atom is only present once',
- ' in the molecule.']))
- return bok_cx
diff --git a/ml_exp/c_matrix.py b/ml_exp/c_matrix.py
deleted file mode 100644
index 00034a660..000000000
--- a/ml_exp/c_matrix.py
+++ /dev/null
@@ -1,179 +0,0 @@
-"""MIT License
-
-Copyright (c) 2019 David Luevano Alvarado
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-"""
-import time
-import math
-import numpy as np
-from numpy.linalg import eig
-from ml_exp.misc import printc
-
-
-def c_matrix(mol_data,
- nc_data,
- max_len=25,
- as_eig=True,
- bohr_radius_units=False):
- """
- Creates the Coulomb Matrix from the molecule data given.
- mol_data: molecule data, matrix of atom coordinates.
- nc_data: nuclear charge data, array of atom data.
- max_len: maximum amount of atoms in molecule.
- as_eig: if data should be returned as matrix or array of eigenvalues.
- bohr_radius_units: if units should be in bohr's radius units.
- """
- if bohr_radius_units:
- conversion_rate = 0.52917721067
- else:
- conversion_rate = 1
-
- mol_n = len(mol_data)
- mol_nr = range(mol_n)
-
- if not mol_n == len(nc_data):
- print(''.join(['Error. Molecule matrix dimension is different ',
- 'than the nuclear charge array dimension.']))
- else:
- if max_len < mol_n:
- print(''.join(['Error. Molecule matrix dimension (mol_n) is ',
- 'greater than max_len. Using mol_n.']))
- max_len = None
-
- if max_len:
- cm = np.zeros((max_len, max_len))
- ml_r = range(max_len)
-
- # Actual calculation of the coulomb matrix.
- for i in ml_r:
- if i < mol_n:
- x_i = mol_data[i, 0]
- y_i = mol_data[i, 1]
- z_i = mol_data[i, 2]
- Z_i = nc_data[i]
- else:
- break
-
- for j in ml_r:
- if j < mol_n:
- x_j = mol_data[j, 0]
- y_j = mol_data[j, 1]
- z_j = mol_data[j, 2]
- Z_j = nc_data[j]
-
- x = (x_i-x_j)**2
- y = (y_i-y_j)**2
- z = (z_i-z_j)**2
-
- if i == j:
- cm[i, j] = (0.5*Z_i**2.4)
- else:
- cm[i, j] = (conversion_rate*Z_i*Z_j/math.sqrt(x
- + y
- + z))
- else:
- break
-
- # Now the value will be returned.
- if as_eig:
- cm_sorted = np.sort(eig(cm)[0])[::-1]
- # Thanks to SO for the following lines of code.
- # https://stackoverflow.com/a/43011036
-
- # Keep zeros at the end.
- mask = cm_sorted != 0.
- f_mask = mask.sum(0, keepdims=1) >\
- np.arange(cm_sorted.shape[0]-1, -1, -1)
-
- f_mask = f_mask[::-1]
- cm_sorted[f_mask] = cm_sorted[mask]
- cm_sorted[~f_mask] = 0.
-
- return cm_sorted
-
- else:
- return cm
-
- else:
- cm_temp = []
- # Actual calculation of the coulomb matrix.
- for i in mol_nr:
- x_i = mol_data[i, 0]
- y_i = mol_data[i, 1]
- z_i = mol_data[i, 2]
- Z_i = nc_data[i]
-
- cm_row = []
- for j in mol_nr:
- x_j = mol_data[j, 0]
- y_j = mol_data[j, 1]
- z_j = mol_data[j, 2]
- Z_j = nc_data[j]
-
- x = (x_i-x_j)**2
- y = (y_i-y_j)**2
- z = (z_i-z_j)**2
-
- if i == j:
- cm_row.append(0.5*Z_i**2.4)
- else:
- cm_row.append(conversion_rate*Z_i*Z_j/math.sqrt(x
- + y
- + z))
-
- cm_temp.append(np.array(cm_row))
-
- cm = np.array(cm_temp)
- # Now the value will be returned.
- if as_eig:
- return np.sort(eig(cm)[0])[::-1]
- else:
- return cm
-
-
-def c_matrix_multiple(mol_data,
- nc_data,
- pipe=None,
- max_len=25,
- as_eig=True,
- bohr_radius_units=False):
- """
- Calculates the Coulomb Matrix of multiple molecules.
- mol_data: molecule data, matrix of atom coordinates.
- nc_data: nuclear charge data, array of atom data.
- pipe: for multiprocessing purposes. Sends the data calculated
- through a pipe.
- max_len: maximum amount of atoms in molecule.
- as_eig: if data should be returned as matrix or array of eigenvalues.
- bohr_radius_units: if units should be in bohr's radius units.
- """
- printc('Coulomb Matrices calculation started.', 'CYAN')
- tic = time.perf_counter()
-
- cm_data = np.array([c_matrix(mol, nc, max_len, as_eig, bohr_radius_units)
- for mol, nc in zip(mol_data, nc_data)])
-
- toc = time.perf_counter()
- printc('\tCM calculation took {:.4f} seconds.'.format(toc - tic), 'GREEN')
-
- if pipe:
- pipe.send(cm_data)
-
- return cm_data
diff --git a/ml_exp/compound.py b/ml_exp/compound.py
new file mode 100644
index 000000000..1598fa482
--- /dev/null
+++ b/ml_exp/compound.py
@@ -0,0 +1,153 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+import numpy as np
+from ml_exp.data import NUCLEAR_CHARGE
+from ml_exp.representations import coulomb_matrix, lennard_jones_matrix,\
+ first_neighbor_matrix, adjacency_matrix, bag_of_stuff
+
+
+class Compound:
+ def __init__(self,
+ xyz=None):
+ """
+ Initialization of the Compound.
+ xyz: (path to) the xyz file.
+ """
+ self.name = None
+
+ self.n = None
+ self.extra = None # In case a comment is in the compound file.
+ self.atoms = None
+ self.atoms_nc = None
+ self.coordinates = None
+ self.pbe0 = None
+ self.delta = None
+
+ self.cm = None
+ self.ljm = None
+ self.am = None
+ self.bos = None
+
+ if xyz is not None:
+ self.read_xyz(xyz)
+
+ def gen_cm(self,
+ size=23,
+ as_eig=True,
+ bohr_ru=False):
+ """
+ Generate the Coulomb Matrix for the compund.
+ size: compound size.
+ as_eig: if the representation should be as the eigenvalues.
+ bohr_ru: if radius units should be in bohr's radius units.
+ """
+ self.cm = coulomb_matrix(self.coordinates,
+ self.atoms_nc,
+ size=size,
+ as_eig=as_eig,
+ bohr_ru=bohr_ru)
+
+ def gen_ljm(self,
+ diag_value=None,
+ sigma=1.0,
+ epsilon=1.0,
+ size=23,
+ as_eig=True,
+ bohr_ru=False):
+ """
+ Generate the Lennard-Jones Matrix for the compund.
+ diag_value: if special diagonal value is to be used.
+ sigma: sigma value.
+ epsilon: epsilon value.
+ size: compound size.
+ as_eig: if the representation should be as the eigenvalues.
+ bohr_ru: if radius units should be in bohr's radius units.
+ """
+ self.ljm = lennard_jones_matrix(self.coordinates,
+ self.atoms_nc,
+ diag_value=diag_value,
+ sigma=sigma,
+ epsilon=epsilon,
+ size=size,
+ as_eig=as_eig,
+ bohr_ru=bohr_ru)
+
+ def gen_am(self,
+ use_forces=False,
+ size=23,
+ bohr_ru=False):
+ """
+ Generate the Adjacency Matrix for the compund.
+ use_forces: if the use of forces instead of k_cx should be used.
+ size: compound size.
+ bohr_ru: if radius units should be in bohr's radius units.
+ """
+ # First, generate the first neighor matrix.
+ fnm, bonds, forces = first_neighbor_matrix(self.coordinates,
+ self.atoms_nc,
+ self.atoms,
+ size=size,
+ use_forces=use_forces,
+ bohr_ru=bohr_ru)
+
+ # Now, generate the adjacency matrix.
+ self.am = adjacency_matrix(fnm,
+ bonds,
+ forces,
+ size=size)
+
+ def gen_bos(self,
+ size=23,
+ stuff='bonds'):
+ """
+ Generate the Bag of Stuff for the compound.
+ size: compound size.
+ stuff: elements of the bag, by default the known bag of bonds.
+ """
+ self.bos = bag_of_stuff(self.cm,
+ self.atoms,
+ size=size,
+ stuff=stuff)
+
+ def read_xyz(self,
+ filename):
+ """
+ Reads an xyz file and adds the corresponding data to the Compound.
+ filename: (path to) the xyz file.
+ """
+ with open(filename, 'r') as f:
+ lines = f.readlines()
+
+ self.name = filename.split('/')[-1]
+ self.n = int(lines[0])
+ self.extra = lines[1]
+ self.atoms = []
+ self.atoms_nc = np.empty(self.n, dtype=int)
+ self.coordinates = np.empty((self.n, 3), dtype=float)
+
+ for i, atom in enumerate(lines[2:self.n + 2]):
+ atom_d = atom.split()
+
+ self.atoms.append(atom_d[0])
+ self.atoms_nc[i] = NUCLEAR_CHARGE[atom_d[0]]
+ self.coordinates[i] = np.asarray(atom_d[1:4], dtype=float)
diff --git a/ml_exp/data.py b/ml_exp/data.py
new file mode 100644
index 000000000..243aff168
--- /dev/null
+++ b/ml_exp/data.py
@@ -0,0 +1,141 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+NUCLEAR_CHARGE = {
+ 'H': 1,
+ 'He': 2,
+ 'Li': 3,
+ 'Be': 4,
+ 'B': 5,
+ 'C': 6,
+ 'N': 7,
+ 'O': 8,
+ 'F': 9,
+ 'Ne': 10,
+ 'Na': 11,
+ 'Mg': 12,
+ 'Al': 13,
+ 'Si': 14,
+ 'P': 15,
+ 'S': 16,
+ 'Cl': 17,
+ 'Ar': 18,
+ 'K': 19,
+ 'Ca': 20,
+ 'Sc': 21,
+ 'Ti': 22,
+ 'V': 23,
+ 'Cr': 24,
+ 'Mn': 25,
+ 'Fe': 26,
+ 'Co': 27,
+ 'Ni': 28,
+ 'Cu': 29,
+ 'Zn': 30,
+ 'Ga': 31,
+ 'Ge': 32,
+ 'As': 33,
+ 'Se': 34,
+ 'Br': 35,
+ 'Kr': 36,
+ 'Rb': 37,
+ 'Sr': 38,
+ 'Y': 39,
+ 'Zr': 40,
+ 'Nb': 41,
+ 'Mo': 42,
+ 'Tc': 43,
+ 'Ru': 44,
+ 'Rh': 45,
+ 'Pd': 46,
+ 'Ag': 47,
+ 'Cd': 48,
+ 'In': 49,
+ 'Sn': 50,
+ 'Sb': 51,
+ 'Te': 52,
+ 'I': 53,
+ 'Xe': 54,
+ 'Cs': 55,
+ 'Ba': 56,
+ 'La': 57,
+ 'Ce': 58,
+ 'Pr': 59,
+ 'Nd': 60,
+ 'Pm': 61,
+ 'Sm': 62,
+ 'Eu': 63,
+ 'Gd': 64,
+ 'Tb': 65,
+ 'Dy': 66,
+ 'Ho': 67,
+ 'Er': 68,
+ 'Tm': 69,
+ 'Yb': 70,
+ 'Lu': 71,
+ 'Hf': 72,
+ 'Ta': 73,
+ 'W': 74,
+ 'Re': 75,
+ 'Os': 76,
+ 'Ir': 77,
+ 'Pt': 78,
+ 'Au': 79,
+ 'Hg': 80,
+ 'Tl': 81,
+ 'Pb': 82,
+ 'Bi': 83,
+ 'Po': 84,
+ 'At': 85,
+ 'Rn': 86,
+ 'Fr': 87,
+ 'Ra': 88,
+ 'Ac': 89,
+ 'Th': 90,
+ 'Pa': 91,
+ 'U': 92,
+ 'Np': 93,
+ 'Pu': 94,
+ 'Am': 95,
+ 'Cm': 96,
+ 'Bk': 97,
+ 'Cf': 98,
+ 'Es': 99,
+ 'Fm': 100,
+ 'Md': 101,
+ 'No': 102,
+ 'Lr': 103,
+ 'Rf': 104,
+ 'Db': 105,
+ 'Sg': 106,
+ 'Bh': 107,
+ 'Hs': 108,
+ 'Mt': 109,
+ 'Ds ': 110,
+ 'Rg ': 111,
+ 'Cn ': 112,
+ 'Nh': 113,
+ 'Fl': 114,
+ 'Mc': 115,
+ 'Lv': 116,
+ 'Ts': 117,
+ 'Og': 118}
diff --git a/ml_exp/do_ml.py b/ml_exp/do_ml.py
index 110242a1d..d8ee415bf 100644
--- a/ml_exp/do_ml.py
+++ b/ml_exp/do_ml.py
@@ -22,206 +22,206 @@ SOFTWARE.
"""
import time
import numpy as np
-from multiprocessing import Process, Pipe
from ml_exp.misc import printc
-from ml_exp.gauss_kernel import gauss_kernel
-from ml_exp.cholesky_solve import cholesky_solve
-from ml_exp.read_qm7_data import read_qm7_data
-from ml_exp.parallel_create_matrices import parallel_create_matrices
-
-
-def ml(desc_data,
- energy_data,
- training_size,
- desc_type=None,
- pipe=None,
- test_size=None,
- sigma=1000.0,
- show_msgs=True):
+from ml_exp.kernels import gaussian_kernel
+from ml_exp.math import cholesky_solve
+from ml_exp.qm7db import qm7db
+
+
+def simple_ml(descriptors,
+ energies,
+ training_size,
+ test_size=None,
+ sigma=1000.0,
+ identifier=None,
+ show_msgs=True):
"""
- Does the ML methodology.
- desc_data: descriptor (or representation) data.
- energy_data: energy data associated with desc_data.
+ Basic ML methodology for a single descriptor type.
+ descriptors: array of descriptors.
+ energies: array of energies.
training_size: size of the training set to use.
- desc_type: string with the name of the descriptor used.
- pipe: for multiprocessing purposes. Sends the data calculated
- through a pipe.
test_size: size of the test set to use. If no size is given,
the last remaining molecules are used.
sigma: depth of the kernel.
- show_msgs: Show debug messages or not.
- NOTE: desc_type is just a string and is only for identification purposes.
+ identifier: string with the name of the descriptor used.
+ show_msgs: if debug messages should be shown.
+ NOTE: identifier is just a string and is only for identification purposes.
Also, training is done with the first part of the data and
- testing with the ending part of the data.
+ testing with the ending part of the data.
"""
tic = time.perf_counter()
# Initial calculations for later use.
- d_len = len(desc_data)
- e_len = len(energy_data)
+ data_size = descriptors.shape[0]
- if not desc_type:
- desc_type = 'NOT SPECIFIED'
+ if not identifier:
+ identifier = 'NOT SPECIFIED'
- if d_len != e_len:
- printc(''.join(['ERROR. Descriptor data size different ',
- 'than energy data size.']), 'RED')
- return None
+ if not data_size == energies.shape[0]:
+ raise ValueError('Energies size is different than descriptors size.')
- if training_size >= d_len:
- printc('ERROR. Training size greater or equal than data size.', 'RED')
- return None
+ if training_size >= data_size:
+ raise ValueError('Training size is greater or equal to the data size.')
+ # If test_size is not set, it is set to a maximum size of 1500.
if not test_size:
- test_size = d_len - training_size
+ test_size = data_size - training_size
if test_size > 1500:
test_size = 1500
if show_msgs:
- printc('{} ML started.'.format(desc_type), 'GREEN')
- printc('\tTraining size: {}'.format(training_size), 'CYAN')
- printc('\tTest size: {}'.format(test_size), 'CYAN')
- printc('\tSigma: {}'.format(sigma), 'CYAN')
+ printc(f'{identifier} ML started.', 'GREEN')
+ printc(f'\tTraining size: {training_size}', 'CYAN')
+ printc(f'\tTest size: {test_size}', 'CYAN')
+ printc(f'\tSigma: {test_size}', 'CYAN')
- X_training = desc_data[:training_size]
- Y_training = energy_data[:training_size]
- K_training = gauss_kernel(X_training, X_training, sigma)
- alpha_ = cholesky_solve(K_training, Y_training)
+ X_training = descriptors[:training_size]
+ Y_training = energies[:training_size]
+ K_training = gaussian_kernel(X_training, X_training, sigma)
+ alpha = cholesky_solve(K_training, Y_training)
- X_test = desc_data[-test_size:]
- Y_test = energy_data[-test_size:]
- K_test = gauss_kernel(X_test, X_training, sigma)
- Y_predicted = np.dot(K_test, alpha_)
+ X_test = descriptors[-test_size:]
+ Y_test = energies[-test_size:]
+ K_test = gaussian_kernel(X_test, X_training, sigma)
+ Y_predicted = np.dot(K_test, alpha)
mae = np.mean(np.abs(Y_predicted - Y_test))
if show_msgs:
- printc('\tMAE for {}: {:.4f}'.format(desc_type, mae), 'GREEN')
+ printc(f'\tMAE for {identifier}: {mae:.4f}', 'GREEN')
toc = time.perf_counter()
tictoc = toc - tic
if show_msgs:
- printc('\t{} ML took {:.4f} seconds.'.format(desc_type, tictoc),
- 'GREEN')
- printc('\t\tTraining size: {}'.format(training_size), 'CYAN')
- printc('\t\tTest size: {}'.format(test_size), 'CYAN')
- printc('\t\tSigma: {}'.format(sigma), 'CYAN')
-
- if pipe:
- pipe.send([desc_type, training_size, test_size, sigma, mae, tictoc])
+ printc(f'\t{identifier} ML took {tictoc:.4f} seconds.', 'GREEN')
+ printc(f'\t\tTraining size: {training_size}', 'CYAN')
+ printc(f'\t\tTest size: {test_size}', 'CYAN')
+ printc(f'\t\tSigma: {sigma}', 'CYAN')
return mae, tictoc
-def do_ml(min_training_size,
- max_training_size=None,
- training_increment_size=500,
- test_size=None,
- ljm_diag_value=None,
- ljm_sigma=1.0,
- ljm_epsilon=1.0,
+def do_ml(db_path='data',
+ is_shuffled=True,
r_seed=111,
- save_benchmarks=False,
- max_len=25,
+ diag_value=None,
+ lj_sigma=1.0,
+ lj_epsilon=1.0,
+ use_forces=False,
+ stuff='bonds',
+ size=23,
as_eig=True,
- bohr_radius_units=False,
+ bohr_ru=False,
+ training_size=1500,
+ test_size=None,
sigma=1000.0,
+ identifiers=["CM"],
show_msgs=True):
"""
Main function that does the whole ML process.
- min_training_size: minimum training size.
- max_training_size: maximum training size.
- training_increment_size: training increment size.
+ db_path: path to the database directory.
+ is_shuffled: if the resulting list of compounds should be shuffled.
+ r_seed: random seed to use for the shuffling.
+ diag_value: if special diagonal value is to be used.
+ lj_sigma: sigma value.
+ lj_epsilon: epsilon value.
+ use_forces: if the use of forces instead of k_cx should be used.
+ stuff: elements of the bag, by default the known bag of bonds.
+ size: compound size.
+ as_eig: if the representation should be as the eigenvalues.
+ bohr_ru: if radius units should be in bohr's radius units.
+ training_size: size of the training set to use.
test_size: size of the test set to use. If no size is given,
the last remaining molecules are used.
- ljm_diag_value: if a special diagonal value should be used in lj matrix.
- ljm_sigma: sigma value for lj matrix.
- ljm_epsilon: epsilon value for lj matrix.
- r_seed: random seed to use for the shuffling.
- save_benchmarks: if benchmarks should be saved.
- max_len: maximum amount of atoms in molecule.
- as_eig: if data should be returned as matrix or array of eigenvalues.
- bohr_radius_units: if units should be in bohr's radius units.
sigma: depth of the kernel.
- show_msgs: Show debug messages or not.
+ identifiers: list of names (strings) of descriptors to use.
+ show_msgs: if debug messages should be shown.
"""
- # Initialization time.
+ if type(identifiers) != list:
+ raise TypeError('\'identifiers\' is not a list.')
+
init_time = time.perf_counter()
- if not max_training_size:
- max_training_size = min_training_size + training_increment_size
# Data reading.
- molecules, nuclear_charge, energy_pbe0, energy_delta =\
- read_qm7_data(r_seed=r_seed)
+ tic = time.perf_counter()
+ compounds, energy_pbe0, energy_delta = qm7db(db_path=db_path,
+ is_shuffled=is_shuffled,
+ r_seed=r_seed)
+ toc = time.perf_counter()
+ tictoc = toc - tic
+ if show_msgs:
+ printc(f'Data reading took {tictoc:.4f} seconds.', 'CYAN')
# Matrices calculation.
- cm_data, ljm_data = parallel_create_matrices(molecules,
- nuclear_charge,
- ljm_diag_value,
- ljm_sigma,
- ljm_epsilon,
- max_len,
- as_eig,
- bohr_radius_units)
+ tic = time.perf_counter()
+ for compound in compounds:
+ if 'CM' in identifiers:
+ compound.gen_cm(size=size,
+ as_eig=as_eig,
+ bohr_ru=bohr_ru)
+ if 'LJM' in identifiers:
+ compound.gen_ljm(diag_value=diag_value,
+ sigma=lj_sigma,
+ epsilon=lj_epsilon,
+ size=size,
+ as_eig=as_eig,
+ bohr_ru=bohr_ru)
+ if 'AM' in identifiers:
+ compound.gen_am(use_forces=use_forces,
+ size=size,
+ bohr_ru=bohr_ru)
+ if 'BOS' in identifiers:
+ compound.gen_bos(size=size,
+ stuff=stuff)
+
+ # Create a numpy array for the descriptors.
+ if 'CM' in identifiers:
+ cm_data = np.array([comp.cm for comp in compounds], dtype=float)
+ if 'LJM' in identifiers:
+ ljm_data = np.array([comp.ljm for comp in compounds], dtype=float)
+ if 'AM' in identifiers:
+ am_data = np.array([comp.cm for comp in compounds], dtype=float)
+ if 'BOS' in identifiers:
+ bos_data = np.array([comp.bos for comp in compounds], dtype=float)
+
+ toc = time.perf_counter()
+ tictoc = toc - tic
+ if show_msgs:
+ printc(f'Matrices calculation took {tictoc:.4f} seconds.', 'CYAN')
# ML calculation.
- procs = []
- cm_pipes = []
- ljm_pipes = []
- for i in range(min_training_size,
- max_training_size + 1,
- training_increment_size):
- cm_recv, cm_send = Pipe(False)
- p1 = Process(target=ml,
- args=(cm_data,
- energy_pbe0,
- i,
- 'CM',
- cm_send,
- test_size,
- sigma,
- show_msgs))
- procs.append(p1)
- cm_pipes.append(cm_recv)
- p1.start()
-
- ljm_recv, ljm_send = Pipe(False)
- p2 = Process(target=ml,
- args=(ljm_data,
- energy_pbe0,
- i,
- 'L-JM',
- ljm_send,
- test_size,
- sigma,
- show_msgs))
- procs.append(p2)
- ljm_pipes.append(ljm_recv)
- p2.start()
-
- cm_bench_results = []
- ljm_bench_results = []
- for cd_pipe, ljd_pipe in zip(cm_pipes, ljm_pipes):
- cm_bench_results.append(cd_pipe.recv())
- ljm_bench_results.append(ljd_pipe.recv())
-
- for proc in procs:
- proc.join()
-
- if save_benchmarks:
- with open('data\\benchmarks.csv', 'a') as save_file:
- # save_file.write(''.join(['ml_type,tr_size,te_size,kernel_s,',
- # 'mae,time,lj_s,lj_e,date_ran\n']))
- ltime = time.localtime()[:3][::-1]
- ljm_se = ',' + str(ljm_sigma) + ',' + str(ljm_epsilon) + ','
- date = '/'.join([str(field) for field in ltime])
- for cm, ljm, in zip(cm_bench_results, ljm_bench_results):
- cm_text = ','.join([str(field) for field in cm])\
- + ',' + date + '\n'
- ljm_text = ','.join([str(field) for field in ljm])\
- + ljm_se + date + '\n'
- save_file.write(cm_text)
- save_file.write(ljm_text)
+ if 'CM' in identifiers:
+ cm_mae, cm_tictoc = simple_ml(cm_data,
+ energy_pbe0,
+ training_size=training_size,
+ test_size=test_size,
+ sigma=sigma,
+ identifier='CM',
+ show_msgs=show_msgs)
+ if 'LJM' in identifiers:
+ ljm_mae, ljm_tictoc = simple_ml(ljm_data,
+ energy_pbe0,
+ training_size=training_size,
+ test_size=test_size,
+ sigma=sigma,
+ identifier='LJM',
+ show_msgs=show_msgs)
+ if 'AM' in identifiers:
+ am_mae, am_tictoc = simple_ml(am_data,
+ energy_pbe0,
+ training_size=training_size,
+ test_size=test_size,
+ sigma=sigma,
+ identifier='CM',
+ show_msgs=show_msgs)
+ if 'BOS' in identifiers:
+ bos_mae, bos_tictoc = simple_ml(bos_data,
+ energy_pbe0,
+ training_size=training_size,
+ test_size=test_size,
+ sigma=sigma,
+ identifier='CM',
+ show_msgs=show_msgs)
# End of program
end_time = time.perf_counter()
- printc('Program took {:.4f} seconds.'.format(end_time - init_time),
- 'CYAN')
+ totaltime = end_time - init_time
+ printc(f'Program took {totaltime:.4f} seconds.', 'CYAN')
diff --git a/ml_exp/gauss_kernel.py b/ml_exp/kernels.py
index 834d62408..3914ffc20 100644
--- a/ml_exp/gauss_kernel.py
+++ b/ml_exp/kernels.py
@@ -22,27 +22,23 @@ SOFTWARE.
"""
import math
import numpy as np
-from ml_exp.frob_norm import frob_norm
-def gauss_kernel(X_1, X_2, sigma):
+def gaussian_kernel(X1,
+ X2,
+ sigma):
"""
Calculates the Gaussian Kernel.
- X_1: first representations.
- X_2: second representations.
+ X1: first representations.
+ X2: second representations.
sigma: kernel width.
"""
- x1_l = len(X_1)
- x1_range = range(x1_l)
- x2_l = len(X_2)
- x2_range = range(x2_l)
-
inv_sigma = -0.5 / (sigma*sigma)
- K = np.zeros((x1_l, x2_l))
- for i in x1_range:
- for j in x2_range:
- f_norm = frob_norm(X_1[i] - X_2[j])
+ K = np.zeros((X1.shape[0], X2.shape[0]), dtype=float)
+ for i, x1 in enumerate(X1):
+ for j, x2 in enumerate(X2):
+ f_norm = np.linalg.norm(x1 - x2)
# print(f_norm)
K[i, j] = math.exp(inv_sigma * f_norm)
diff --git a/ml_exp/lj_matrix.py b/ml_exp/lj_matrix.py
deleted file mode 100644
index 549790755..000000000
--- a/ml_exp/lj_matrix.py
+++ /dev/null
@@ -1,222 +0,0 @@
-"""MIT License
-
-Copyright (c) 2019 David Luevano Alvarado
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-"""
-import time
-import math
-import numpy as np
-from numpy.linalg import eig
-from ml_exp.misc import printc
-
-
-def lj_matrix(mol_data,
- nc_data,
- diag_value=None,
- sigma=1.0,
- epsilon=1.0,
- max_len=25,
- as_eig=True,
- bohr_radius_units=False):
- """
- Creates the Lennard-Jones Matrix from the molecule data given.
- mol_data: molecule data, matrix of atom coordinates.
- nc_data: nuclear charge data, array of atom data.
- diag_value: if special diagonal value is to be used.
- sigma: sigma value.
- epsilon: epsilon value.
- max_len: maximum amount of atoms in molecule.
- as_eig: if data should be returned as matrix or array of eigenvalues.
- bohr_radius_units: if units should be in bohr's radius units.
- """
- if bohr_radius_units:
- conversion_rate = 0.52917721067
- else:
- conversion_rate = 1
-
- mol_n = len(mol_data)
- mol_nr = range(mol_n)
-
- if not mol_n == len(nc_data):
- print(''.join(['Error. Molecule matrix dimension is different ',
- 'than the nuclear charge array dimension.']))
- else:
- if max_len < mol_n:
- print(''.join(['Error. Molecule matrix dimension (mol_n) is ',
- 'greater than max_len. Using mol_n.']))
- max_len = None
-
- if max_len:
- lj = np.zeros((max_len, max_len))
- ml_r = range(max_len)
-
- # Actual calculation of the coulomb matrix.
- for i in ml_r:
- if i < mol_n:
- x_i = mol_data[i, 0]
- y_i = mol_data[i, 1]
- z_i = mol_data[i, 2]
- Z_i = nc_data[i]
- else:
- break
-
- for j in ml_r:
- if j < mol_n:
- x_j = mol_data[j, 0]
- y_j = mol_data[j, 1]
- z_j = mol_data[j, 2]
-
- x = (x_i-x_j)**2
- y = (y_i-y_j)**2
- z = (z_i-z_j)**2
-
- if i == j:
- if diag_value is None:
- lj[i, j] = (0.5*Z_i**2.4)
- else:
- lj[i, j] = diag_value
- else:
- # Calculations are done after i==j is checked
- # so no division by zero is done.
-
- # A little play with r exponents
- # so no square root is calculated.
- # Conversion factor is included in r^2.
-
- # 1/r^2
- r_2 = sigma**2/(conversion_rate**2*(x + y + z))
-
- r_6 = math.pow(r_2, 3)
- r_12 = math.pow(r_6, 2)
- lj[i, j] = (4*epsilon*(r_12 - r_6))
- else:
- break
-
- # Now the value will be returned.
- if as_eig:
- lj_sorted = np.sort(eig(lj)[0])[::-1]
- # Thanks to SO for the following lines of code.
- # https://stackoverflow.com/a/43011036
-
- # Keep zeros at the end.
- mask = lj_sorted != 0.
- f_mask = mask.sum(0, keepdims=1) >\
- np.arange(lj_sorted.shape[0]-1, -1, -1)
-
- f_mask = f_mask[::-1]
- lj_sorted[f_mask] = lj_sorted[mask]
- lj_sorted[~f_mask] = 0.
-
- return lj_sorted
-
- else:
- return lj
-
- else:
- lj_temp = []
- # Actual calculation of the coulomb matrix.
- for i in mol_nr:
- x_i = mol_data[i, 0]
- y_i = mol_data[i, 1]
- z_i = mol_data[i, 2]
- Z_i = nc_data[i]
-
- lj_row = []
- for j in mol_nr:
- x_j = mol_data[j, 0]
- y_j = mol_data[j, 1]
- z_j = mol_data[j, 2]
-
- x = (x_i-x_j)**2
- y = (y_i-y_j)**2
- z = (z_i-z_j)**2
-
- if i == j:
- if not diag_value:
- lj_row.append(0.5*Z_i**2.4)
- else:
- lj_row.append(diag_value)
- else:
- # Calculations are done after i==j is checked
- # so no division by zero is done.
-
- # A little play with r exponents
- # so no square root is calculated.
- # Conversion factor is included in r^2.
-
- # 1/r^2
- r_2 = sigma**2/(conversion_rate**2*(x + y + z))
-
- r_6 = math.pow(r_2, 3)
- r_12 = math.pow(r_6, 2)
- lj_row.append(4*epsilon*(r_12 - r_6))
-
- lj_temp.append(np.array(lj_row))
-
- lj = np.array(lj_temp)
- # Now the value will be returned.
- if as_eig:
- return np.sort(eig(lj)[0])[::-1]
- else:
- return lj
-
-
-def lj_matrix_multiple(mol_data,
- nc_data,
- pipe=None,
- diag_value=None,
- sigma=1.0,
- epsilon=1.0,
- max_len=25,
- as_eig=True,
- bohr_radius_units=False):
- """
- Calculates the Lennard-Jones Matrix of multiple molecules.
- mol_data: molecule data, matrix of atom coordinates.
- nc_data: nuclear charge data, array of atom data.
- pipe: for multiprocessing purposes. Sends the data calculated
- through a pipe.
- diag_value: if special diagonal value is to be used.
- sigma: sigma value.
- epsilon: epsilon value.
- max_len: maximum amount of atoms in molecule.
- as_eig: if data should be returned as matrix or array of eigenvalues.
- bohr_radius_units: if units should be in bohr's radius units.
- """
- printc('L-J Matrices calculation started.', 'CYAN')
- tic = time.perf_counter()
-
- ljm_data = np.array([lj_matrix(mol,
- nc,
- diag_value,
- sigma,
- epsilon,
- max_len,
- as_eig,
- bohr_radius_units)
- for mol, nc in zip(mol_data, nc_data)])
-
- toc = time.perf_counter()
- printc('\tL-JM calculation took {:.4f} seconds.'.format(toc-tic), 'GREEN')
-
- if pipe:
- pipe.send(ljm_data)
-
- return ljm_data
diff --git a/ml_exp/cholesky_solve.py b/ml_exp/math.py
index bc6a572a3..da1fdf08f 100644
--- a/ml_exp/cholesky_solve.py
+++ b/ml_exp/math.py
@@ -21,26 +21,25 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
import numpy as np
-from numpy.linalg import cholesky
-def cholesky_solve(K, y):
+def cholesky_solve(K,
+ y):
"""
- Applies Cholesky decomposition to obtain the 'alpha coeficients'.
+ Applies Cholesky decomposition to solve Ka=y. Where a are the alpha
+ coeficients.
K: kernel.
y: known parameters.
"""
- # The initial mathematical problem is to solve Ka=y.
-
- # First, add a small lambda value.
+ # Add a small lambda value.
K[np.diag_indices_from(K)] += 1e-8
# Get the Cholesky decomposition of the kernel.
- L = cholesky(K)
- size = len(L)
+ L = np.linalg.cholesky(K)
+ size = K.shape[0]
# Solve Lx=y for x.
- x = np.zeros(size)
+ x = np.zeros(size, dtype=float)
x[0] = y[0] / L[0, 0]
for i in range(1, size):
temp_sum = 0.0
@@ -50,12 +49,9 @@ def cholesky_solve(K, y):
# Now, solve LTa=x for a.
L2 = L.T
- a = np.zeros(size)
- a_ms = size - 1
- a[a_ms] = x[a_ms] / L2[a_ms, a_ms]
- # Because of the form of L2 (upper triangular matriz), an inversion of
- # range() needs to be done.
- for i in range(0, a_ms)[::-1]:
+ a = np.zeros(size, dtype=float)
+ a[size - 1] = x[size - 1] / L2[size - 1, size - 1]
+ for i in range(0, size - 1)[::-1]:
temp_sum = 0.0
for j in range(i, size)[::-1]:
temp_sum += L2[i, j] * a[j]
diff --git a/ml_exp/misc.py b/ml_exp/misc.py
index e9142b05f..4bd68eefc 100644
--- a/ml_exp/misc.py
+++ b/ml_exp/misc.py
@@ -29,29 +29,25 @@ init()
def printc(text, color):
"""
Prints texts normaly, but in color. Using colorama.
- text: string with the text to print.
+ text: text to print.
color: color to be used, same as available in colorama.
"""
- color_dic = {'BLACK': Fore.BLACK,
- 'RED': Fore.RED,
- 'GREEN': Fore.GREEN,
- 'YELLOW': Fore.YELLOW,
- 'BLUE': Fore.BLUE,
- 'MAGENTA': Fore.MAGENTA,
- 'CYAN': Fore.CYAN,
- 'WHITE': Fore.WHITE,
- 'RESET': Fore.RESET}
-
- color_dic_keys = color_dic.keys()
- if color not in color_dic_keys:
- print(Fore.RED
- + '\'{}\' not found, using default color.'.format(color)
- + Style.RESET_ALL)
- actual_color = Fore.RESET
- else:
- actual_color = color_dic[color]
-
- print(actual_color + text + Style.RESET_ALL)
+ color = color.upper()
+ colors = {'BLACK': Fore.BLACK,
+ 'RED': Fore.RED,
+ 'GREEN': Fore.GREEN,
+ 'YELLOW': Fore.YELLOW,
+ 'BLUE': Fore.BLUE,
+ 'MAGENTA': Fore.MAGENTA,
+ 'CYAN': Fore.CYAN,
+ 'WHITE': Fore.WHITE,
+ 'RESET': Fore.RESET}
+
+ av_colors = colors.keys()
+ if color not in av_colors:
+ raise KeyError(f'{color} not found.')
+
+ print(colors[color] + text + Style.RESET_ALL)
def plot_benchmarks():
diff --git a/ml_exp/parallel_create_matrices.py b/ml_exp/parallel_create_matrices.py
deleted file mode 100644
index a68122d48..000000000
--- a/ml_exp/parallel_create_matrices.py
+++ /dev/null
@@ -1,85 +0,0 @@
-"""MIT License
-
-Copyright (c) 2019 David Luevano Alvarado
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-"""
-from multiprocessing import Process, Pipe
-from ml_exp.c_matrix import c_matrix_multiple
-from ml_exp.lj_matrix import lj_matrix_multiple
-
-
-def parallel_create_matrices(mol_data,
- nc_data,
- ljm_diag_value=None,
- ljm_sigma=1.0,
- ljm_epsilon=1.0,
- max_len=25,
- as_eig=True,
- bohr_radius_units=False):
- """
- Creates the Coulomb and L-J matrices in parallel.
- mol_data: molecule data, matrix of atom coordinates.
- nc_data: nuclear charge data, array of atom data.
- ljm_diag_value: if special diagonal value is to be used for lj matrix.
- ljm_sigma: sigma value for lj matrix.
- ljm_epsilon: psilon value for lj matrix.
- max_len: maximum amount of atoms in molecule.
- as_eig: if data should be returned as matrix or array of eigenvalues.
- bohr_radius_units: if units should be in bohr's radius units.
- """
-
- # Matrices calculation.
- procs = []
- pipes = []
-
- cm_recv, cm_send = Pipe(False)
- p1 = Process(target=c_matrix_multiple,
- args=(mol_data,
- nc_data,
- cm_send,
- max_len,
- as_eig,
- bohr_radius_units))
- procs.append(p1)
- pipes.append(cm_recv)
- p1.start()
-
- ljm_recv, ljm_send = Pipe(False)
- p2 = Process(target=lj_matrix_multiple,
- args=(mol_data,
- nc_data,
- ljm_send,
- ljm_diag_value,
- ljm_sigma,
- ljm_epsilon,
- max_len,
- as_eig,
- bohr_radius_units))
- procs.append(p2)
- pipes.append(ljm_recv)
- p2.start()
-
- cm_data = pipes[0].recv()
- ljm_data = pipes[1].recv()
-
- for proc in procs:
- proc.join()
-
- return cm_data, ljm_data
diff --git a/ml_exp/frob_norm.py b/ml_exp/qm7db.py
index 4c3a2945d..3fd46b6bc 100644
--- a/ml_exp/frob_norm.py
+++ b/ml_exp/qm7db.py
@@ -20,32 +20,36 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
-import math
+from ml_exp.compound import Compound
+import numpy as np
+import random
-def frob_norm(array):
+def qm7db(db_path='data',
+ is_shuffled=True,
+ r_seed=111):
"""
- Calculates the frobenius norm of a given array or matrix.
- array: array of data.
+ Creates a list of compounds with the qm7 database.
+ db_path: path to the database directory.
+ is_shuffled: if the resulting list of compounds should be shuffled.
+ r_seed: random seed to use for the shuffling.
"""
+ fname = f'{db_path}/hof_qm7.txt'
+ with open(fname, 'r') as f:
+ lines = f.readlines()
- arr_sh_len = len(array.shape)
- arr_range = range(len(array))
- fn = 0.0
+ compounds = []
+ for i, line in enumerate(lines):
+ line = line.split()
+ compounds.append(Compound(f'{db_path}/{line[0]}'))
+ compounds[i].pbe0 = float(line[1])
+ compounds[i].delta = float(line[1]) - float(line[2])
- # If it is a 'vector'.
- if arr_sh_len == 1:
- for i in arr_range:
- fn += array[i]*array[i]
+ if is_shuffled:
+ random.seed(r_seed)
+ random.shuffle(compounds)
- return math.sqrt(fn)
+ e_pbe0 = np.array([compound.pbe0 for compound in compounds], dtype=float)
+ e_delta = np.array([compound.delta for compound in compounds], dtype=float)
- # If it is a matrix.
- elif arr_sh_len == 2:
- for i in arr_range:
- for j in arr_range:
- fn += array[i, j]*array[i, j]
-
- return math.sqrt(fn)
- else:
- print('Error. Array size greater than 2 ({}).'.format(arr_sh_len))
+ return compounds, e_pbe0, e_delta
diff --git a/ml_exp/read_qm7_data.py b/ml_exp/read_qm7_data.py
deleted file mode 100644
index 81be05a17..000000000
--- a/ml_exp/read_qm7_data.py
+++ /dev/null
@@ -1,162 +0,0 @@
-"""MIT License
-
-Copyright (c) 2019 David Luevano Alvarado
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-"""
-import os
-import time
-import numpy as np
-import random
-from ml_exp.misc import printc
-
-
-# 'periodic_table_of_elements.txt' retrieved from
-# https://gist.github.com/GoodmanSciences/c2dd862cd38f21b0ad36b8f96b4bf1ee
-def read_nc_data(data_path):
- """
- Reads nuclear charge data from file and returns a dictionary.
- data_path: path to the data directory.
- """
- fname = 'periodic_table_of_elements.txt'
- with open(''.join([data_path, '/', fname]), 'r') as infile:
- temp_lines = infile.readlines()
-
- del temp_lines[0]
-
- lines = []
- for temp_line in temp_lines:
- new_line = temp_line.split(sep=',')
- lines.append(new_line)
-
- # Dictionary of nuclear charge.
- return {line[2]: int(line[0]) for line in lines}
-
-
-# 'hof_qm7.txt.txt' retrieved from
-# https://github.com/qmlcode/tutorial
-def read_db_data(zi_data,
- data_path,
- r_seed=111,
- return_atoms=False):
- """
- Reads molecule database and extracts
- its contents as usable variables.
- zi_data: dictionary containing nuclear charge data.
- data_path: path to the data directory.
- r_seed: random seed to use for the shuffling.
- return_atoms: if atom list should be returned.
- """
- os.chdir(data_path)
-
- fname = 'hof_qm7.txt'
- with open(fname, 'r') as infile:
- lines = infile.readlines()
-
- # Temporary energy dictionary.
- energy_temp = dict()
-
- for line in lines:
- xyz_data = line.split()
-
- xyz_name = xyz_data[0]
- hof = float(xyz_data[1])
- dftb = float(xyz_data[2])
- # print(xyz_name, hof, dftb)
-
- energy_temp[xyz_name] = np.array([hof, hof - dftb])
-
- # Use a random seed.
- random.seed(r_seed)
-
- et_keys = list(energy_temp.keys())
- random.shuffle(et_keys)
-
- # Temporary energy dictionary, shuffled.
- energy_temp_shuffled = dict()
- for key in et_keys:
- energy_temp_shuffled.update({key: energy_temp[key]})
-
- mol_data = []
- mol_nc_data = []
- atoms = []
- # Actual reading of the xyz files.
- for i, k in enumerate(energy_temp_shuffled.keys()):
- with open(k, 'r') as xyz_file:
- lines = xyz_file.readlines()
-
- len_lines = len(lines)
- mol_temp_data = []
- mol_nc_temp_data = np.array(np.zeros(len_lines-2))
- atoms_temp = []
- for j, line in enumerate(lines[2:len_lines]):
- line_list = line.split()
-
- atoms_temp.append(line_list[0])
- mol_nc_temp_data[j] = float(zi_data[line_list[0]])
- line_data = np.array(np.asarray(line_list[1:4], dtype=float))
- mol_temp_data.append(line_data)
-
- mol_data.append(mol_temp_data)
- mol_nc_data.append(mol_nc_temp_data)
- atoms.append(atoms_temp)
-
- # Convert everything to a numpy array.
- molecules = np.array([np.array(mol) for mol in mol_data])
- nuclear_charge = np.array([nc_d for nc_d in mol_nc_data])
- energy_pbe0 = np.array([energy_temp_shuffled[k][0]
- for k in energy_temp_shuffled.keys()])
- energy_delta = np.array([energy_temp_shuffled[k][1]
- for k in energy_temp_shuffled.keys()])
-
- if return_atoms:
- return molecules, nuclear_charge, energy_pbe0, energy_delta, atoms
- return molecules, nuclear_charge, energy_pbe0, energy_delta
-
-
-def read_qm7_data(data_path='data',
- r_seed=111,
- return_atoms=False):
- """
- Reads all the qm7 data.
- data_path: path to the data directory.
- r_seed: random seed to use for the shuffling.
- return_atoms: if atom list should be returned.
- """
- tic = time.perf_counter()
- printc('Data reading started.', 'CYAN')
-
- init_path = os.getcwd()
- os.chdir(data_path)
- data_path = os.getcwd()
-
- zi_data = read_nc_data(data_path)
- if return_atoms:
- molecules, nuclear_charge, energy_pbe0, energy_delta, atoms = \
- read_db_data(zi_data, data_path, r_seed, return_atoms)
- else:
- molecules, nuclear_charge, energy_pbe0, energy_delta = \
- read_db_data(zi_data, data_path, r_seed)
-
- os.chdir(init_path)
- toc = time.perf_counter()
- printc('\tData reading took {:.4f} seconds.'.format(toc-tic), 'GREEN')
- if return_atoms:
- return molecules, nuclear_charge, energy_pbe0, energy_delta, atoms
- return molecules, nuclear_charge, energy_pbe0, energy_delta
diff --git a/ml_exp/representations.py b/ml_exp/representations.py
new file mode 100644
index 000000000..ea079595e
--- /dev/null
+++ b/ml_exp/representations.py
@@ -0,0 +1,384 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+import numpy as np
+from collections import Counter
+
+
+def coulomb_matrix(coords,
+ nc,
+ size=23,
+ as_eig=True,
+ bohr_ru=False):
+ """
+ Creates the Coulomb Matrix from the molecule data given.
+ coords: compound coordinates.
+ nc: nuclear charge data.
+ size: compound size.
+ as_eig: if the representation should be as the eigenvalues.
+ bohr_ru: if radius units should be in bohr's radius units.
+ """
+ if bohr_ru:
+ cr = 0.52917721067
+ else:
+ cr = 1.0
+
+ n = coords.shape[0]
+
+ if not n == nc.shape[0]:
+ raise ValueError('Compound size is different than the nuclear charge \
+size. Arrays are not of the right shape.')
+
+ if size < n:
+ print('Error. Compound size (n) is greater han (size). Using (n)',
+ 'instead of (size).')
+ size = n
+
+ cm = np.zeros((size, size), dtype=float)
+
+ # Actual calculation of the coulomb matrix.
+ for i, xyz_i in enumerate(coords):
+ for j, xyz_j in enumerate(coords):
+ rv = xyz_i - xyz_j
+ r = np.linalg.norm(rv)/cr
+ if i == j:
+ cm[i, j] = (0.5*nc[i]**2.4)
+ else:
+ cm[i, j] = nc[i]*nc[j]/r
+
+ # Now the value will be returned.
+ if as_eig:
+ cm_sorted = np.sort(np.linalg.eig(cm)[0])[::-1]
+ # Thanks to SO for the following lines of code.
+ # https://stackoverflow.com/a/43011036
+
+ # Keep zeros at the end.
+ mask = cm_sorted != 0.
+ f_mask = mask.sum(0, keepdims=1) >\
+ np.arange(cm_sorted.shape[0]-1, -1, -1)
+
+ f_mask = f_mask[::-1]
+ cm_sorted[f_mask] = cm_sorted[mask]
+ cm_sorted[~f_mask] = 0.
+
+ return cm_sorted
+ else:
+ return cm
+
+
+def lennard_jones_matrix(coords,
+ nc,
+ diag_value=None,
+ sigma=1.0,
+ epsilon=1.0,
+ size=23,
+ as_eig=True,
+ bohr_ru=False):
+ """
+ Creates the Lennard-Jones Matrix from the molecule data given.
+ coords: compound coordinates.
+ nc: nuclear charge data.
+ diag_value: if special diagonal value is to be used.
+ sigma: sigma value.
+ epsilon: epsilon value.
+ size: compound size.
+ as_eig: if the representation should be as the eigenvalues.
+ bohr_ru: if radius units should be in bohr's radius units.
+ """
+ if bohr_ru:
+ cr = 0.52917721067
+ else:
+ cr = 1.0
+
+ n = coords.shape[0]
+
+ if not n == nc.shape[0]:
+ raise ValueError('Compound size is different than the nuclear charge \
+size. Arrays are not of the right shape.')
+
+ if size < n:
+ print('Error. Compound size (n) is greater han (size). Using (n)',
+ 'instead of (size).')
+ size = n
+
+ lj = np.zeros((size, size), dtype=float)
+
+ # Actual calculation of the lennard-jones matrix.
+ for i, xyz_i in enumerate(coords):
+ for j, xyz_j in enumerate(coords):
+ if i == j:
+ if diag_value is None:
+ lj[i, j] = (0.5*nc[i]**2.4)
+ else:
+ lj[i, j] = diag_value
+ else:
+ # Calculations are done after i==j is checked
+ # so no division by zero is done.
+
+ # A little play with r exponents
+ # so no square root is calculated.
+ # Conversion factor is included in r^2.
+ rv = xyz_i - xyz_j
+ r = np.linalg.norm(rv)/cr
+
+ # 1/r^n
+ r_2 = sigma**2/r**2
+ r_6 = r_2**3
+ r_12 = r_6**2
+ lj[i, j] = (4*epsilon*(r_12 - r_6))
+
+ # Now the value will be returned.
+ if as_eig:
+ lj_sorted = np.sort(np.linalg.eig(lj)[0])[::-1]
+ # Thanks to SO for the following lines of code.
+ # https://stackoverflow.com/a/43011036
+
+ # Keep zeros at the end.
+ mask = lj_sorted != 0.
+ f_mask = mask.sum(0, keepdims=1) >\
+ np.arange(lj_sorted.shape[0]-1, -1, -1)
+
+ f_mask = f_mask[::-1]
+ lj_sorted[f_mask] = lj_sorted[mask]
+ lj_sorted[~f_mask] = 0.
+
+ return lj_sorted
+ else:
+ return lj
+
+
+def first_neighbor_matrix(coords,
+ nc,
+ atoms,
+ size=23,
+ use_forces=False,
+ bohr_ru=False):
+ """
+ Creates the First Neighbor Matrix from the molecule data given.
+ coords: compound coordinates.
+ nc: nuclear charge data.
+ atoms: list of atoms.
+ use_forces: if the use of forces instead of k_cx should be used.
+ bohr_ru: if radius units should be in bohr's radius units.
+ NOTE: Bond distance of carbon to other elements
+ are (for atoms present in the qm7 dataset):
+ C: 1.20 - 1.54 A (Edited to 1.19 - 1.54 A)
+ H: 1.06 - 1.12 A
+ O: 1.43 - 2.15 A
+ N: 1.47 - 2.10 A
+ S: 1.81 - 2.55 A
+ """
+ if bohr_ru:
+ cr = 0.52917721067
+ else:
+ cr = 1.0
+
+ n = coords.shape[0]
+
+ if not n == nc.shape[0]:
+ raise ValueError('Compound size is different than the nuclear charge \
+size. Arrays are not of the right shape.')
+
+ if size < n:
+ print('Error. Compound size (n) is greater han (size). Using (n)',
+ 'instead of (size).')
+ size = n
+
+ # Possible bonds.
+ cc_bond = sorted(['C', 'C'])
+ ch_bond = sorted(['C', 'H'])
+ co_bond = sorted(['C', 'O'])
+ cn_bond = sorted(['C', 'N'])
+ cs_bond = sorted(['C', 'S'])
+
+ fnm = np.zeros((size, size), dtype=float)
+
+ bonds = []
+ forces = []
+ for i, xyz_i in enumerate(coords):
+ for j, xyz_j in enumerate(coords):
+ # Ignore the diagonal.
+ if i != j:
+ bond = sorted([atoms[i], atoms[j]])
+ rv = xyz_i - xyz_j
+ r = np.linalg.norm(rv)/cr
+
+ # Check for each type of bond.
+ if (cc_bond == bond) and (r >= 1.19 and r <= 1.54):
+ fnm[i, j] = 1.0
+ if j > i:
+ bonds.append((i, j))
+ if use_forces:
+ forces.append(rv*nc[i]*nc[j]/r**3)
+ elif (ch_bond == bond) and (r >= 1.06 and r <= 1.12):
+ fnm[i, j] = 1.0
+ if j > i:
+ bonds.append((i, j))
+ if use_forces:
+ forces.append(rv*nc[i]*nc[j]/r**3)
+ elif (co_bond == bond) and (r >= 1.43 and r <= 2.15):
+ fnm[i, j] = 0.8
+ if j > i:
+ bonds.append((i, j))
+ if use_forces:
+ forces.append(rv*nc[i]*nc[j]/r**3)
+ elif (cn_bond == bond) and (r >= 1.47 and r <= 2.10):
+ fnm[i, j] = 1.0
+ if j > i:
+ bonds.append((i, j))
+ if use_forces:
+ forces.append(rv*nc[i]*nc[j]/r**3)
+ elif (cs_bond == bond) and (r >= 1.81 and r <= 2.55):
+ fnm[i, j] = 0.7
+ if j > i:
+ bonds.append((i, j))
+ if use_forces:
+ forces.append(rv*nc[i]*nc[j]/r**3)
+
+ return fnm, bonds, forces
+
+
+def adjacency_matrix(fnm,
+ bonds,
+ forces,
+ size=22):
+ """
+ Calculates the adjacency matrix given the bond list.
+ fnm: first neighbour matrix.
+ bonds: list of bonds (tuple of indexes).
+ forces: list of forces.
+ size: compund size.
+ """
+ n = len(bonds)
+
+ if size < n:
+ print('Error. Compound size (n) is greater han (size). Using (n)\
+ instead of (size).')
+ size = n
+
+ am = np.zeros((size, size), dtype=float)
+
+ for i, bond_i in enumerate(bonds):
+ for j, bond_j in enumerate(bonds):
+ # Ignore the diagonal.
+ if i != j:
+ if (bond_i[0] in bond_j) or (bond_i[1] in bond_j):
+ if forces:
+ am[i, j] = np.dot(forces[i], forces[j])
+ else:
+ am[i, j] = fnm[bond_i[0], bond_i[1]]
+
+ return am
+
+
+def check_bond(bags,
+ bond):
+ """
+ Checks if a bond is in a bag.
+ bags: list of bags, containing a bag per entry, which in turn
+ contains a list of bond-values.
+ bond: bond to check.
+ """
+ if bags == []:
+ return False, None
+
+ for i, bag in enumerate(bags):
+ if bag[0] == bond:
+ return True, i
+
+ return False, None
+
+
+def bag_of_stuff(cm,
+ atoms,
+ size=23,
+ stuff='bonds'):
+ """
+ Creates the Bag of Bonds using the Coulomb Matrix.
+ cm: coulomb matrix.
+ atoms: list of atoms.
+ size: compound size.
+ """
+ if cm is None:
+ raise ValueError('Coulomb Matrix hasn\'t been initialized for the \
+current compound.')
+
+ if cm.ndim == 1:
+ raise ValueError('Coulomb Matrix (CM) dimension is 1. Maybe it was \
+generated as the vector of eigenvalues, try (re-)generating the CM.')
+
+ n = len(atoms)
+
+ if size < n:
+ print('Error. Compound size (n) is greater han (size). Using (n)',
+ 'instead of (size).')
+ size = n
+
+ # Bond max length, calculated using only the upper triangular matrix.
+ bond_size = int((size * size - size)/2 + size)
+
+ # List where each bag data is stored.
+ bags = []
+ for i, atom_i in enumerate(atoms):
+ for j, atom_j in enumerate(atoms):
+ # Work only in the upper triangle of the coulomb matrix.
+ if j >= i:
+ # Get the string of the current bond.
+ if i == j:
+ current_bond = atom_i
+ else:
+ current_bond = ''.join(sorted([atom_i, atom_j]))
+
+ # Check if that bond is already in a bag.
+ checker = check_bond(bags, current_bond)
+ # Either create a new bag or add values to an existing one.
+ if not checker[0]:
+ bags.append([current_bond, cm[i, j]])
+ else:
+ bags[checker[1]].append(cm[i, j])
+
+ # Create the actual bond list ordered.
+ atom_counter = Counter(atoms)
+ atom_list = sorted(list(set(atoms)))
+ bonds = []
+ for i, a_i in enumerate(atom_list):
+ if atom_counter[a_i] > 1:
+ for a_j in atom_list[i:]:
+ bonds.append(''.join(sorted([a_i, a_j])))
+ bonds = atom_list + bonds
+
+ # Create the final vector for the bos.
+ bos = np.zeros(bond_size, dtype=float)
+ c_i = 0
+ for i, bond in enumerate(bonds):
+ checker = check_bond(bags, bond)
+ if checker[0]:
+ for j, num in enumerate(sorted(bags[checker[1]][1:])[::-1]):
+ # Use c_i as the index for bos if the zero padding should
+ # be at the end of the vector instead of between each bond.
+ bos[i*size + j] = num
+ c_i += 1
+ else:
+ print(f'Error. Bond {bond} from bond list coudn\'t be found',
+ 'in the bags list. This could be a case where the atom',
+ 'is only present oncce in the molecule.')
+ return bos
diff --git a/requirements.txt b/requirements.txt
index 437e0428a..4b242770c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,3 @@
colorama==0.4.3
numpy==1.18.1
-pandas==0.25.3 \ No newline at end of file
+pandas==1.0.1