summaryrefslogtreecommitdiff
path: root/ml_exp/krr.py
diff options
context:
space:
mode:
Diffstat (limited to 'ml_exp/krr.py')
-rw-r--r--ml_exp/krr.py319
1 files changed, 319 insertions, 0 deletions
diff --git a/ml_exp/krr.py b/ml_exp/krr.py
new file mode 100644
index 000000000..dca1e14e6
--- /dev/null
+++ b/ml_exp/krr.py
@@ -0,0 +1,319 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+import time
+import numpy as np
+from scipy import linalg as LA
+try:
+ import tensorflow as tf
+ TF_AV = True
+except ImportError:
+ print('Tensorflow couldn\'t be imported. Maybe it is not installed.')
+ TF_AV = False
+from ml_exp.misc import printc
+from ml_exp.kernels import laplauss_kernel
+from ml_exp.qm7db import qm7db
+
+
+def krr(descriptors,
+ energies,
+ training_size=1500,
+ test_size=None,
+ sigma=1000.0,
+ opt=True,
+ identifier=None,
+ laplauss='gauss',
+ use_tf=True,
+ show_msgs=True):
+ """
+ Basic krr methodology for a single descriptor type.
+ descriptors: array of descriptors.
+ energies: array of energies.
+ training_size: size of the training set to use.
+ test_size: size of the test set to use. If no size is given,
+ the last remaining molecules are used.
+ sigma: depth of the kernel.
+ opt: if the optimized algorithm should be used. For benchmarking purposes.
+ identifier: string with the name of the descriptor used.
+ use_tf: if tensorflow should be used.
+ show_msgs: if debug messages should be shown.
+ NOTE: identifier is just a string and is only for identification purposes.
+ Also, training is done with the first part of the data and
+ testing with the ending part of the data.
+ """
+ tic = time.perf_counter()
+ # Initial calculations for later use.
+ data_size = descriptors.shape[0]
+
+ if not identifier:
+ identifier = 'NOT SPECIFIED'
+
+ if not data_size == energies.shape[0]:
+ raise ValueError('Energies size is different than descriptors size.')
+
+ if training_size >= data_size:
+ raise ValueError('Training size is greater or equal to the data size.')
+
+ # If tf is to be used but couldn't be imported, don't try to use it.
+ if use_tf and not TF_AV:
+ use_tf = False
+
+ # If test_size is not set, it is set to a maximum size of 1500.
+ if not test_size:
+ test_size = data_size - training_size
+ if test_size > 1500:
+ test_size = 1500
+
+ if show_msgs:
+ printc(f'{identifier} ML started.', 'GREEN')
+ printc(f'\tTraining size: {training_size}', 'CYAN')
+ printc(f'\tTest size: {test_size}', 'CYAN')
+ printc(f'\tSigma: {test_size}', 'CYAN')
+
+ if use_tf:
+ if tf.config.experimental.list_physical_devices('GPU'):
+ with tf.device('GPU:0'):
+ X_tr = descriptors[:training_size]
+ Y_tr = energies[:training_size]
+ K_tr = laplauss_kernel(X_tr,
+ X_tr,
+ sigma,
+ laplauss=laplauss,
+ use_tf=use_tf)
+
+ # Adding a small value on the diagonal for cho_solve.
+ dv = tf.linalg.tensor_diag(tf.constant(1e-8,
+ shape=(training_size),
+ dtype=tf.float64))
+ K_tr += dv
+ Y_tr = tf.expand_dims(Y_tr, 1)
+ alpha = tf.linalg.cholesky_solve(tf.linalg.cholesky(K_tr),
+ Y_tr)
+
+ X_te = descriptors[-test_size:]
+ Y_te = energies[-test_size:]
+ K_te = laplauss_kernel(X_te,
+ X_tr,
+ sigma,
+ laplauss=laplauss,
+ use_tf=use_tf)
+
+ Y_te = tf.expand_dims(Y_te, 1)
+ Y_pr = tf.tensordot(K_te, alpha, 1)
+
+ mae = tf.reduce_mean(tf.abs(Y_pr - Y_te))
+ else:
+ raise TypeError('No GPU found, could not create Tensor objects.')
+ else:
+ X_tr = descriptors[:training_size]
+ Y_tr = energies[:training_size]
+ K_tr = laplauss_kernel(X_tr,
+ X_tr,
+ sigma,
+ laplauss=laplauss,
+ use_tf=use_tf)
+
+ # Adding a small value on the diagonal for cho_solve.
+ K_tr[np.diag_indices_from(K_tr)] += 1e-8
+ alpha = LA.cho_solve(LA.cho_factor(K_tr),
+ Y_tr)
+
+ X_te = descriptors[-test_size:]
+ Y_te = energies[-test_size:]
+ K_te = laplauss_kernel(X_te,
+ X_tr,
+ sigma,
+ laplauss=laplauss,
+ use_tf=use_tf)
+ Y_pr = np.dot(K_te, alpha)
+
+ mae = np.mean(np.abs(Y_pr - Y_te))
+
+ if show_msgs:
+ printc(f'\tMAE for {identifier}: {mae:.4f}', 'GREEN')
+
+ toc = time.perf_counter()
+ tictoc = toc - tic
+ if show_msgs:
+ printc(f'\t{identifier} ML took {tictoc:.4f} seconds.', 'GREEN')
+ printc(f'\t\tTraining size: {training_size}', 'CYAN')
+ printc(f'\t\tTest size: {test_size}', 'CYAN')
+ printc(f'\t\tSigma: {sigma}', 'CYAN')
+
+ return mae, tictoc
+
+
+def multi_krr(db_path='data',
+ is_shuffled=True,
+ r_seed=111,
+ diag_value=None,
+ lj_sigma=1.0,
+ lj_epsilon=1.0,
+ use_forces=False,
+ stuff='bonds',
+ size=23,
+ as_eig=True,
+ bohr_ru=False,
+ training_size=1500,
+ test_size=None,
+ sigma=1000.0,
+ identifiers=['CM'],
+ use_tf=True,
+ show_msgs=True):
+ """
+ Main function that does the whole ML process.
+ db_path: path to the database directory.
+ is_shuffled: if the resulting list of compounds should be shuffled.
+ r_seed: random seed to use for the shuffling.
+ diag_value: if special diagonal value is to be used.
+ lj_sigma: sigma value.
+ lj_epsilon: epsilon value.
+ use_forces: if the use of forces instead of k_cx should be used.
+ stuff: elements of the bag, by default the known bag of bonds.
+ size: compound size.
+ as_eig: if the representation should be as the eigenvalues.
+ bohr_ru: if radius units should be in bohr's radius units.
+ training_size: size of the training set to use.
+ test_size: size of the test set to use. If no size is given,
+ the last remaining molecules are used.
+ sigma: depth of the kernel.
+ identifiers: list of names (strings) of descriptors to use.
+ use_tf: if tensorflow should be used.
+ show_msgs: if debug messages should be shown.
+ """
+ if type(identifiers) != list:
+ raise TypeError('\'identifiers\' is not a list.')
+
+ # If tf is to be used but couldn't be imported, don't try to use it.
+ if use_tf and not TF_AV:
+ use_tf = False
+
+ init_time = time.perf_counter()
+
+ # Data reading.
+ tic = time.perf_counter()
+ compounds, energy_pbe0, energy_delta = qm7db(db_path=db_path,
+ is_shuffled=is_shuffled,
+ r_seed=r_seed,
+ use_tf=use_tf)
+ toc = time.perf_counter()
+ tictoc = toc - tic
+ if show_msgs:
+ printc(f'Data reading took {tictoc:.4f} seconds.', 'CYAN')
+
+ # Matrices calculation.
+ tic = time.perf_counter()
+ for compound in compounds:
+ if 'CM' in identifiers:
+ compound.gen_cm(size=size,
+ as_eig=as_eig,
+ bohr_ru=bohr_ru)
+ if 'LJM' in identifiers:
+ compound.gen_ljm(diag_value=diag_value,
+ sigma=lj_sigma,
+ epsilon=lj_epsilon,
+ size=size,
+ as_eig=as_eig,
+ bohr_ru=bohr_ru)
+ if 'AM' in identifiers:
+ compound.gen_hd(size=size,
+ bohr_ru=bohr_ru)
+ compound.gen_am(use_forces=use_forces,
+ size=size)
+ if 'BOB' in identifiers:
+ compound.gen_bob(size=size)
+
+ # Create a numpy array (or tensorflow tensor) for the descriptors.
+ if 'CM' in identifiers:
+ cm_data = np.array([comp.cm for comp in compounds], dtype=np.float64)
+ if 'LJM' in identifiers:
+ ljm_data = np.array([comp.ljm for comp in compounds], dtype=np.float64)
+ if 'AM' in identifiers:
+ am_data = np.array([comp.am for comp in compounds], dtype=np.float64)
+ if 'BOB' in identifiers:
+ bob_data = np.array([comp.bob for comp in compounds], dtype=np.float64)
+
+ if use_tf:
+ if tf.config.experimental.list_physical_devices('GPU'):
+ with tf.device('GPU:0'):
+ if 'CM' in identifiers:
+ cm_data = tf.convert_to_tensor(cm_data)
+ if 'LJM' in identifiers:
+ ljm_data = tf.convert_to_tensor(ljm_data)
+ if 'AM' in identifiers:
+ am_data = tf.convert_to_tensor(am_data)
+ if 'BOB' in identifiers:
+ bob_data = tf.convert_to_tensor(bob_data)
+ else:
+ raise TypeError('No GPU found, could not create Tensor objects.')
+
+ toc = time.perf_counter()
+ tictoc = toc - tic
+ if show_msgs:
+ printc(f'Matrices calculation took {tictoc:.4f} seconds.', 'CYAN')
+
+ # ML calculation.
+ if 'CM' in identifiers:
+ cm_mae, cm_tictoc = simple_ml(cm_data,
+ energy_pbe0,
+ training_size=training_size,
+ test_size=test_size,
+ sigma=sigma,
+ identifier='CM',
+ laplauss='gauss',
+ use_tf=use_tf,
+ show_msgs=show_msgs)
+ if 'LJM' in identifiers:
+ ljm_mae, ljm_tictoc = simple_ml(ljm_data,
+ energy_pbe0,
+ training_size=training_size,
+ test_size=test_size,
+ sigma=sigma,
+ identifier='LJM',
+ laplauss='gauss',
+ use_tf=use_tf,
+ show_msgs=show_msgs)
+ if 'AM' in identifiers:
+ am_mae, am_tictoc = simple_ml(am_data,
+ energy_pbe0,
+ training_size=training_size,
+ test_size=test_size,
+ sigma=sigma,
+ identifier='AM',
+ laplauss='gauss',
+ use_tf=use_tf,
+ show_msgs=show_msgs)
+ if 'BOB' in identifiers:
+ bob_mae, bob_tictoc = simple_ml(bob_data,
+ energy_pbe0,
+ training_size=training_size,
+ test_size=test_size,
+ sigma=sigma,
+ identifier='BOB',
+ laplauss='laplace',
+ use_tf=use_tf,
+ show_msgs=show_msgs)
+
+ # End of program
+ end_time = time.perf_counter()
+ totaltime = end_time - init_time
+ printc(f'Program took {totaltime:.4f} seconds.', 'CYAN')