Diffstat (limited to 'ml_exp/do_ml.py')
-rw-r--r--  ml_exp/do_ml.py  302
1 file changed, 151 insertions(+), 151 deletions(-)
diff --git a/ml_exp/do_ml.py b/ml_exp/do_ml.py
index 110242a1d..d8ee415bf 100644
--- a/ml_exp/do_ml.py
+++ b/ml_exp/do_ml.py
@@ -22,206 +22,206 @@ SOFTWARE.
"""
import time
import numpy as np
-from multiprocessing import Process, Pipe
from ml_exp.misc import printc
-from ml_exp.gauss_kernel import gauss_kernel
-from ml_exp.cholesky_solve import cholesky_solve
-from ml_exp.read_qm7_data import read_qm7_data
-from ml_exp.parallel_create_matrices import parallel_create_matrices
-
-
-def ml(desc_data,
- energy_data,
- training_size,
- desc_type=None,
- pipe=None,
- test_size=None,
- sigma=1000.0,
- show_msgs=True):
+from ml_exp.kernels import gaussian_kernel
+from ml_exp.math import cholesky_solve
+from ml_exp.qm7db import qm7db
+
+
+def simple_ml(descriptors,
+ energies,
+ training_size,
+ test_size=None,
+ sigma=1000.0,
+ identifier=None,
+ show_msgs=True):
"""
- Does the ML methodology.
- desc_data: descriptor (or representation) data.
- energy_data: energy data associated with desc_data.
+ Basic ML methodology for a single descriptor type.
+ descriptors: array of descriptors.
+ energies: array of energies.
training_size: size of the training set to use.
- desc_type: string with the name of the descriptor used.
- pipe: for multiprocessing purposes. Sends the data calculated
- through a pipe.
test_size: size of the test set to use. If no size is given,
the last remaining molecules are used.
sigma: depth of the kernel.
- show_msgs: Show debug messages or not.
- NOTE: desc_type is just a string and is only for identification purposes.
+ identifier: string with the name of the descriptor used.
+ show_msgs: if debug messages should be shown.
+ NOTE: identifier is just a string and is only for identification purposes.
Also, training is done with the first part of the data and
- testing with the ending part of the data.
+ testing with the ending part of the data.
"""
tic = time.perf_counter()
# Initial calculations for later use.
- d_len = len(desc_data)
- e_len = len(energy_data)
+ data_size = descriptors.shape[0]
- if not desc_type:
- desc_type = 'NOT SPECIFIED'
+ if not identifier:
+ identifier = 'NOT SPECIFIED'
- if d_len != e_len:
- printc(''.join(['ERROR. Descriptor data size different ',
- 'than energy data size.']), 'RED')
- return None
+ if data_size != energies.shape[0]:
+ raise ValueError('Energies size is different from descriptors size.')
- if training_size >= d_len:
- printc('ERROR. Training size greater or equal than data size.', 'RED')
- return None
+ if training_size >= data_size:
+ raise ValueError('Training size is greater than or equal to the data size.')
+ # If test_size is not set, use the remaining data, capped at 1500.
if not test_size:
- test_size = d_len - training_size
+ test_size = data_size - training_size
if test_size > 1500:
test_size = 1500
if show_msgs:
- printc('{} ML started.'.format(desc_type), 'GREEN')
- printc('\tTraining size: {}'.format(training_size), 'CYAN')
- printc('\tTest size: {}'.format(test_size), 'CYAN')
- printc('\tSigma: {}'.format(sigma), 'CYAN')
+ printc(f'{identifier} ML started.', 'GREEN')
+ printc(f'\tTraining size: {training_size}', 'CYAN')
+ printc(f'\tTest size: {test_size}', 'CYAN')
+ printc(f'\tSigma: {sigma}', 'CYAN')
- X_training = desc_data[:training_size]
- Y_training = energy_data[:training_size]
- K_training = gauss_kernel(X_training, X_training, sigma)
- alpha_ = cholesky_solve(K_training, Y_training)
+ X_training = descriptors[:training_size]
+ Y_training = energies[:training_size]
+ K_training = gaussian_kernel(X_training, X_training, sigma)
+ alpha = cholesky_solve(K_training, Y_training)
- X_test = desc_data[-test_size:]
- Y_test = energy_data[-test_size:]
- K_test = gauss_kernel(X_test, X_training, sigma)
- Y_predicted = np.dot(K_test, alpha_)
+ X_test = descriptors[-test_size:]
+ Y_test = energies[-test_size:]
+ K_test = gaussian_kernel(X_test, X_training, sigma)
+ Y_predicted = np.dot(K_test, alpha)
mae = np.mean(np.abs(Y_predicted - Y_test))
if show_msgs:
- printc('\tMAE for {}: {:.4f}'.format(desc_type, mae), 'GREEN')
+ printc(f'\tMAE for {identifier}: {mae:.4f}', 'GREEN')
toc = time.perf_counter()
tictoc = toc - tic
if show_msgs:
- printc('\t{} ML took {:.4f} seconds.'.format(desc_type, tictoc),
- 'GREEN')
- printc('\t\tTraining size: {}'.format(training_size), 'CYAN')
- printc('\t\tTest size: {}'.format(test_size), 'CYAN')
- printc('\t\tSigma: {}'.format(sigma), 'CYAN')
-
- if pipe:
- pipe.send([desc_type, training_size, test_size, sigma, mae, tictoc])
+ printc(f'\t{identifier} ML took {tictoc:.4f} seconds.', 'GREEN')
+ printc(f'\t\tTraining size: {training_size}', 'CYAN')
+ printc(f'\t\tTest size: {test_size}', 'CYAN')
+ printc(f'\t\tSigma: {sigma}', 'CYAN')
return mae, tictoc
-def do_ml(min_training_size,
- max_training_size=None,
- training_increment_size=500,
- test_size=None,
- ljm_diag_value=None,
- ljm_sigma=1.0,
- ljm_epsilon=1.0,
+def do_ml(db_path='data',
+ is_shuffled=True,
r_seed=111,
- save_benchmarks=False,
- max_len=25,
+ diag_value=None,
+ lj_sigma=1.0,
+ lj_epsilon=1.0,
+ use_forces=False,
+ stuff='bonds',
+ size=23,
as_eig=True,
- bohr_radius_units=False,
+ bohr_ru=False,
+ training_size=1500,
+ test_size=None,
sigma=1000.0,
+ identifiers=["CM"],
show_msgs=True):
"""
Main function that does the whole ML process.
- min_training_size: minimum training size.
- max_training_size: maximum training size.
- training_increment_size: training increment size.
+ db_path: path to the database directory.
+ is_shuffled: if the resulting list of compounds should be shuffled.
+ r_seed: random seed to use for the shuffling.
+ diag_value: special diagonal value to use in the LJ matrix, if any.
+ lj_sigma: sigma value for the LJ matrix.
+ lj_epsilon: epsilon value for the LJ matrix.
+ use_forces: if forces should be used instead of k_cx.
+ stuff: elements of the bag, by default the known bag of bonds.
+ size: compound size.
+ as_eig: if the representation should be given as its eigenvalues.
+ bohr_ru: if units should be in Bohr radius units.
+ training_size: size of the training set to use.
test_size: size of the test set to use. If no size is given,
the last remaining molecules are used.
- ljm_diag_value: if a special diagonal value should be used in lj matrix.
- ljm_sigma: sigma value for lj matrix.
- ljm_epsilon: epsilon value for lj matrix.
- r_seed: random seed to use for the shuffling.
- save_benchmarks: if benchmarks should be saved.
- max_len: maximum amount of atoms in molecule.
- as_eig: if data should be returned as matrix or array of eigenvalues.
- bohr_radius_units: if units should be in bohr's radius units.
sigma: depth of the kernel.
- show_msgs: Show debug messages or not.
+ identifiers: list of names (strings) of descriptors to use.
+ show_msgs: if debug messages should be shown.
"""
- # Initialization time.
+ if not isinstance(identifiers, list):
+ raise TypeError('\'identifiers\' is not a list.')
+
init_time = time.perf_counter()
- if not max_training_size:
- max_training_size = min_training_size + training_increment_size
# Data reading.
- molecules, nuclear_charge, energy_pbe0, energy_delta =\
- read_qm7_data(r_seed=r_seed)
+ tic = time.perf_counter()
+ compounds, energy_pbe0, energy_delta = qm7db(db_path=db_path,
+ is_shuffled=is_shuffled,
+ r_seed=r_seed)
+ toc = time.perf_counter()
+ tictoc = toc - tic
+ if show_msgs:
+ printc(f'Data reading took {tictoc:.4f} seconds.', 'CYAN')
# Matrices calculation.
- cm_data, ljm_data = parallel_create_matrices(molecules,
- nuclear_charge,
- ljm_diag_value,
- ljm_sigma,
- ljm_epsilon,
- max_len,
- as_eig,
- bohr_radius_units)
+ tic = time.perf_counter()
+ for compound in compounds:
+ if 'CM' in identifiers:
+ compound.gen_cm(size=size,
+ as_eig=as_eig,
+ bohr_ru=bohr_ru)
+ if 'LJM' in identifiers:
+ compound.gen_ljm(diag_value=diag_value,
+ sigma=lj_sigma,
+ epsilon=lj_epsilon,
+ size=size,
+ as_eig=as_eig,
+ bohr_ru=bohr_ru)
+ if 'AM' in identifiers:
+ compound.gen_am(use_forces=use_forces,
+ size=size,
+ bohr_ru=bohr_ru)
+ if 'BOS' in identifiers:
+ compound.gen_bos(size=size,
+ stuff=stuff)
+
+ # Create a numpy array for the descriptors.
+ if 'CM' in identifiers:
+ cm_data = np.array([comp.cm for comp in compounds], dtype=float)
+ if 'LJM' in identifiers:
+ ljm_data = np.array([comp.ljm for comp in compounds], dtype=float)
+ if 'AM' in identifiers:
+ am_data = np.array([comp.am for comp in compounds], dtype=float)
+ if 'BOS' in identifiers:
+ bos_data = np.array([comp.bos for comp in compounds], dtype=float)
+
+ toc = time.perf_counter()
+ tictoc = toc - tic
+ if show_msgs:
+ printc(f'Matrices calculation took {tictoc:.4f} seconds.', 'CYAN')
# ML calculation.
- procs = []
- cm_pipes = []
- ljm_pipes = []
- for i in range(min_training_size,
- max_training_size + 1,
- training_increment_size):
- cm_recv, cm_send = Pipe(False)
- p1 = Process(target=ml,
- args=(cm_data,
- energy_pbe0,
- i,
- 'CM',
- cm_send,
- test_size,
- sigma,
- show_msgs))
- procs.append(p1)
- cm_pipes.append(cm_recv)
- p1.start()
-
- ljm_recv, ljm_send = Pipe(False)
- p2 = Process(target=ml,
- args=(ljm_data,
- energy_pbe0,
- i,
- 'L-JM',
- ljm_send,
- test_size,
- sigma,
- show_msgs))
- procs.append(p2)
- ljm_pipes.append(ljm_recv)
- p2.start()
-
- cm_bench_results = []
- ljm_bench_results = []
- for cd_pipe, ljd_pipe in zip(cm_pipes, ljm_pipes):
- cm_bench_results.append(cd_pipe.recv())
- ljm_bench_results.append(ljd_pipe.recv())
-
- for proc in procs:
- proc.join()
-
- if save_benchmarks:
- with open('data\\benchmarks.csv', 'a') as save_file:
- # save_file.write(''.join(['ml_type,tr_size,te_size,kernel_s,',
- # 'mae,time,lj_s,lj_e,date_ran\n']))
- ltime = time.localtime()[:3][::-1]
- ljm_se = ',' + str(ljm_sigma) + ',' + str(ljm_epsilon) + ','
- date = '/'.join([str(field) for field in ltime])
- for cm, ljm, in zip(cm_bench_results, ljm_bench_results):
- cm_text = ','.join([str(field) for field in cm])\
- + ',' + date + '\n'
- ljm_text = ','.join([str(field) for field in ljm])\
- + ljm_se + date + '\n'
- save_file.write(cm_text)
- save_file.write(ljm_text)
+ if 'CM' in identifiers:
+ cm_mae, cm_tictoc = simple_ml(cm_data,
+ energy_pbe0,
+ training_size=training_size,
+ test_size=test_size,
+ sigma=sigma,
+ identifier='CM',
+ show_msgs=show_msgs)
+ if 'LJM' in identifiers:
+ ljm_mae, ljm_tictoc = simple_ml(ljm_data,
+ energy_pbe0,
+ training_size=training_size,
+ test_size=test_size,
+ sigma=sigma,
+ identifier='LJM',
+ show_msgs=show_msgs)
+ if 'AM' in identifiers:
+ am_mae, am_tictoc = simple_ml(am_data,
+ energy_pbe0,
+ training_size=training_size,
+ test_size=test_size,
+ sigma=sigma,
+ identifier='AM',
+ show_msgs=show_msgs)
+ if 'BOS' in identifiers:
+ bos_mae, bos_tictoc = simple_ml(bos_data,
+ energy_pbe0,
+ training_size=training_size,
+ test_size=test_size,
+ sigma=sigma,
+ identifier='BOS',
+ show_msgs=show_msgs)
# End of program
end_time = time.perf_counter()
- printc('Program took {:.4f} seconds.'.format(end_time - init_time),
- 'CYAN')
+ totaltime = end_time - init_time
+ printc(f'Program took {totaltime:.4f} seconds.', 'CYAN')
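
For reference, a minimal usage sketch of the refactored entry point introduced by this commit. It only exercises the new do_ml() signature shown above; the 'data' path, training/test sizes, sigma, and descriptor list are illustrative values, not ones taken from the repository.

    # Sketch only: assumes the QM7 database files live under 'data' in the
    # layout expected by ml_exp.qm7db.qm7db, and that CM and LJM descriptors
    # are the ones wanted.
    from ml_exp.do_ml import do_ml

    do_ml(db_path='data',
          is_shuffled=True,
          r_seed=111,
          size=23,
          as_eig=True,
          training_size=1500,
          test_size=None,
          sigma=1000.0,
          identifiers=['CM', 'LJM'],
          show_msgs=True)

Each requested identifier triggers its descriptor generation on the compounds and a simple_ml() run; MAEs and timings are printed via printc rather than returned.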