diff options
-rw-r--r-- | ml_exp/do_ml.py | 218 |
1 files changed, 45 insertions, 173 deletions
def simple_ml(descriptors,
              energies,
              training_size,
              identifier=None,
              test_size=None,
              sigma=1000.0,
              show_msgs=True):
    """
    Basic ML methodology for a single descriptor type.
    descriptors: array of descriptors.
    energies: array of energies.
    training_size: size of the training set to use.
    identifier: string with the name of the descriptor used.
    test_size: size of the test set to use. If no size is given,
        the last remaining molecules are used (capped at 1500).
    sigma: depth of the kernel.
    show_msgs: if debug messages should be shown.
    Returns a (mae, tictoc) tuple: mean absolute error on the test set
    and the wall-clock time the run took, in seconds.
    NOTE: identifier is just a string and is only for identification purposes.
        Also, training is done with the first part of the data and
        testing with the ending part of the data.
    """
    tic = time.perf_counter()
    # Initial calculations for later use.
    data_size = descriptors.shape[0]

    if not identifier:
        identifier = 'NOT SPECIFIED'

    if data_size != energies.shape[0]:
        raise ValueError('Energies size is different than descriptors size.')

    if training_size >= data_size:
        raise ValueError('Training size is greater or equal to the data size.')

    # If test_size is not set, it is set to a maximum size of 1500.
    if not test_size:
        test_size = data_size - training_size
        if test_size > 1500:
            test_size = 1500

    if show_msgs:
        printc(f'{identifier} ML started.', 'GREEN')
        printc(f'\tTraining size: {training_size}', 'CYAN')
        printc(f'\tTest size: {test_size}', 'CYAN')
        # FIX: previously interpolated test_size here instead of sigma.
        printc(f'\tSigma: {sigma}', 'CYAN')

    # Train on the first training_size molecules.
    X_training = descriptors[:training_size]
    Y_training = energies[:training_size]
    K_training = gaussian_kernel(X_training, X_training, sigma)
    alpha = cholesky_solve(K_training, Y_training)

    # Test on the last test_size molecules.
    X_test = descriptors[-test_size:]
    Y_test = energies[-test_size:]
    K_test = gaussian_kernel(X_test, X_training, sigma)
    Y_predicted = np.dot(K_test, alpha)

    mae = np.mean(np.abs(Y_predicted - Y_test))
    if show_msgs:
        # FIX: the string was missing the f prefix, printing the raw
        # '{identifier}'/'{mae:.4f}' placeholders instead of the values.
        printc(f'\tMAE for {identifier}: {mae:.4f}', 'GREEN')

    toc = time.perf_counter()
    tictoc = toc - tic
    if show_msgs:
        printc(f'\t{identifier} ML took {tictoc:.4f} seconds.', 'GREEN')
        printc(f'\t\tTraining size: {training_size}', 'CYAN')
        printc(f'\t\tTest size: {test_size}', 'CYAN')
        printc(f'\t\tSigma: {sigma}', 'CYAN')

    return mae, tictoc
- init_time = time.perf_counter() - if not max_training_size: - max_training_size = min_training_size + training_increment_size - - # Data reading. - molecules, nuclear_charge, energy_pbe0, energy_delta =\ - read_qm7_data(r_seed=r_seed) - - # Matrices calculation. - cm_data, ljm_data = parallel_create_matrices(molecules, - nuclear_charge, - ljm_diag_value, - ljm_sigma, - ljm_epsilon, - max_len, - as_eig, - bohr_radius_units) - - # ML calculation. - procs = [] - cm_pipes = [] - ljm_pipes = [] - for i in range(min_training_size, - max_training_size + 1, - training_increment_size): - cm_recv, cm_send = Pipe(False) - p1 = Process(target=ml, - args=(cm_data, - energy_pbe0, - i, - 'CM', - cm_send, - test_size, - sigma, - show_msgs)) - procs.append(p1) - cm_pipes.append(cm_recv) - p1.start() - - ljm_recv, ljm_send = Pipe(False) - p2 = Process(target=ml, - args=(ljm_data, - energy_pbe0, - i, - 'L-JM', - ljm_send, - test_size, - sigma, - show_msgs)) - procs.append(p2) - ljm_pipes.append(ljm_recv) - p2.start() - - cm_bench_results = [] - ljm_bench_results = [] - for cd_pipe, ljd_pipe in zip(cm_pipes, ljm_pipes): - cm_bench_results.append(cd_pipe.recv()) - ljm_bench_results.append(ljd_pipe.recv()) - - for proc in procs: - proc.join() - - if save_benchmarks: - with open('data\\benchmarks.csv', 'a') as save_file: - # save_file.write(''.join(['ml_type,tr_size,te_size,kernel_s,', - # 'mae,time,lj_s,lj_e,date_ran\n'])) - ltime = time.localtime()[:3][::-1] - ljm_se = ',' + str(ljm_sigma) + ',' + str(ljm_epsilon) + ',' - date = '/'.join([str(field) for field in ltime]) - for cm, ljm, in zip(cm_bench_results, ljm_bench_results): - cm_text = ','.join([str(field) for field in cm])\ - + ',' + date + '\n' - ljm_text = ','.join([str(field) for field in ljm])\ - + ljm_se + date + '\n' - save_file.write(cm_text) - save_file.write(ljm_text) - - # End of program - end_time = time.perf_counter() - printc('Program took {:.4f} seconds.'.format(end_time - init_time), - 'CYAN') |