From 487bf8840846b5d4d694b38985268c308aadb36e Mon Sep 17 00:00:00 2001 From: David Luevano <55825613+luevano@users.noreply.github.com> Date: Wed, 18 Dec 2019 07:21:35 -0700 Subject: Refactor files --- lj_matrix/do_ml.py | 108 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 lj_matrix/do_ml.py (limited to 'lj_matrix/do_ml.py') diff --git a/lj_matrix/do_ml.py b/lj_matrix/do_ml.py new file mode 100644 index 000000000..acf5455f4 --- /dev/null +++ b/lj_matrix/do_ml.py @@ -0,0 +1,108 @@ +"""MIT License + +Copyright (c) 2019 David Luevano Alvarado + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +import time +from misc import printc +import numpy as np +from lj_matrix.gauss_kernel import gauss_kernel +from lj_matrix.cholesky_solve import cholesky_solve + + +def do_ml(desc_data, + energy_data, + training_size, + desc_type=None, + pipe=None, + test_size=None, + sigma=1000.0, + show_msgs=True): + """ + Does the ML methodology. + desc_data: descriptor (or representation) data. + energy_data: energy data associated with desc_data. + training_size: size of the training set to use. + desc_type: string with the name of the descriptor used. + pipe: for multiprocessing purposes. Sends the data calculated + through a pipe. + test_size: size of the test set to use. If no size is given, + the last remaining molecules are used. + sigma: depth of the kernel. + show_msgs: Show debug messages or not. + NOTE: desc_type is just a string and is only for identification purposes. + Also, training is done with the first part of the data and + testing with the ending part of the data. + """ + # Initial calculations for later use. + d_len = len(desc_data) + e_len = len(energy_data) + + if not desc_type: + desc_type = 'NOT SPECIFIED' + + if d_len != e_len: + printc(''.join(['ERROR. Descriptor data size different ', + 'than energy data size.']), 'RED') + return None + + if training_size >= d_len: + printc('ERROR. Training size greater or equal than data size.', 'RED') + return None + + if not test_size: + test_size = d_len - training_size + if test_size > 1500: + test_size = 1500 + + tic = time.perf_counter() + if show_msgs: + printc('{} ML started.'.format(desc_type), 'GREEN') + printc('\tTraining size: {}'.format(training_size), 'CYAN') + printc('\tTest size: {}'.format(test_size), 'CYAN') + printc('\tSigma: {}'.format(sigma), 'CYAN') + + Xcm_training = desc_data[:training_size] + Ycm_training = energy_data[:training_size] + Kcm_training = gauss_kernel(Xcm_training, Xcm_training, sigma) + alpha_cm = cholesky_solve(Kcm_training, Ycm_training) + + Xcm_test = desc_data[-test_size:] + Ycm_test = energy_data[-test_size:] + Kcm_test = gauss_kernel(Xcm_test, Xcm_training, sigma) + Ycm_predicted = np.dot(Kcm_test, alpha_cm) + + mae = np.mean(np.abs(Ycm_predicted - Ycm_test)) + if show_msgs: + printc('\tMAE for {}: {:.4f}'.format(desc_type, mae), 'GREEN') + + toc = time.perf_counter() + tictoc = toc - tic + if show_msgs: + printc('\t{} ML took {:.4f} seconds.'.format(desc_type, tictoc), + 'GREEN') + printc('\t\tTraining size: {}'.format(training_size), 'CYAN') + printc('\t\tTest size: {}'.format(test_size), 'CYAN') + printc('\t\tSigma: {}'.format(sigma), 'CYAN') + + if pipe: + pipe.send([desc_type, training_size, test_size, sigma, mae, tictoc]) + + return mae, tictoc -- cgit v1.2.3-70-g09d2