diff options
-rw-r--r-- | .gitignore | 4 | ||||
-rw-r--r-- | do_ml.py | 97 | ||||
-rw-r--r-- | lj_matrix/__init__.py | 48 | ||||
-rw-r--r-- | lj_matrix/__main__.py (renamed from misc.py) | 47 | ||||
-rw-r--r-- | lj_matrix/c_matrix.py (renamed from c_matrix.py) | 12 | ||||
-rw-r--r-- | lj_matrix/cholesky_solve.py (renamed from cholesky_solve.py) | 0 | ||||
-rw-r--r-- | lj_matrix/do_ml.py | 227 | ||||
-rw-r--r-- | lj_matrix/frob_norm.py (renamed from frob_norm.py) | 0 | ||||
-rw-r--r-- | lj_matrix/gauss_kernel.py (renamed from gauss_kernel.py) | 2 | ||||
-rw-r--r-- | lj_matrix/lj_matrix.py (renamed from lj_matrix.py) | 51 | ||||
-rw-r--r-- | lj_matrix/misc.py | 174 | ||||
-rw-r--r-- | lj_matrix/parallel_create_matrices.py | 85 | ||||
-rw-r--r-- | lj_matrix/read_qm7_data.py (renamed from read_qm7_data.py) | 13 | ||||
-rw-r--r-- | lj_matrix/version.py | 23 | ||||
-rw-r--r-- | requirements.txt | 6 | ||||
-rw-r--r-- | setup.py | 102 | ||||
-rw-r--r-- | test/__init__.py | 22 | ||||
-rw-r--r-- | test/test_c_matrix.py (renamed from main.py) | 49 |
18 files changed, 774 insertions, 188 deletions
diff --git a/.gitignore b/.gitignore index a1bdb4dde..02ab56ded 100644 --- a/.gitignore +++ b/.gitignore @@ -114,3 +114,7 @@ venv.bak/ # Original data. .original_data/ + +# Benchmarks and figures +benchmarks.csv +.figs/
\ No newline at end of file diff --git a/do_ml.py b/do_ml.py deleted file mode 100644 index 63a6fc671..000000000 --- a/do_ml.py +++ /dev/null @@ -1,97 +0,0 @@ -"""MIT License - -Copyright (c) 2019 David Luevano Alvarado - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -import time -from misc import printc -import numpy as np -from gauss_kernel import gauss_kernel -from cholesky_solve import cholesky_solve - - -def do_ml(desc_data, - energy_data, - training_size, - test_size=None, - sigma=1000.0, - desc_type=None, - show_msgs=True): - """ - Does the ML methodology. - desc_data: descriptor (or representation) data. - energy_data: energy data associated with desc_data. - training_size: size of the training set to use. - test_size: size of the test set to use. If no size is given, - the last remaining molecules are used. - sigma: depth of the kernel. - desc_type: string with the name of the descriptor used. - show_msgs: Show debug messages or not. - NOTE: desc_type is just a string and is only for identification purposes. - Also, training is done with the first part of the data and - testing with the ending part of the data. - """ - # Initial calculations for later use. - d_len = len(desc_data) - e_len = len(energy_data) - - if not desc_type: - desc_type = 'NOT SPECIFIED' - - if d_len != e_len: - printc(''.join(['ERROR. Descriptor data size different ', - 'than energy data size.']), 'RED') - return None - - if training_size >= d_len: - printc('ERROR. Training size greater or equal than data size.', 'RED') - return None - - if not test_size: - test_size = d_len - training_size - - tic = time.perf_counter() - if show_msgs: - printc('{} ML started, with parameters:'.format(desc_type), 'CYAN') - printc('\tTraining size: {}'.format(training_size), 'BLUE') - printc('\tTest size: {}'.format(test_size), 'BLUE') - printc('\tSigma: {}'.format(sigma), 'BLUE') - - Xcm_training = desc_data[:training_size] - Ycm_training = energy_data[:training_size] - Kcm_training = gauss_kernel(Xcm_training, Xcm_training, sigma) - alpha_cm = cholesky_solve(Kcm_training, Ycm_training) - - Xcm_test = desc_data[-test_size:] - Ycm_test = energy_data[-test_size:] - Kcm_test = gauss_kernel(Xcm_test, Xcm_training, sigma) - Ycm_predicted = np.dot(Kcm_test, alpha_cm) - - mae = np.mean(np.abs(Ycm_predicted - Ycm_test)) - if show_msgs: - print('\tMAE for {}: {:.4f}'.format(desc_type, mae)) - - toc = time.perf_counter() - tictoc = toc - tic - if show_msgs: - printc('\t{} ML took {:.4f} seconds.'.format(desc_type, tictoc), - 'GREEN') - - return mae, tictoc diff --git a/lj_matrix/__init__.py b/lj_matrix/__init__.py new file mode 100644 index 000000000..a430aac68 --- /dev/null +++ b/lj_matrix/__init__.py @@ -0,0 +1,48 @@ +"""MIT License + +Copyright (c) 2019 David Luevano Alvarado + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +from lj_matrix.read_qm7_data import read_nc_data, read_db_data, read_qm7_data +from lj_matrix.c_matrix import c_matrix, c_matrix_multiple +from lj_matrix.lj_matrix import lj_matrix, lj_matrix_multiple +from lj_matrix.frob_norm import frob_norm +from lj_matrix.gauss_kernel import gauss_kernel +from lj_matrix.cholesky_solve import cholesky_solve +from lj_matrix.do_ml import do_ml +from lj_matrix.parallel_create_matrices import parallel_create_matrices +from lj_matrix.misc import plot_benchmarks + + +# If somebody does "from package import *", this is what they will +# be able to access: +__all__ = ['read_nc_data', + 'read_db_data', + 'read_qm7_data', + 'c_matrix', + 'c_matrix_multiple', + 'lj_matrix', + 'lj_matrix_multiple', + 'frob_norm', + 'gauss_kernel', + 'cholesky_solve', + 'do_ml', + 'parallel_create_matrices', + 'plot_benchmarks'] diff --git a/misc.py b/lj_matrix/__main__.py index c50653a5c..688e5adcc 100644 --- a/misc.py +++ b/lj_matrix/__main__.py @@ -20,34 +20,19 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -from colorama import init, Fore, Style - -init() - - -def printc(text, color): - """ - Prints texts normaly, but in color. Using colorama. - text: string with the text to print. - color: color to be used, same as available in colorama. - """ - color_dic = {'BLACK': Fore.BLACK, - 'RED': Fore.RED, - 'GREEN': Fore.GREEN, - 'YELLOW': Fore.YELLOW, - 'BLUE': Fore.BLUE, - 'MAGENTA': Fore.MAGENTA, - 'CYAN': Fore.CYAN, - 'WHITE': Fore.WHITE, - 'RESET': Fore.RESET} - - color_dic_keys = color_dic.keys() - if color not in color_dic_keys: - print(Fore.RED - + '\'{}\' not found, using default color.'.format(color) - + Style.RESET_ALL) - actual_color = Fore.RESET - else: - actual_color = color_dic[color] - - print(actual_color + text + Style.RESET_ALL) +from lj_matrix.do_ml import do_ml +# from lj_matrix.misc import plot_benchmarks + +if __name__ == '__main__': + do_ml(min_training_size=1500, + max_training_size=2000, + training_increment_size=500, + test_size=None, + ljm_diag_value=None, + ljm_sigma=1.0, + ljm_epsilon=1.0, + r_seed=111, + save_benchmarks=False, + show_msgs=True) + # plot_benchmarks() + print('OK!') diff --git a/c_matrix.py b/lj_matrix/c_matrix.py index 2bc4d4c0c..f21ccfd8c 100644 --- a/c_matrix.py +++ b/lj_matrix/c_matrix.py @@ -21,16 +21,16 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import time -from misc import printc import math import numpy as np from numpy.linalg import eig +from lj_matrix.misc import printc def c_matrix(mol_data, nc_data, max_len=25, - as_eig=False, + as_eig=True, bohr_radius_units=False): """ Creates the Coulomb Matrix from the molecule data given. @@ -150,13 +150,16 @@ def c_matrix(mol_data, def c_matrix_multiple(mol_data, nc_data, + pipe=None, max_len=25, - as_eig=False, + as_eig=True, bohr_radius_units=False): """ Calculates the Coulomb Matrix of multiple molecules. mol_data: molecule data, matrix of atom coordinates. nc_data: nuclear charge data, array of atom data. + pipe: for multiprocessing purposes. Sends the data calculated + through a pipe. max_len: maximum amount of atoms in molecule. as_eig: if data should be returned as matrix or array of eigenvalues. bohr_radius_units: if units should be in bohr's radius units. @@ -170,4 +173,7 @@ def c_matrix_multiple(mol_data, toc = time.perf_counter() printc('\tCM calculation took {:.4f} seconds.'.format(toc - tic), 'GREEN') + if pipe: + pipe.send(cm_data) + return cm_data diff --git a/cholesky_solve.py b/lj_matrix/cholesky_solve.py index bc6a572a3..bc6a572a3 100644 --- a/cholesky_solve.py +++ b/lj_matrix/cholesky_solve.py diff --git a/lj_matrix/do_ml.py b/lj_matrix/do_ml.py new file mode 100644 index 000000000..25a55e823 --- /dev/null +++ b/lj_matrix/do_ml.py @@ -0,0 +1,227 @@ +"""MIT License + +Copyright (c) 2019 David Luevano Alvarado + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +import time +import numpy as np +from multiprocessing import Process, Pipe +from lj_matrix.misc import printc +from lj_matrix.gauss_kernel import gauss_kernel +from lj_matrix.cholesky_solve import cholesky_solve +from lj_matrix.read_qm7_data import read_qm7_data +from lj_matrix.parallel_create_matrices import parallel_create_matrices + + +def ml(desc_data, + energy_data, + training_size, + desc_type=None, + pipe=None, + test_size=None, + sigma=1000.0, + show_msgs=True): + """ + Does the ML methodology. + desc_data: descriptor (or representation) data. + energy_data: energy data associated with desc_data. + training_size: size of the training set to use. + desc_type: string with the name of the descriptor used. + pipe: for multiprocessing purposes. Sends the data calculated + through a pipe. + test_size: size of the test set to use. If no size is given, + the last remaining molecules are used. + sigma: depth of the kernel. + show_msgs: Show debug messages or not. + NOTE: desc_type is just a string and is only for identification purposes. + Also, training is done with the first part of the data and + testing with the ending part of the data. + """ + tic = time.perf_counter() + # Initial calculations for later use. + d_len = len(desc_data) + e_len = len(energy_data) + + if not desc_type: + desc_type = 'NOT SPECIFIED' + + if d_len != e_len: + printc(''.join(['ERROR. Descriptor data size different ', + 'than energy data size.']), 'RED') + return None + + if training_size >= d_len: + printc('ERROR. Training size greater or equal than data size.', 'RED') + return None + + if not test_size: + test_size = d_len - training_size + if test_size > 1500: + test_size = 1500 + + if show_msgs: + printc('{} ML started.'.format(desc_type), 'GREEN') + printc('\tTraining size: {}'.format(training_size), 'CYAN') + printc('\tTest size: {}'.format(test_size), 'CYAN') + printc('\tSigma: {}'.format(sigma), 'CYAN') + + X_training = desc_data[:training_size] + Y_training = energy_data[:training_size] + K_training = gauss_kernel(X_training, X_training, sigma) + alpha_ = cholesky_solve(K_training, Y_training) + + X_test = desc_data[-test_size:] + Y_test = energy_data[-test_size:] + K_test = gauss_kernel(X_test, X_training, sigma) + Y_predicted = np.dot(K_test, alpha_) + + mae = np.mean(np.abs(Y_predicted - Y_test)) + if show_msgs: + printc('\tMAE for {}: {:.4f}'.format(desc_type, mae), 'GREEN') + + toc = time.perf_counter() + tictoc = toc - tic + if show_msgs: + printc('\t{} ML took {:.4f} seconds.'.format(desc_type, tictoc), + 'GREEN') + printc('\t\tTraining size: {}'.format(training_size), 'CYAN') + printc('\t\tTest size: {}'.format(test_size), 'CYAN') + printc('\t\tSigma: {}'.format(sigma), 'CYAN') + + if pipe: + pipe.send([desc_type, training_size, test_size, sigma, mae, tictoc]) + + return mae, tictoc + + +def do_ml(min_training_size, + max_training_size=None, + training_increment_size=500, + test_size=None, + ljm_diag_value=None, + ljm_sigma=1.0, + ljm_epsilon=1.0, + r_seed=111, + save_benchmarks=False, + max_len=25, + as_eig=True, + bohr_radius_units=False, + sigma=1000.0, + show_msgs=True): + """ + Main function that does the whole ML process. + min_training_size: minimum training size. + max_training_size: maximum training size. + training_increment_size: training increment size. + test_size: size of the test set to use. If no size is given, + the last remaining molecules are used. + ljm_diag_value: if a special diagonal value should be used in lj matrix. + ljm_sigma: sigma value for lj matrix. + ljm_epsilon: epsilon value for lj matrix. + r_seed: random seed to use for the shuffling. + save_benchmarks: if benchmarks should be saved. + max_len: maximum amount of atoms in molecule. + as_eig: if data should be returned as matrix or array of eigenvalues. + bohr_radius_units: if units should be in bohr's radius units. + sigma: depth of the kernel. + show_msgs: Show debug messages or not. + """ + # Initialization time. + init_time = time.perf_counter() + if not max_training_size: + max_training_size = min_training_size + training_increment_size + + # Data reading. + molecules, nuclear_charge, energy_pbe0, energy_delta =\ + read_qm7_data(r_seed) + + # Matrices calculation. + cm_data, ljm_data = parallel_create_matrices(molecules, + nuclear_charge, + ljm_diag_value, + ljm_sigma, + ljm_epsilon, + max_len, + as_eig, + bohr_radius_units) + + # ML calculation. + procs = [] + cm_pipes = [] + ljm_pipes = [] + for i in range(min_training_size, + max_training_size + 1, + training_increment_size): + cm_recv, cm_send = Pipe(False) + p1 = Process(target=ml, + args=(cm_data, + energy_pbe0, + i, + 'CM', + cm_send, + test_size, + sigma, + show_msgs)) + procs.append(p1) + cm_pipes.append(cm_recv) + p1.start() + + ljm_recv, ljm_send = Pipe(False) + p2 = Process(target=ml, + args=(ljm_data, + energy_pbe0, + i, + 'L-JM', + ljm_send, + test_size, + sigma, + show_msgs)) + procs.append(p2) + ljm_pipes.append(ljm_recv) + p2.start() + + cm_bench_results = [] + ljm_bench_results = [] + for cd_pipe, ljd_pipe in zip(cm_pipes, ljm_pipes): + cm_bench_results.append(cd_pipe.recv()) + ljm_bench_results.append(ljd_pipe.recv()) + + for proc in procs: + proc.join() + + if save_benchmarks: + with open('data\\benchmarks.csv', 'a') as save_file: + # save_file.write(''.join(['ml_type,tr_size,te_size,kernel_s,', + # 'mae,time,lj_s,lj_e,date_ran\n'])) + ltime = time.localtime()[:3][::-1] + ljm_se = ',' + str(ljm_sigma) + ',' + str(ljm_epsilon) + ',' + date = '/'.join([str(field) for field in ltime]) + for cm, ljm, in zip(cm_bench_results, ljm_bench_results): + cm_text = ','.join([str(field) for field in cm])\ + + ',' + date + '\n' + ljm_text = ','.join([str(field) for field in ljm])\ + + ljm_se + date + '\n' + save_file.write(cm_text) + save_file.write(ljm_text) + + # End of program + end_time = time.perf_counter() + printc('Program took {:.4f} seconds.'.format(end_time - init_time), + 'CYAN') diff --git a/frob_norm.py b/lj_matrix/frob_norm.py index 4c3a2945d..4c3a2945d 100644 --- a/frob_norm.py +++ b/lj_matrix/frob_norm.py diff --git a/gauss_kernel.py b/lj_matrix/gauss_kernel.py index 0dfc65d59..5dd8e6406 100644 --- a/gauss_kernel.py +++ b/lj_matrix/gauss_kernel.py @@ -22,7 +22,7 @@ SOFTWARE. """ import math import numpy as np -from frob_norm import frob_norm +from lj_matrix.frob_norm import frob_norm def gauss_kernel(X_1, X_2, sigma): diff --git a/lj_matrix.py b/lj_matrix/lj_matrix.py index 6769bc0c3..6739ae283 100644 --- a/lj_matrix.py +++ b/lj_matrix/lj_matrix.py @@ -21,21 +21,27 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import time -from misc import printc import math import numpy as np from numpy.linalg import eig +from lj_matrix.misc import printc def lj_matrix(mol_data, nc_data, + diag_value=None, + sigma=1.0, + epsilon=1.0, max_len=25, - as_eig=False, + as_eig=True, bohr_radius_units=False): """ Creates the Lennard-Jones Matrix from the molecule data given. mol_data: molecule data, matrix of atom coordinates. nc_data: nuclear charge data, array of atom data. + diag_value: if special diagonal value is to be used. + sigma: sigma value. + epsilon: epsilon value. max_len: maximum amount of atoms in molecule. as_eig: if data should be returned as matrix or array of eigenvalues. bohr_radius_units: if units should be in bohr's radius units. @@ -82,7 +88,10 @@ def lj_matrix(mol_data, z = (z_i-z_j)**2 if i == j: - lj[i, j] = (0.5*Z_i**2.4) + if diag_value is None: + lj[i, j] = (0.5*Z_i**2.4) + else: + lj[i, j] = diag_value else: # Calculations are done after i==j is checked # so no division by zero is done. @@ -92,11 +101,11 @@ def lj_matrix(mol_data, # Conversion factor is included in r^2. # 1/r^2 - r_2 = 1/(conversion_rate**2*(x + y + z)) + r_2 = sigma**2/(conversion_rate**2*(x + y + z)) r_6 = math.pow(r_2, 3) r_12 = math.pow(r_6, 2) - lj[i, j] = (4*(r_12 - r_6)) + lj[i, j] = (4*epsilon*(r_12 - r_6)) else: break @@ -140,7 +149,10 @@ def lj_matrix(mol_data, z = (z_i-z_j)**2 if i == j: - lj_row.append(0.5*Z_i**2.4) + if not diag_value: + lj_row.append(0.5*Z_i**2.4) + else: + lj_row.append(diag_value) else: # Calculations are done after i==j is checked # so no division by zero is done. @@ -150,11 +162,11 @@ def lj_matrix(mol_data, # Conversion factor is included in r^2. # 1/r^2 - r_2 = 1/(conversion_rate**2*(x + y + z)) + r_2 = sigma**2/(conversion_rate**2*(x + y + z)) r_6 = math.pow(r_2, 3) r_12 = math.pow(r_6, 2) - lj_row.append(4*(r_12 - r_6)) + lj_row.append(4*epsilon*(r_12 - r_6)) lj_temp.append(np.array(lj_row)) @@ -168,13 +180,22 @@ def lj_matrix(mol_data, def lj_matrix_multiple(mol_data, nc_data, + pipe=None, + diag_value=None, + sigma=1.0, + epsilon=1.0, max_len=25, - as_eig=False, + as_eig=True, bohr_radius_units=False): """ Calculates the Lennard-Jones Matrix of multiple molecules. mol_data: molecule data, matrix of atom coordinates. nc_data: nuclear charge data, array of atom data. + pipe: for multiprocessing purposes. Sends the data calculated + through a pipe. + diag_value: if special diagonal value is to be used. + sigma: sigma value. + epsilon: epsilon value. max_len: maximum amount of atoms in molecule. as_eig: if data should be returned as matrix or array of eigenvalues. bohr_radius_units: if units should be in bohr's radius units. @@ -182,10 +203,20 @@ def lj_matrix_multiple(mol_data, printc('L-J Matrices calculation started.', 'CYAN') tic = time.perf_counter() - ljm_data = np.array([lj_matrix(mol, nc, max_len, as_eig, bohr_radius_units) + ljm_data = np.array([lj_matrix(mol, + nc, + diag_value, + sigma, + epsilon, + max_len, + as_eig, + bohr_radius_units) for mol, nc in zip(mol_data, nc_data)]) toc = time.perf_counter() printc('\tL-JM calculation took {:.4f} seconds.'.format(toc-tic), 'GREEN') + if pipe: + pipe.send(ljm_data) + return ljm_data diff --git a/lj_matrix/misc.py b/lj_matrix/misc.py new file mode 100644 index 000000000..e9142b05f --- /dev/null +++ b/lj_matrix/misc.py @@ -0,0 +1,174 @@ +"""MIT License + +Copyright (c) 2019 David Luevano Alvarado + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +from colorama import init, Fore, Style +import pandas as pd + +init() + + +def printc(text, color): + """ + Prints texts normaly, but in color. Using colorama. + text: string with the text to print. + color: color to be used, same as available in colorama. + """ + color_dic = {'BLACK': Fore.BLACK, + 'RED': Fore.RED, + 'GREEN': Fore.GREEN, + 'YELLOW': Fore.YELLOW, + 'BLUE': Fore.BLUE, + 'MAGENTA': Fore.MAGENTA, + 'CYAN': Fore.CYAN, + 'WHITE': Fore.WHITE, + 'RESET': Fore.RESET} + + color_dic_keys = color_dic.keys() + if color not in color_dic_keys: + print(Fore.RED + + '\'{}\' not found, using default color.'.format(color) + + Style.RESET_ALL) + actual_color = Fore.RESET + else: + actual_color = color_dic[color] + + print(actual_color + text + Style.RESET_ALL) + + +def plot_benchmarks(): + """ + For plotting the benchmarks. + """ + # Original columns. + or_cols = ['ml_type', + 'tr_size', + 'te_size', + 'kernel_s', + 'mae', + 'time', + 'lj_s', + 'lj_e', + 'date_ran'] + # Drop some original columns. + dor_cols = ['te_size', + 'kernel_s', + 'time', + 'date_ran'] + + # Read benchmarks data and drop some columns. + data_temp = pd.read_csv('data\\benchmarks.csv',) + data = pd.DataFrame(data_temp, columns=or_cols) + data = data.drop(columns=dor_cols) + + # Get the data of the first benchmarks and drop unnecesary columns. + first_data = pd.DataFrame(data, index=range(0, 22)) + first_data = first_data.drop(columns=['lj_s', 'lj_e']) + + # Columns to keep temporarily. + fd_columns = ['ml_type', + 'tr_size', + 'mae'] + + # Create new dataframes for each matrix descriptor and fill them. + first_data_cm = pd.DataFrame(columns=fd_columns) + first_data_ljm = pd.DataFrame(columns=fd_columns) + for i in range(first_data.shape[0]): + temp_df = first_data.iloc[[i]] + if first_data.at[i, 'ml_type'] == 'CM': + first_data_cm = first_data_cm.append(temp_df) + else: + first_data_ljm = first_data_ljm.append(temp_df) + + # Drop unnecesary column and rename 'mae' for later use. + first_data_cm = first_data_cm.drop(columns=['ml_type'])\ + .rename(columns={'mae': 'cm_mae'}) + first_data_ljm = first_data_ljm.drop(columns=['ml_type'])\ + .rename(columns={'mae': 'ljm_mae'}) + # print(first_data_cm) + # print(first_data_ljm) + + # Get the cm data axis so it can be joined with the ljm data axis. + cm_axis = first_data_cm.plot(x='tr_size', + y='cm_mae', + kind='line') + # Get the ljm data axis and join it with the cm one. + plot_axis = first_data_ljm.plot(ax=cm_axis, + x='tr_size', + y='ljm_mae', + kind='line') + plot_axis.set_xlabel('tr_size') + plot_axis.set_ylabel('mae') + plot_axis.set_title('mae for different tr_sizes') + # Get the figure and save it. + # plot_axis.get_figure().savefig('.figs\\mae_diff_tr_sizes.pdf') + + # Get the rest of the benchmark data and drop unnecesary column. + new_data = data.drop(index=range(0, 22)) + new_data = new_data.drop(columns=['ml_type']) + + # Get the first set and rename it. + nd_first = first_data_ljm.rename(columns={'ljm_mae': '1, 1'}) + ndf_axis = nd_first.plot(x='tr_size', + y='1, 1', + kind='line') + last_axis = ndf_axis + for i in range(22, 99, 11): + lj_s = new_data['lj_s'][i] + lj_e = new_data['lj_e'][i] + new_mae = '{}, {}'.format(lj_s, lj_e) + nd_temp = pd.DataFrame(new_data, index=range(i, i + 11))\ + .drop(columns=['lj_s', 'lj_e'])\ + .rename(columns={'mae': new_mae}) + last_axis = nd_temp.plot(ax=last_axis, + x='tr_size', + y=new_mae, + kind='line') + print(nd_temp) + + last_axis.set_xlabel('tr_size') + last_axis.set_ylabel('mae') + last_axis.set_title('mae for different parameters of lj(s)') + + last_axis.get_figure().savefig('.figs\\mae_diff_param_lj_s.pdf') + + ndf_axis = nd_first.plot(x='tr_size', + y='1, 1', + kind='line') + last_axis = ndf_axis + for i in range(99, data.shape[0], 11): + lj_s = new_data['lj_s'][i] + lj_e = new_data['lj_e'][i] + new_mae = '{}, {}'.format(lj_s, lj_e) + nd_temp = pd.DataFrame(new_data, index=range(i, i + 11))\ + .drop(columns=['lj_s', 'lj_e'])\ + .rename(columns={'mae': new_mae}) + last_axis = nd_temp.plot(ax=last_axis, + x='tr_size', + y=new_mae, + kind='line') + print(nd_temp) + + last_axis.set_xlabel('tr_size') + last_axis.set_ylabel('mae') + last_axis.set_title('mae for different parameters of lj(e)') + + last_axis.get_figure().savefig('.figs\\mae_diff_param_lj_e.pdf') diff --git a/lj_matrix/parallel_create_matrices.py b/lj_matrix/parallel_create_matrices.py new file mode 100644 index 000000000..cd5ef5c8e --- /dev/null +++ b/lj_matrix/parallel_create_matrices.py @@ -0,0 +1,85 @@ +"""MIT License + +Copyright (c) 2019 David Luevano Alvarado + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +from multiprocessing import Process, Pipe +from lj_matrix.c_matrix import c_matrix_multiple +from lj_matrix.lj_matrix import lj_matrix_multiple + + +def parallel_create_matrices(mol_data, + nc_data, + ljm_diag_value=None, + ljm_sigma=1.0, + ljm_epsilon=1.0, + max_len=25, + as_eig=True, + bohr_radius_units=False): + """ + Creates the Coulomb and L-J matrices in parallel. + mol_data: molecule data, matrix of atom coordinates. + nc_data: nuclear charge data, array of atom data. + ljm_diag_value: if special diagonal value is to be used for lj matrix. + ljm_sigma: sigma value for lj matrix. + ljm_epsilon: psilon value for lj matrix. + max_len: maximum amount of atoms in molecule. + as_eig: if data should be returned as matrix or array of eigenvalues. + bohr_radius_units: if units should be in bohr's radius units. + """ + + # Matrices calculation. + procs = [] + pipes = [] + + cm_recv, cm_send = Pipe(False) + p1 = Process(target=c_matrix_multiple, + args=(mol_data, + nc_data, + cm_send, + max_len, + as_eig, + bohr_radius_units)) + procs.append(p1) + pipes.append(cm_recv) + p1.start() + + ljm_recv, ljm_send = Pipe(False) + p2 = Process(target=lj_matrix_multiple, + args=(mol_data, + nc_data, + ljm_send, + ljm_diag_value, + ljm_sigma, + ljm_epsilon, + max_len, + as_eig, + bohr_radius_units)) + procs.append(p2) + pipes.append(ljm_recv) + p2.start() + + cm_data = pipes[0].recv() + ljm_data = pipes[1].recv() + + for proc in procs: + proc.join() + + return cm_data, ljm_data diff --git a/read_qm7_data.py b/lj_matrix/read_qm7_data.py index 068ea1a42..4401ca1c0 100644 --- a/read_qm7_data.py +++ b/lj_matrix/read_qm7_data.py @@ -24,7 +24,7 @@ import os import time import numpy as np import random -from misc import printc +from lj_matrix.misc import printc # 'periodic_table_of_elements.txt' retrieved from @@ -51,7 +51,7 @@ def read_nc_data(data_path): # 'hof_qm7.txt.txt' retrieved from # https://github.com/qmlcode/tutorial -def reas_db_data(zi_data, +def read_db_data(zi_data, data_path, r_seed=111): """ @@ -59,7 +59,7 @@ def reas_db_data(zi_data, its contents as usable variables. zi_data: dictionary containing nuclear charge data. data_path: path to the data directory. - r_seed: random seed. + r_seed: random seed to use for the shuffling. """ os.chdir(data_path) @@ -122,9 +122,10 @@ def reas_db_data(zi_data, return molecules, nuclear_charge, energy_pbe0, energy_delta -def read_qm7_data(): +def read_qm7_data(r_seed=111): """ Reads all the qm7 data. + r_seed: random seed to use for the shuffling. """ tic = time.perf_counter() printc('Data reading started.', 'CYAN') @@ -135,10 +136,10 @@ def read_qm7_data(): zi_data = read_nc_data(data_path) molecules, nuclear_charge, energy_pbe0, energy_delta = \ - reas_db_data(zi_data, data_path) + read_db_data(zi_data, data_path, r_seed) os.chdir(init_path) toc = time.perf_counter() printc('\tData reading took {:.4f} seconds.'.format(toc-tic), 'GREEN') - return zi_data, molecules, nuclear_charge, energy_pbe0, energy_delta + return molecules, nuclear_charge, energy_pbe0, energy_delta diff --git a/lj_matrix/version.py b/lj_matrix/version.py new file mode 100644 index 000000000..fab58433d --- /dev/null +++ b/lj_matrix/version.py @@ -0,0 +1,23 @@ +"""MIT License + +Copyright (c) 2019 David Luevano Alvarado + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +__version__ = '0.0.1' diff --git a/requirements.txt b/requirements.txt index f91fd71c2..28b557ddb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,4 @@ -colorama==0.4.1 -numpy==1.17.4 +colorama==0.4.3 +numpy==1.18.0 +pandas==0.25.3 +matplotlib==3.1.2
\ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 000000000..719ef3ce0 --- /dev/null +++ b/setup.py @@ -0,0 +1,102 @@ +"""MIT License + +Copyright (c) 2019 David Luevano Alvarado + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +# This setup.py template was obtained from +# https://github.com/navdeep-G/setup.py/blob/master/setup.py +# ---------------------------------------------------------------------- +# Note: To use the 'upload' functionality of this file, you must: +# $ pipenv install twine --dev + +import io +import os + +from setuptools import find_packages, setup + +from lj_matrix.version import __version__ + +# Package meta-data. +NAME = 'lj_matrix' +DESCRIPTION = 'A Lennard Jones matrix exploration.' +URL = 'https://github.com/luevano/lj_matrix' +EMAIL = 'a301436@uach.mx' +AUTHOR = 'David Luevano Alvarado' +REQUIRES_PYTHON = '>=3.7' +VERSION = __version__ +# VERSION = '0.0.1' + +# What packages are required for this module to be executed? +REQUIRED = [ + # 'requests', 'maya', 'records', +] + +# What packages are optional? +EXTRAS = { + # 'fancy feature': ['django'], +} + +# The rest you shouldn't have to touch too much :) +# ------------------------------------------------ +# Except, perhaps the License and Trove Classifiers! +# If you do change the License, remember to change +# the Trove Classifier for that! + +here = os.path.abspath(os.path.dirname(__file__)) + +# Import the README and use it as the long-description. +# Note: this will only work if 'README.md' +# is present in your MANIFEST.in file! +try: + with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f: + long_description = '\n' + f.read() +except FileNotFoundError: + long_description = DESCRIPTION + +# Where the magic happens: +setup( + name=NAME, + version=VERSION, + description=DESCRIPTION, + long_description=long_description, + long_description_content_type='text/markdown', + author=AUTHOR, + author_email=EMAIL, + python_requires=REQUIRES_PYTHON, + url=URL, + packages=find_packages(exclude=["tests", + "*.tests", + "*.tests.*", + "tests.*"]), + # If your package is a single module, use this instead of 'packages': + # py_modules=['mypackage'], + install_requires=REQUIRED, + extras_require=EXTRAS, + include_package_data=True, + license='MIT', + classifiers=[ + # Trove classifiers + # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers + 'License :: OSI Approved :: MIT License', + 'Programming Language :: Python', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.7' + ] +) diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 000000000..48cd14913 --- /dev/null +++ b/test/__init__.py @@ -0,0 +1,22 @@ +"""MIT License + +Copyright (c) 2019 David Luevano Alvarado + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" diff --git a/main.py b/test/test_c_matrix.py index 734069920..a8bb5ae34 100644 --- a/main.py +++ b/test/test_c_matrix.py @@ -20,41 +20,14 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import time -from misc import printc -# import matplotlib.pyplot as plt -from read_qm7_data import read_qm7_data -from c_matrix import c_matrix_multiple -from lj_matrix import lj_matrix_multiple -from do_ml import do_ml - - -# Initialization time. -init_time = time.perf_counter() - -# Data reading. -zi_data, molecules, nuclear_charge, energy_pbe0, energy_delta =\ - read_qm7_data() - -# Matrices calculation. -cm_data = c_matrix_multiple(molecules, nuclear_charge, as_eig=True) -ljm_data = lj_matrix_multiple(molecules, nuclear_charge, as_eig=True) - -# ML calculation. -do_ml(cm_data, - energy_pbe0, - 1000, - test_size=100, - sigma=1000.0, - desc_type='CM') -do_ml(ljm_data, - energy_pbe0, - 1000, - test_size=100, - sigma=1000.0, - desc_type='L-JM') - -# End of program -end_time = time.perf_counter() -printc('Program took {:.4f} seconds of runtime.'.format(end_time - init_time), - 'CYAN') +import unittest +from lj_matrix.c_matrix import c_matrix + + +class TestCMatrix(unittest.TestCase): + def test_c_matrix(self): + self.assertAlmostEqual(1, 1) + + +if __name__ == '__main__': + unittest.main() |