summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- .gitignore 4
-rw-r--r-- do_ml.py 97
-rw-r--r-- lj_matrix/__init__.py 48
-rw-r--r-- lj_matrix/__main__.py (renamed from misc.py) 47
-rw-r--r-- lj_matrix/c_matrix.py (renamed from c_matrix.py) 12
-rw-r--r-- lj_matrix/cholesky_solve.py (renamed from cholesky_solve.py) 0
-rw-r--r-- lj_matrix/do_ml.py 227
-rw-r--r-- lj_matrix/frob_norm.py (renamed from frob_norm.py) 0
-rw-r--r-- lj_matrix/gauss_kernel.py (renamed from gauss_kernel.py) 2
-rw-r--r-- lj_matrix/lj_matrix.py (renamed from lj_matrix.py) 51
-rw-r--r-- lj_matrix/misc.py 174
-rw-r--r-- lj_matrix/parallel_create_matrices.py 85
-rw-r--r-- lj_matrix/read_qm7_data.py (renamed from read_qm7_data.py) 13
-rw-r--r-- lj_matrix/version.py 23
-rw-r--r-- requirements.txt 6
-rw-r--r-- setup.py 102
-rw-r--r-- test/__init__.py 22
-rw-r--r-- test/test_c_matrix.py (renamed from main.py) 49
18 files changed, 774 insertions, 188 deletions
diff --git a/.gitignore b/.gitignore
index a1bdb4dde..02ab56ded 100644
--- a/.gitignore
+++ b/.gitignore
@@ -114,3 +114,7 @@ venv.bak/
# Original data.
.original_data/
+
+# Benchmarks and figures
+benchmarks.csv
+.figs/
\ No newline at end of file
diff --git a/do_ml.py b/do_ml.py
deleted file mode 100644
index 63a6fc671..000000000
--- a/do_ml.py
+++ /dev/null
@@ -1,97 +0,0 @@
-"""MIT License
-
-Copyright (c) 2019 David Luevano Alvarado
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-"""
-import time
-from misc import printc
-import numpy as np
-from gauss_kernel import gauss_kernel
-from cholesky_solve import cholesky_solve
-
-
-def do_ml(desc_data,
- energy_data,
- training_size,
- test_size=None,
- sigma=1000.0,
- desc_type=None,
- show_msgs=True):
- """
- Does the ML methodology.
- desc_data: descriptor (or representation) data.
- energy_data: energy data associated with desc_data.
- training_size: size of the training set to use.
- test_size: size of the test set to use. If no size is given,
- the last remaining molecules are used.
- sigma: depth of the kernel.
- desc_type: string with the name of the descriptor used.
- show_msgs: Show debug messages or not.
- NOTE: desc_type is just a string and is only for identification purposes.
- Also, training is done with the first part of the data and
- testing with the ending part of the data.
- """
- # Initial calculations for later use.
- d_len = len(desc_data)
- e_len = len(energy_data)
-
- if not desc_type:
- desc_type = 'NOT SPECIFIED'
-
- if d_len != e_len:
- printc(''.join(['ERROR. Descriptor data size different ',
- 'than energy data size.']), 'RED')
- return None
-
- if training_size >= d_len:
- printc('ERROR. Training size greater or equal than data size.', 'RED')
- return None
-
- if not test_size:
- test_size = d_len - training_size
-
- tic = time.perf_counter()
- if show_msgs:
- printc('{} ML started, with parameters:'.format(desc_type), 'CYAN')
- printc('\tTraining size: {}'.format(training_size), 'BLUE')
- printc('\tTest size: {}'.format(test_size), 'BLUE')
- printc('\tSigma: {}'.format(sigma), 'BLUE')
-
- Xcm_training = desc_data[:training_size]
- Ycm_training = energy_data[:training_size]
- Kcm_training = gauss_kernel(Xcm_training, Xcm_training, sigma)
- alpha_cm = cholesky_solve(Kcm_training, Ycm_training)
-
- Xcm_test = desc_data[-test_size:]
- Ycm_test = energy_data[-test_size:]
- Kcm_test = gauss_kernel(Xcm_test, Xcm_training, sigma)
- Ycm_predicted = np.dot(Kcm_test, alpha_cm)
-
- mae = np.mean(np.abs(Ycm_predicted - Ycm_test))
- if show_msgs:
- print('\tMAE for {}: {:.4f}'.format(desc_type, mae))
-
- toc = time.perf_counter()
- tictoc = toc - tic
- if show_msgs:
- printc('\t{} ML took {:.4f} seconds.'.format(desc_type, tictoc),
- 'GREEN')
-
- return mae, tictoc
diff --git a/lj_matrix/__init__.py b/lj_matrix/__init__.py
new file mode 100644
index 000000000..a430aac68
--- /dev/null
+++ b/lj_matrix/__init__.py
@@ -0,0 +1,48 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+from lj_matrix.read_qm7_data import read_nc_data, read_db_data, read_qm7_data
+from lj_matrix.c_matrix import c_matrix, c_matrix_multiple
+from lj_matrix.lj_matrix import lj_matrix, lj_matrix_multiple
+from lj_matrix.frob_norm import frob_norm
+from lj_matrix.gauss_kernel import gauss_kernel
+from lj_matrix.cholesky_solve import cholesky_solve
+from lj_matrix.do_ml import do_ml
+from lj_matrix.parallel_create_matrices import parallel_create_matrices
+from lj_matrix.misc import plot_benchmarks
+
+
+# If somebody does "from package import *", this is what they will
+# be able to access:
+__all__ = ['read_nc_data',
+ 'read_db_data',
+ 'read_qm7_data',
+ 'c_matrix',
+ 'c_matrix_multiple',
+ 'lj_matrix',
+ 'lj_matrix_multiple',
+ 'frob_norm',
+ 'gauss_kernel',
+ 'cholesky_solve',
+ 'do_ml',
+ 'parallel_create_matrices',
+ 'plot_benchmarks']
diff --git a/misc.py b/lj_matrix/__main__.py
index c50653a5c..688e5adcc 100644
--- a/misc.py
+++ b/lj_matrix/__main__.py
@@ -20,34 +20,19 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
-from colorama import init, Fore, Style
-
-init()
-
-
-def printc(text, color):
- """
- Prints texts normaly, but in color. Using colorama.
- text: string with the text to print.
- color: color to be used, same as available in colorama.
- """
- color_dic = {'BLACK': Fore.BLACK,
- 'RED': Fore.RED,
- 'GREEN': Fore.GREEN,
- 'YELLOW': Fore.YELLOW,
- 'BLUE': Fore.BLUE,
- 'MAGENTA': Fore.MAGENTA,
- 'CYAN': Fore.CYAN,
- 'WHITE': Fore.WHITE,
- 'RESET': Fore.RESET}
-
- color_dic_keys = color_dic.keys()
- if color not in color_dic_keys:
- print(Fore.RED
- + '\'{}\' not found, using default color.'.format(color)
- + Style.RESET_ALL)
- actual_color = Fore.RESET
- else:
- actual_color = color_dic[color]
-
- print(actual_color + text + Style.RESET_ALL)
+from lj_matrix.do_ml import do_ml
+# from lj_matrix.misc import plot_benchmarks
+
+if __name__ == '__main__':
+ do_ml(min_training_size=1500,
+ max_training_size=2000,
+ training_increment_size=500,
+ test_size=None,
+ ljm_diag_value=None,
+ ljm_sigma=1.0,
+ ljm_epsilon=1.0,
+ r_seed=111,
+ save_benchmarks=False,
+ show_msgs=True)
+ # plot_benchmarks()
+ print('OK!')
diff --git a/c_matrix.py b/lj_matrix/c_matrix.py
index 2bc4d4c0c..f21ccfd8c 100644
--- a/c_matrix.py
+++ b/lj_matrix/c_matrix.py
@@ -21,16 +21,16 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
import time
-from misc import printc
import math
import numpy as np
from numpy.linalg import eig
+from lj_matrix.misc import printc
def c_matrix(mol_data,
nc_data,
max_len=25,
- as_eig=False,
+ as_eig=True,
bohr_radius_units=False):
"""
Creates the Coulomb Matrix from the molecule data given.
@@ -150,13 +150,16 @@ def c_matrix(mol_data,
def c_matrix_multiple(mol_data,
nc_data,
+ pipe=None,
max_len=25,
- as_eig=False,
+ as_eig=True,
bohr_radius_units=False):
"""
Calculates the Coulomb Matrix of multiple molecules.
mol_data: molecule data, matrix of atom coordinates.
nc_data: nuclear charge data, array of atom data.
+ pipe: for multiprocessing purposes. Sends the data calculated
+ through a pipe.
max_len: maximum amount of atoms in molecule.
as_eig: if data should be returned as matrix or array of eigenvalues.
bohr_radius_units: if units should be in bohr's radius units.
@@ -170,4 +173,7 @@ def c_matrix_multiple(mol_data,
toc = time.perf_counter()
printc('\tCM calculation took {:.4f} seconds.'.format(toc - tic), 'GREEN')
+ if pipe:
+ pipe.send(cm_data)
+
return cm_data
diff --git a/cholesky_solve.py b/lj_matrix/cholesky_solve.py
index bc6a572a3..bc6a572a3 100644
--- a/cholesky_solve.py
+++ b/lj_matrix/cholesky_solve.py
diff --git a/lj_matrix/do_ml.py b/lj_matrix/do_ml.py
new file mode 100644
index 000000000..25a55e823
--- /dev/null
+++ b/lj_matrix/do_ml.py
@@ -0,0 +1,227 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+import time
+import numpy as np
+from multiprocessing import Process, Pipe
+from lj_matrix.misc import printc
+from lj_matrix.gauss_kernel import gauss_kernel
+from lj_matrix.cholesky_solve import cholesky_solve
+from lj_matrix.read_qm7_data import read_qm7_data
+from lj_matrix.parallel_create_matrices import parallel_create_matrices
+
+
+def ml(desc_data,
+ energy_data,
+ training_size,
+ desc_type=None,
+ pipe=None,
+ test_size=None,
+ sigma=1000.0,
+ show_msgs=True):
+ """
+ Does the ML methodology.
+ desc_data: descriptor (or representation) data.
+ energy_data: energy data associated with desc_data.
+ training_size: size of the training set to use.
+ desc_type: string with the name of the descriptor used.
+ pipe: for multiprocessing purposes. Sends the data calculated
+ through a pipe.
+ test_size: size of the test set to use. If no size is given,
+ the last remaining molecules are used.
+ sigma: depth of the kernel.
+ show_msgs: Show debug messages or not.
+ NOTE: desc_type is just a string and is only for identification purposes.
+ Also, training is done with the first part of the data and
+ testing with the ending part of the data.
+ """
+ tic = time.perf_counter()
+ # Initial calculations for later use.
+ d_len = len(desc_data)
+ e_len = len(energy_data)
+
+ if not desc_type:
+ desc_type = 'NOT SPECIFIED'
+
+ if d_len != e_len:
+ printc(''.join(['ERROR. Descriptor data size different ',
+ 'than energy data size.']), 'RED')
+ return None
+
+ if training_size >= d_len:
+ printc('ERROR. Training size greater or equal than data size.', 'RED')
+ return None
+
+ if not test_size:
+ test_size = d_len - training_size
+ if test_size > 1500:
+ test_size = 1500
+
+ if show_msgs:
+ printc('{} ML started.'.format(desc_type), 'GREEN')
+ printc('\tTraining size: {}'.format(training_size), 'CYAN')
+ printc('\tTest size: {}'.format(test_size), 'CYAN')
+ printc('\tSigma: {}'.format(sigma), 'CYAN')
+
+ X_training = desc_data[:training_size]
+ Y_training = energy_data[:training_size]
+ K_training = gauss_kernel(X_training, X_training, sigma)
+ alpha_ = cholesky_solve(K_training, Y_training)
+
+ X_test = desc_data[-test_size:]
+ Y_test = energy_data[-test_size:]
+ K_test = gauss_kernel(X_test, X_training, sigma)
+ Y_predicted = np.dot(K_test, alpha_)
+
+ mae = np.mean(np.abs(Y_predicted - Y_test))
+ if show_msgs:
+ printc('\tMAE for {}: {:.4f}'.format(desc_type, mae), 'GREEN')
+
+ toc = time.perf_counter()
+ tictoc = toc - tic
+ if show_msgs:
+ printc('\t{} ML took {:.4f} seconds.'.format(desc_type, tictoc),
+ 'GREEN')
+ printc('\t\tTraining size: {}'.format(training_size), 'CYAN')
+ printc('\t\tTest size: {}'.format(test_size), 'CYAN')
+ printc('\t\tSigma: {}'.format(sigma), 'CYAN')
+
+ if pipe:
+ pipe.send([desc_type, training_size, test_size, sigma, mae, tictoc])
+
+ return mae, tictoc
+
+
+def do_ml(min_training_size,
+ max_training_size=None,
+ training_increment_size=500,
+ test_size=None,
+ ljm_diag_value=None,
+ ljm_sigma=1.0,
+ ljm_epsilon=1.0,
+ r_seed=111,
+ save_benchmarks=False,
+ max_len=25,
+ as_eig=True,
+ bohr_radius_units=False,
+ sigma=1000.0,
+ show_msgs=True):
+ """
+ Main function that does the whole ML process.
+ min_training_size: minimum training size.
+ max_training_size: maximum training size.
+ training_increment_size: training increment size.
+ test_size: size of the test set to use. If no size is given,
+ the last remaining molecules are used.
+ ljm_diag_value: if a special diagonal value should be used in lj matrix.
+ ljm_sigma: sigma value for lj matrix.
+ ljm_epsilon: epsilon value for lj matrix.
+ r_seed: random seed to use for the shuffling.
+ save_benchmarks: if benchmarks should be saved.
+ max_len: maximum amount of atoms in molecule.
+ as_eig: if data should be returned as matrix or array of eigenvalues.
+ bohr_radius_units: if units should be in bohr's radius units.
+ sigma: depth of the kernel.
+ show_msgs: Show debug messages or not.
+ """
+ # Initialization time.
+ init_time = time.perf_counter()
+ if not max_training_size:
+ max_training_size = min_training_size + training_increment_size
+
+ # Data reading.
+ molecules, nuclear_charge, energy_pbe0, energy_delta =\
+ read_qm7_data(r_seed)
+
+ # Matrices calculation.
+ cm_data, ljm_data = parallel_create_matrices(molecules,
+ nuclear_charge,
+ ljm_diag_value,
+ ljm_sigma,
+ ljm_epsilon,
+ max_len,
+ as_eig,
+ bohr_radius_units)
+
+ # ML calculation.
+ procs = []
+ cm_pipes = []
+ ljm_pipes = []
+ for i in range(min_training_size,
+ max_training_size + 1,
+ training_increment_size):
+ cm_recv, cm_send = Pipe(False)
+ p1 = Process(target=ml,
+ args=(cm_data,
+ energy_pbe0,
+ i,
+ 'CM',
+ cm_send,
+ test_size,
+ sigma,
+ show_msgs))
+ procs.append(p1)
+ cm_pipes.append(cm_recv)
+ p1.start()
+
+ ljm_recv, ljm_send = Pipe(False)
+ p2 = Process(target=ml,
+ args=(ljm_data,
+ energy_pbe0,
+ i,
+ 'L-JM',
+ ljm_send,
+ test_size,
+ sigma,
+ show_msgs))
+ procs.append(p2)
+ ljm_pipes.append(ljm_recv)
+ p2.start()
+
+ cm_bench_results = []
+ ljm_bench_results = []
+ for cd_pipe, ljd_pipe in zip(cm_pipes, ljm_pipes):
+ cm_bench_results.append(cd_pipe.recv())
+ ljm_bench_results.append(ljd_pipe.recv())
+
+ for proc in procs:
+ proc.join()
+
+ if save_benchmarks:
+ with open('data\\benchmarks.csv', 'a') as save_file:
+ # save_file.write(''.join(['ml_type,tr_size,te_size,kernel_s,',
+ # 'mae,time,lj_s,lj_e,date_ran\n']))
+ ltime = time.localtime()[:3][::-1]
+ ljm_se = ',' + str(ljm_sigma) + ',' + str(ljm_epsilon) + ','
+ date = '/'.join([str(field) for field in ltime])
+ for cm, ljm, in zip(cm_bench_results, ljm_bench_results):
+ cm_text = ','.join([str(field) for field in cm])\
+ + ',' + date + '\n'
+ ljm_text = ','.join([str(field) for field in ljm])\
+ + ljm_se + date + '\n'
+ save_file.write(cm_text)
+ save_file.write(ljm_text)
+
+ # End of program
+ end_time = time.perf_counter()
+ printc('Program took {:.4f} seconds.'.format(end_time - init_time),
+ 'CYAN')
diff --git a/frob_norm.py b/lj_matrix/frob_norm.py
index 4c3a2945d..4c3a2945d 100644
--- a/frob_norm.py
+++ b/lj_matrix/frob_norm.py
diff --git a/gauss_kernel.py b/lj_matrix/gauss_kernel.py
index 0dfc65d59..5dd8e6406 100644
--- a/gauss_kernel.py
+++ b/lj_matrix/gauss_kernel.py
@@ -22,7 +22,7 @@ SOFTWARE.
"""
import math
import numpy as np
-from frob_norm import frob_norm
+from lj_matrix.frob_norm import frob_norm
def gauss_kernel(X_1, X_2, sigma):
diff --git a/lj_matrix.py b/lj_matrix/lj_matrix.py
index 6769bc0c3..6739ae283 100644
--- a/lj_matrix.py
+++ b/lj_matrix/lj_matrix.py
@@ -21,21 +21,27 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
import time
-from misc import printc
import math
import numpy as np
from numpy.linalg import eig
+from lj_matrix.misc import printc
def lj_matrix(mol_data,
nc_data,
+ diag_value=None,
+ sigma=1.0,
+ epsilon=1.0,
max_len=25,
- as_eig=False,
+ as_eig=True,
bohr_radius_units=False):
"""
Creates the Lennard-Jones Matrix from the molecule data given.
mol_data: molecule data, matrix of atom coordinates.
nc_data: nuclear charge data, array of atom data.
+ diag_value: if special diagonal value is to be used.
+ sigma: sigma value.
+ epsilon: epsilon value.
max_len: maximum amount of atoms in molecule.
as_eig: if data should be returned as matrix or array of eigenvalues.
bohr_radius_units: if units should be in bohr's radius units.
@@ -82,7 +88,10 @@ def lj_matrix(mol_data,
z = (z_i-z_j)**2
if i == j:
- lj[i, j] = (0.5*Z_i**2.4)
+ if diag_value is None:
+ lj[i, j] = (0.5*Z_i**2.4)
+ else:
+ lj[i, j] = diag_value
else:
# Calculations are done after i==j is checked
# so no division by zero is done.
@@ -92,11 +101,11 @@ def lj_matrix(mol_data,
# Conversion factor is included in r^2.
# 1/r^2
- r_2 = 1/(conversion_rate**2*(x + y + z))
+ r_2 = sigma**2/(conversion_rate**2*(x + y + z))
r_6 = math.pow(r_2, 3)
r_12 = math.pow(r_6, 2)
- lj[i, j] = (4*(r_12 - r_6))
+ lj[i, j] = (4*epsilon*(r_12 - r_6))
else:
break
@@ -140,7 +149,10 @@ def lj_matrix(mol_data,
z = (z_i-z_j)**2
if i == j:
- lj_row.append(0.5*Z_i**2.4)
+ if not diag_value:
+ lj_row.append(0.5*Z_i**2.4)
+ else:
+ lj_row.append(diag_value)
else:
# Calculations are done after i==j is checked
# so no division by zero is done.
@@ -150,11 +162,11 @@ def lj_matrix(mol_data,
# Conversion factor is included in r^2.
# 1/r^2
- r_2 = 1/(conversion_rate**2*(x + y + z))
+ r_2 = sigma**2/(conversion_rate**2*(x + y + z))
r_6 = math.pow(r_2, 3)
r_12 = math.pow(r_6, 2)
- lj_row.append(4*(r_12 - r_6))
+ lj_row.append(4*epsilon*(r_12 - r_6))
lj_temp.append(np.array(lj_row))
@@ -168,13 +180,22 @@ def lj_matrix(mol_data,
def lj_matrix_multiple(mol_data,
nc_data,
+ pipe=None,
+ diag_value=None,
+ sigma=1.0,
+ epsilon=1.0,
max_len=25,
- as_eig=False,
+ as_eig=True,
bohr_radius_units=False):
"""
Calculates the Lennard-Jones Matrix of multiple molecules.
mol_data: molecule data, matrix of atom coordinates.
nc_data: nuclear charge data, array of atom data.
+ pipe: for multiprocessing purposes. Sends the data calculated
+ through a pipe.
+ diag_value: if special diagonal value is to be used.
+ sigma: sigma value.
+ epsilon: epsilon value.
max_len: maximum amount of atoms in molecule.
as_eig: if data should be returned as matrix or array of eigenvalues.
bohr_radius_units: if units should be in bohr's radius units.
@@ -182,10 +203,20 @@ def lj_matrix_multiple(mol_data,
printc('L-J Matrices calculation started.', 'CYAN')
tic = time.perf_counter()
- ljm_data = np.array([lj_matrix(mol, nc, max_len, as_eig, bohr_radius_units)
+ ljm_data = np.array([lj_matrix(mol,
+ nc,
+ diag_value,
+ sigma,
+ epsilon,
+ max_len,
+ as_eig,
+ bohr_radius_units)
for mol, nc in zip(mol_data, nc_data)])
toc = time.perf_counter()
printc('\tL-JM calculation took {:.4f} seconds.'.format(toc-tic), 'GREEN')
+ if pipe:
+ pipe.send(ljm_data)
+
return ljm_data
diff --git a/lj_matrix/misc.py b/lj_matrix/misc.py
new file mode 100644
index 000000000..e9142b05f
--- /dev/null
+++ b/lj_matrix/misc.py
@@ -0,0 +1,174 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+from colorama import init, Fore, Style
+import pandas as pd
+
+init()
+
+
+def printc(text, color):
+ """
+ Prints texts normally, but in color. Using colorama.
+ text: string with the text to print.
+ color: color to be used, same as available in colorama.
+ """
+ color_dic = {'BLACK': Fore.BLACK,
+ 'RED': Fore.RED,
+ 'GREEN': Fore.GREEN,
+ 'YELLOW': Fore.YELLOW,
+ 'BLUE': Fore.BLUE,
+ 'MAGENTA': Fore.MAGENTA,
+ 'CYAN': Fore.CYAN,
+ 'WHITE': Fore.WHITE,
+ 'RESET': Fore.RESET}
+
+ color_dic_keys = color_dic.keys()
+ if color not in color_dic_keys:
+ print(Fore.RED
+ + '\'{}\' not found, using default color.'.format(color)
+ + Style.RESET_ALL)
+ actual_color = Fore.RESET
+ else:
+ actual_color = color_dic[color]
+
+ print(actual_color + text + Style.RESET_ALL)
+
+
+def plot_benchmarks():
+ """
+ For plotting the benchmarks.
+ """
+ # Original columns.
+ or_cols = ['ml_type',
+ 'tr_size',
+ 'te_size',
+ 'kernel_s',
+ 'mae',
+ 'time',
+ 'lj_s',
+ 'lj_e',
+ 'date_ran']
+ # Drop some original columns.
+ dor_cols = ['te_size',
+ 'kernel_s',
+ 'time',
+ 'date_ran']
+
+ # Read benchmarks data and drop some columns.
+ data_temp = pd.read_csv('data\\benchmarks.csv',)
+ data = pd.DataFrame(data_temp, columns=or_cols)
+ data = data.drop(columns=dor_cols)
+
+ # Get the data of the first benchmarks and drop unnecessary columns.
+ first_data = pd.DataFrame(data, index=range(0, 22))
+ first_data = first_data.drop(columns=['lj_s', 'lj_e'])
+
+ # Columns to keep temporarily.
+ fd_columns = ['ml_type',
+ 'tr_size',
+ 'mae']
+
+ # Create new dataframes for each matrix descriptor and fill them.
+ first_data_cm = pd.DataFrame(columns=fd_columns)
+ first_data_ljm = pd.DataFrame(columns=fd_columns)
+ for i in range(first_data.shape[0]):
+ temp_df = first_data.iloc[[i]]
+ if first_data.at[i, 'ml_type'] == 'CM':
+ first_data_cm = first_data_cm.append(temp_df)
+ else:
+ first_data_ljm = first_data_ljm.append(temp_df)
+
+ # Drop unnecessary column and rename 'mae' for later use.
+ first_data_cm = first_data_cm.drop(columns=['ml_type'])\
+ .rename(columns={'mae': 'cm_mae'})
+ first_data_ljm = first_data_ljm.drop(columns=['ml_type'])\
+ .rename(columns={'mae': 'ljm_mae'})
+ # print(first_data_cm)
+ # print(first_data_ljm)
+
+ # Get the cm data axis so it can be joined with the ljm data axis.
+ cm_axis = first_data_cm.plot(x='tr_size',
+ y='cm_mae',
+ kind='line')
+ # Get the ljm data axis and join it with the cm one.
+ plot_axis = first_data_ljm.plot(ax=cm_axis,
+ x='tr_size',
+ y='ljm_mae',
+ kind='line')
+ plot_axis.set_xlabel('tr_size')
+ plot_axis.set_ylabel('mae')
+ plot_axis.set_title('mae for different tr_sizes')
+ # Get the figure and save it.
+ # plot_axis.get_figure().savefig('.figs\\mae_diff_tr_sizes.pdf')
+
+ # Get the rest of the benchmark data and drop unnecessary column.
+ new_data = data.drop(index=range(0, 22))
+ new_data = new_data.drop(columns=['ml_type'])
+
+ # Get the first set and rename it.
+ nd_first = first_data_ljm.rename(columns={'ljm_mae': '1, 1'})
+ ndf_axis = nd_first.plot(x='tr_size',
+ y='1, 1',
+ kind='line')
+ last_axis = ndf_axis
+ for i in range(22, 99, 11):
+ lj_s = new_data['lj_s'][i]
+ lj_e = new_data['lj_e'][i]
+ new_mae = '{}, {}'.format(lj_s, lj_e)
+ nd_temp = pd.DataFrame(new_data, index=range(i, i + 11))\
+ .drop(columns=['lj_s', 'lj_e'])\
+ .rename(columns={'mae': new_mae})
+ last_axis = nd_temp.plot(ax=last_axis,
+ x='tr_size',
+ y=new_mae,
+ kind='line')
+ print(nd_temp)
+
+ last_axis.set_xlabel('tr_size')
+ last_axis.set_ylabel('mae')
+ last_axis.set_title('mae for different parameters of lj(s)')
+
+ last_axis.get_figure().savefig('.figs\\mae_diff_param_lj_s.pdf')
+
+ ndf_axis = nd_first.plot(x='tr_size',
+ y='1, 1',
+ kind='line')
+ last_axis = ndf_axis
+ for i in range(99, data.shape[0], 11):
+ lj_s = new_data['lj_s'][i]
+ lj_e = new_data['lj_e'][i]
+ new_mae = '{}, {}'.format(lj_s, lj_e)
+ nd_temp = pd.DataFrame(new_data, index=range(i, i + 11))\
+ .drop(columns=['lj_s', 'lj_e'])\
+ .rename(columns={'mae': new_mae})
+ last_axis = nd_temp.plot(ax=last_axis,
+ x='tr_size',
+ y=new_mae,
+ kind='line')
+ print(nd_temp)
+
+ last_axis.set_xlabel('tr_size')
+ last_axis.set_ylabel('mae')
+ last_axis.set_title('mae for different parameters of lj(e)')
+
+ last_axis.get_figure().savefig('.figs\\mae_diff_param_lj_e.pdf')
diff --git a/lj_matrix/parallel_create_matrices.py b/lj_matrix/parallel_create_matrices.py
new file mode 100644
index 000000000..cd5ef5c8e
--- /dev/null
+++ b/lj_matrix/parallel_create_matrices.py
@@ -0,0 +1,85 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+from multiprocessing import Process, Pipe
+from lj_matrix.c_matrix import c_matrix_multiple
+from lj_matrix.lj_matrix import lj_matrix_multiple
+
+
+def parallel_create_matrices(mol_data,
+ nc_data,
+ ljm_diag_value=None,
+ ljm_sigma=1.0,
+ ljm_epsilon=1.0,
+ max_len=25,
+ as_eig=True,
+ bohr_radius_units=False):
+ """
+ Creates the Coulomb and L-J matrices in parallel.
+ mol_data: molecule data, matrix of atom coordinates.
+ nc_data: nuclear charge data, array of atom data.
+ ljm_diag_value: if special diagonal value is to be used for lj matrix.
+ ljm_sigma: sigma value for lj matrix.
+ ljm_epsilon: epsilon value for lj matrix.
+ max_len: maximum amount of atoms in molecule.
+ as_eig: if data should be returned as matrix or array of eigenvalues.
+ bohr_radius_units: if units should be in bohr's radius units.
+ """
+
+ # Matrices calculation.
+ procs = []
+ pipes = []
+
+ cm_recv, cm_send = Pipe(False)
+ p1 = Process(target=c_matrix_multiple,
+ args=(mol_data,
+ nc_data,
+ cm_send,
+ max_len,
+ as_eig,
+ bohr_radius_units))
+ procs.append(p1)
+ pipes.append(cm_recv)
+ p1.start()
+
+ ljm_recv, ljm_send = Pipe(False)
+ p2 = Process(target=lj_matrix_multiple,
+ args=(mol_data,
+ nc_data,
+ ljm_send,
+ ljm_diag_value,
+ ljm_sigma,
+ ljm_epsilon,
+ max_len,
+ as_eig,
+ bohr_radius_units))
+ procs.append(p2)
+ pipes.append(ljm_recv)
+ p2.start()
+
+ cm_data = pipes[0].recv()
+ ljm_data = pipes[1].recv()
+
+ for proc in procs:
+ proc.join()
+
+ return cm_data, ljm_data
diff --git a/read_qm7_data.py b/lj_matrix/read_qm7_data.py
index 068ea1a42..4401ca1c0 100644
--- a/read_qm7_data.py
+++ b/lj_matrix/read_qm7_data.py
@@ -24,7 +24,7 @@ import os
import time
import numpy as np
import random
-from misc import printc
+from lj_matrix.misc import printc
# 'periodic_table_of_elements.txt' retrieved from
@@ -51,7 +51,7 @@ def read_nc_data(data_path):
# 'hof_qm7.txt.txt' retrieved from
# https://github.com/qmlcode/tutorial
-def reas_db_data(zi_data,
+def read_db_data(zi_data,
data_path,
r_seed=111):
"""
@@ -59,7 +59,7 @@ def reas_db_data(zi_data,
its contents as usable variables.
zi_data: dictionary containing nuclear charge data.
data_path: path to the data directory.
- r_seed: random seed.
+ r_seed: random seed to use for the shuffling.
"""
os.chdir(data_path)
@@ -122,9 +122,10 @@ def reas_db_data(zi_data,
return molecules, nuclear_charge, energy_pbe0, energy_delta
-def read_qm7_data():
+def read_qm7_data(r_seed=111):
"""
Reads all the qm7 data.
+ r_seed: random seed to use for the shuffling.
"""
tic = time.perf_counter()
printc('Data reading started.', 'CYAN')
@@ -135,10 +136,10 @@ def read_qm7_data():
zi_data = read_nc_data(data_path)
molecules, nuclear_charge, energy_pbe0, energy_delta = \
- reas_db_data(zi_data, data_path)
+ read_db_data(zi_data, data_path, r_seed)
os.chdir(init_path)
toc = time.perf_counter()
printc('\tData reading took {:.4f} seconds.'.format(toc-tic), 'GREEN')
- return zi_data, molecules, nuclear_charge, energy_pbe0, energy_delta
+ return molecules, nuclear_charge, energy_pbe0, energy_delta
diff --git a/lj_matrix/version.py b/lj_matrix/version.py
new file mode 100644
index 000000000..fab58433d
--- /dev/null
+++ b/lj_matrix/version.py
@@ -0,0 +1,23 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+__version__ = '0.0.1'
diff --git a/requirements.txt b/requirements.txt
index f91fd71c2..28b557ddb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,4 @@
-colorama==0.4.1
-numpy==1.17.4
+colorama==0.4.3
+numpy==1.18.0
+pandas==0.25.3
+matplotlib==3.1.2 \ No newline at end of file
diff --git a/setup.py b/setup.py
new file mode 100644
index 000000000..719ef3ce0
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,102 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+# This setup.py template was obtained from
+# https://github.com/navdeep-G/setup.py/blob/master/setup.py
+# ----------------------------------------------------------------------
+# Note: To use the 'upload' functionality of this file, you must:
+# $ pipenv install twine --dev
+
+import io
+import os
+
+from setuptools import find_packages, setup
+
+from lj_matrix.version import __version__
+
+# Package meta-data.
+NAME = 'lj_matrix'
+DESCRIPTION = 'A Lennard Jones matrix exploration.'
+URL = 'https://github.com/luevano/lj_matrix'
+EMAIL = 'a301436@uach.mx'
+AUTHOR = 'David Luevano Alvarado'
+REQUIRES_PYTHON = '>=3.7'
+VERSION = __version__
+# VERSION = '0.0.1'
+
+# What packages are required for this module to be executed?
+REQUIRED = [
+ # NOTE(review): empty; requirements.txt pins numpy, colorama, etc. -- confirm.
+]
+
+# What packages are optional?
+EXTRAS = {
+ # 'fancy feature': ['django'],
+}
+
+# The rest you shouldn't have to touch too much :)
+# ------------------------------------------------
+# Except, perhaps the License and Trove Classifiers!
+# If you do change the License, remember to change
+# the Trove Classifier for that!
+
+here = os.path.abspath(os.path.dirname(__file__))
+
+# Import the README and use it as the long-description.
+# Note: this will only work if 'README.md'
+# is present in your MANIFEST.in file!
+try:
+ with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
+ long_description = '\n' + f.read()
+except FileNotFoundError:
+ long_description = DESCRIPTION
+
+# Where the magic happens:
+setup(
+    name=NAME,
+    version=VERSION,
+    description=DESCRIPTION,
+    long_description=long_description,
+    long_description_content_type='text/markdown',
+    author=AUTHOR,
+    author_email=EMAIL,
+    python_requires=REQUIRES_PYTHON,
+    url=URL,
+    # The suite lives in 'test/' (singular); keep it out of the installed set.
+    packages=find_packages(exclude=["test", "test.*",
+                                    "tests", "tests.*",
+                                    "*.tests", "*.tests.*"]),
+    # If your package is a single module, use this instead of 'packages':
+    # py_modules=['mypackage'],
+    install_requires=REQUIRED,
+    extras_require=EXTRAS,
+    include_package_data=True,
+    license='MIT',
+    classifiers=[
+        # Trove classifiers
+        # Full list: https://pypi.python.org/pypi?%3Aaction=list_classifiers
+        'License :: OSI Approved :: MIT License',
+        'Programming Language :: Python',
+        'Programming Language :: Python :: 3',
+        'Programming Language :: Python :: 3.7'
+    ]
+)
diff --git a/test/__init__.py b/test/__init__.py
new file mode 100644
index 000000000..48cd14913
--- /dev/null
+++ b/test/__init__.py
@@ -0,0 +1,22 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
diff --git a/main.py b/test/test_c_matrix.py
index 734069920..a8bb5ae34 100644
--- a/main.py
+++ b/test/test_c_matrix.py
@@ -20,41 +20,14 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
-import time
-from misc import printc
-# import matplotlib.pyplot as plt
-from read_qm7_data import read_qm7_data
-from c_matrix import c_matrix_multiple
-from lj_matrix import lj_matrix_multiple
-from do_ml import do_ml
-
-
-# Initialization time.
-init_time = time.perf_counter()
-
-# Data reading.
-zi_data, molecules, nuclear_charge, energy_pbe0, energy_delta =\
- read_qm7_data()
-
-# Matrices calculation.
-cm_data = c_matrix_multiple(molecules, nuclear_charge, as_eig=True)
-ljm_data = lj_matrix_multiple(molecules, nuclear_charge, as_eig=True)
-
-# ML calculation.
-do_ml(cm_data,
- energy_pbe0,
- 1000,
- test_size=100,
- sigma=1000.0,
- desc_type='CM')
-do_ml(ljm_data,
- energy_pbe0,
- 1000,
- test_size=100,
- sigma=1000.0,
- desc_type='L-JM')
-
-# End of program
-end_time = time.perf_counter()
-printc('Program took {:.4f} seconds of runtime.'.format(end_time - init_time),
- 'CYAN')
+import unittest
+from lj_matrix.c_matrix import c_matrix
+
+
+class TestCMatrix(unittest.TestCase):
+ def test_c_matrix(self):
+ self.assertAlmostEqual(1, 1)  # placeholder: the imported c_matrix is not called
+
+
+if __name__ == '__main__':
+ unittest.main()