summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author David Luevano Alvarado <55825613+luevano@users.noreply.github.com> 2020-02-27 16:50:57 -0700
committer David Luevano Alvarado <55825613+luevano@users.noreply.github.com> 2020-02-27 16:50:57 -0700
commit c4d30a346d114185ab93eb5e88e241b922b5a538 (patch)
tree c2bff1ba9802078904c57783e8bd397c36d030eb
parent b937df41c1c5e996be94a3a690908ea989e281dc (diff)
Add simple ml methodology, fix bugs
-rw-r--r-- ml_exp/__init__.py 5
-rw-r--r-- ml_exp/compound.py 2
-rw-r--r-- ml_exp/do_ml.py 65
-rw-r--r-- ml_exp/kernels.py 6
4 files changed, 56 insertions, 22 deletions
diff --git a/ml_exp/__init__.py b/ml_exp/__init__.py
index c62141a58..48ece56c8 100644
--- a/ml_exp/__init__.py
+++ b/ml_exp/__init__.py
@@ -25,6 +25,7 @@ from ml_exp.representations import coulomb_matrix, lennard_jones_matrix,\
first_neighbor_matrix, adjacency_matrix, check_bond, bag_of_stuff
from ml_exp.math import cholesky_solve
from ml_exp.qm7db import qm7db
+from ml_exp.do_ml import simple_ml, do_ml
__all__ = ['Compound',
'coulomb_matrix',
@@ -34,4 +35,6 @@ __all__ = ['Compound',
'check_bond',
'bag_of_stuff',
'cholesky_solve',
- 'qm7db']
+ 'qm7db',
+ 'simple_ml',
+ 'do_ml']
diff --git a/ml_exp/compound.py b/ml_exp/compound.py
index 9c7102860..1598fa482 100644
--- a/ml_exp/compound.py
+++ b/ml_exp/compound.py
@@ -93,8 +93,8 @@ class Compound:
bohr_ru=bohr_ru)
def gen_am(self,
- size=23,
use_forces=False,
+ size=23,
bohr_ru=False):
"""
Generate the Adjacency Matrix for the compund.
diff --git a/ml_exp/do_ml.py b/ml_exp/do_ml.py
index f32480367..40243d413 100644
--- a/ml_exp/do_ml.py
+++ b/ml_exp/do_ml.py
@@ -31,19 +31,19 @@ from ml_exp.qm7db import qm7db
def simple_ml(descriptors,
energies,
training_size,
- identifier=None,
test_size=None,
sigma=1000.0,
+ identifier=None,
show_msgs=True):
"""
Basic ML methodology for a single descriptor type.
descriptors: array of descriptors.
energies: array of energies.
training_size: size of the training set to use.
- identifier: string with the name of the descriptor used.
test_size: size of the test set to use. If no size is given,
the last remaining molecules are used.
sigma: depth of the kernel.
+ identifier: string with the name of the descriptor used.
show_msgs: if debug messages should be shown.
NOTE: identifier is just a string and is only for identification purposes.
Also, training is done with the first part of the data and
@@ -86,7 +86,7 @@ def simple_ml(descriptors,
mae = np.mean(np.abs(Y_predicted - Y_test))
if show_msgs:
- printc('\tMAE for {identifier}: {mae:.4f}', 'GREEN')
+ printc(f'\tMAE for {identifier}: {mae:.4f}', 'GREEN')
toc = time.perf_counter()
tictoc = toc - tic
@@ -102,20 +102,27 @@ def simple_ml(descriptors,
def do_ml(db_path='data',
is_shuffled=True,
r_seed=111,
+ diag_value=None,
+ lj_sigma=1.0,
+ lj_epsilon=1.0,
+ use_forces=False,
+ size=23,
+ as_eig=True,
+ bohr_ru=False,
+ sigma=1000.0,
show_msgs=True):
"""
Main function that does the whole ML process.
- training_size: minimum training size.
- test_size: size of the test set to use. If no size is given,
- the last remaining molecules are used.
- ljm_diag_value: if a special diagonal value should be used in lj matrix.
- ljm_sigma: sigma value for lj matrix.
- ljm_epsilon: epsilon value for lj matrix.
+ db_path: path to the database directory.
+ is_shuffled: if the resulting list of compounds should be shuffled.
r_seed: random seed to use for the shuffling.
- save_benchmarks: if benchmarks should be saved.
- size: maximum amount of atoms in molecule.
- as_eig: if data should be returned as matrix or array of eigenvalues.
- bohr_radius_units: if units should be in bohr's radius units.
+ diag_value: if special diagonal value is to be used.
+ lj_sigma: sigma value.
+ lj_epsilon: epsilon value.
+ use_forces: if the use of forces instead of k_cx should be used.
+ size: compound size.
+ as_eig: if the representation should be as the eigenvalues.
+ bohr_ru: if radius units should be in bohr's radius units.
sigma: depth of the kernel.
show_msgs: if debug messages should be shown.
"""
@@ -134,16 +141,40 @@ def do_ml(db_path='data',
# Matrices calculation.
tic = time.perf_counter()
for compound in compounds:
- compound.gen_cm()
- compound.gen_ljm()
- compound.gen_am()
+ compound.gen_cm(size=size,
+ as_eig=as_eig,
+ bohr_ru=bohr_ru)
+ compound.gen_ljm(diag_value=diag_value,
+ sigma=lj_sigma,
+ epsilon=lj_epsilon,
+ size=size,
+ as_eig=as_eig,
+ bohr_ru=bohr_ru)
+ compound.gen_am(use_forces=use_forces,
+ size=size,
+ bohr_ru=bohr_ru)
+
+ # Create a numpy array for the descriptors.
+ cm_data = np.array([compound.cm for compound in compounds], dtype=float)
+ ljm_data = np.array([compound.ljm for compound in compounds], dtype=float)
+ am_data = np.array([compound.cm for compound in compounds], dtype=float)
+ print(cm_data.shape, ljm_data.shape, am_data.shape)
+
toc = time.perf_counter()
tictoc = toc - tic
if show_msgs:
printc(f'Matrices calculation took {tictoc:.4f} seconds.', 'CYAN')
# ML calculation.
- # PLHLDR
+ # CM
+ cm_mae, cm_tictoc = simple_ml(cm_data,
+ energy_pbe0,
+ training_size=5000,
+ test_size=1500,
+ sigma=1000.0,
+ identifier='CM',
+ show_msgs=show_msgs)
+ print(cm_mae, cm_tictoc)
# End of program
end_time = time.perf_counter()
diff --git a/ml_exp/kernels.py b/ml_exp/kernels.py
index 7a61a1e1e..3914ffc20 100644
--- a/ml_exp/kernels.py
+++ b/ml_exp/kernels.py
@@ -36,9 +36,9 @@ def gaussian_kernel(X1,
inv_sigma = -0.5 / (sigma*sigma)
K = np.zeros((X1.shape[0], X2.shape[0]), dtype=float)
- for i in X1:
- for j in X2:
- f_norm = np.linalg.norm(i - j)
+ for i, x1 in enumerate(X1):
+ for j, x2 in enumerate(X2):
+ f_norm = np.linalg.norm(x1 - x2)
# print(f_norm)
K[i, j] = math.exp(inv_sigma * f_norm)