First commit

author: David Luevano <55825613+luevano@users.noreply.github.com> 2019-12-08 21:54:19 -0700
committer: David Luevano <55825613+luevano@users.noreply.github.com> 2019-12-08 21:54:19 -0700
commit: 7c8b391dddd65ff58417df80dfc79cadddf4f4e6 (patch)
tree: 0bad908f5b40848addc3307de4307b1dae4c6e7e
11 files changed, 718 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 000000000..440f0a8ae
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,113 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+#
+#
+# PERSONAL IGNORES.
+#
+#
+
+# vscode
+.vscode/
diff --git a/c_matrix.py b/c_matrix.py
new file mode 100644
index 000000000..b47e15aac
--- /dev/null
+++ b/c_matrix.py
@@ -0,0 +1,124 @@
+import math
+import numpy as np
+from numpy.linalg import eig
+
+
+def c_matrix(mol_data,
+             nc_data,
+             max_len=25,
+             as_eig=False,
+             bohr_radius_units=False):
+    """
+    Creates the coulomb matrix from the molecule data given.
+    mol_data: molecule data, matrix of atom coordinates.
+    nc_data: nuclear charge data, array of atom data.
+    max_len: maximum amount of atoms in molecule.
+    as_eig: if data should be returned as matrix or array of eigenvalues.
+    bohr_radius_units: if units should be in bohr's radius units.
+    """
+    if bohr_radius_units:
+        conversion_rate = 0.52917721067
+    else:
+        conversion_rate = 1
+
+    mol_n = len(mol_data)
+    mol_nr = range(mol_n)
+
+    if not mol_n == len(nc_data):
+        print(''.join(['Error. Molecule matrix dimension is different ',
+                       'than the nuclear charge array dimension.']))
+    else:
+        if max_len < mol_n:
+            print(''.join(['Error. Molecule matrix dimension (mol_n) is ',
+                           'greater than max_len. Using mol_n.']))
+            max_len = None
+
+        if max_len:
+            cm = np.zeros((max_len, max_len))
+            ml_r = range(max_len)
+
+            # Actual calculation of the coulomb matrix.
+            for i in ml_r:
+                if i < mol_n:
+                    x_i = mol_data[i, 0]
+                    y_i = mol_data[i, 1]
+                    z_i = mol_data[i, 2]
+                    Z_i = nc_data[i]
+                else:
+                    break
+
+                for j in ml_r:
+                    if j < mol_n:
+                        x_j = mol_data[j, 0]
+                        y_j = mol_data[j, 1]
+                        z_j = mol_data[j, 2]
+                        Z_j = nc_data[j]
+
+                        x = (x_i-x_j)**2
+                        y = (y_i-y_j)**2
+                        z = (z_i-z_j)**2
+
+                        if i == j:
+                            cm[i, j] = (0.5*Z_i**2.4)
+                        else:
+                            cm[i, j] = (conversion_rate*Z_i*Z_j/math.sqrt(x
+                                                                          + y
+                                                                          + z))
+                    else:
+                        break
+
+            # Now the value will be returned.
+            if as_eig:
+                cm_sorted = np.sort(eig(cm)[0])[::-1]
+                # Thanks to SO for the following lines of code.
+                # https://stackoverflow.com/a/43011036
+
+                # Keep zeros at the end.
+                mask = cm_sorted != 0.
+                f_mask = mask.sum(0, keepdims=1) >\
+                    np.arange(cm_sorted.shape[0]-1, -1, -1)
+
+                f_mask = f_mask[::-1]
+                cm_sorted[f_mask] = cm_sorted[mask]
+                cm_sorted[~f_mask] = 0.
+
+                return cm_sorted
+
+            else:
+                return cm
+
+        else:
+            cm_temp = []
+            # Actual calculation of the coulomb matrix.
+            for i in mol_nr:
+                x_i = mol_data[i, 0]
+                y_i = mol_data[i, 1]
+                z_i = mol_data[i, 2]
+                Z_i = nc_data[i]
+
+                cm_row = []
+                for j in mol_nr:
+                    x_j = mol_data[j, 0]
+                    y_j = mol_data[j, 1]
+                    z_j = mol_data[j, 2]
+                    Z_j = nc_data[j]
+
+                    x = (x_i-x_j)**2
+                    y = (y_i-y_j)**2
+                    z = (z_i-z_j)**2
+
+                    if i == j:
+                        cm_row.append(0.5*Z_i**2.4)
+                    else:
+                        cm_row.append(conversion_rate*Z_i*Z_j/math.sqrt(x
+                                                                        + y
+                                                                        + z))
+
+                cm_temp.append(np.array(cm_row))
+
+            cm = np.array(cm_temp)
+            # Now the value will be returned.
+            if as_eig:
+                return np.sort(eig(cm)[0])[::-1]
+            else:
+                return cm
diff --git a/cholesky_solve.py b/cholesky_solve.py
new file mode 100644
index 000000000..b2addc01c
--- /dev/null
+++ b/cholesky_solve.py
@@ -0,0 +1,42 @@
+import numpy as np
+from numpy.linalg import cholesky
+
+
+def cholesky_solve(K, y):
+    """
+    Applies Cholesky decomposition to obtain the 'alpha coeficients'.
+    K: kernel.
+    y: known parameters.
+    """
+    # The initial mathematical problem is to solve Ka=y.
+
+    # First, add a small lambda value.
+    K[np.diag_indices_from(K)] += 1e-8
+
+    # Get the Cholesky decomposition of the kernel.
+    L = cholesky(K)
+    size = len(L)
+
+    # Solve Lx=y for x.
+    x = np.zeros(size)
+    x[0] = y[0] / L[0, 0]
+    for i in range(1, size):
+        temp_sum = 0.0
+        for j in range(i):
+            temp_sum += L[i, j] * x[j]
+        x[i] = (y[i] - temp_sum) / L[i, i]
+
+    # Now, solve LTa=x for a.
+    L2 = L.T
+    a = np.zeros(size)
+    a_ms = size - 1
+    a[a_ms] = x[a_ms] / L2[a_ms, a_ms]
+    # Because of the form of L2 (upper triangular matriz), an inversion of
+    # range() needs to be done.
+    for i in range(0, a_ms)[::-1]:
+        temp_sum = 0.0
+        for j in range(i, size)[::-1]:
+            temp_sum += L2[i, j] * a[j]
+        a[i] = (x[i] - temp_sum) / L2[i, i]
+
+    return a
diff --git a/desktop.ini b/desktop.ini
new file mode 100644
index 000000000..dce88a3ef
--- /dev/null
+++ b/desktop.ini
@@ -0,0 +1,5 @@
+[.ShellClassInfo]
+InfoTip=This folder is shared online.
+IconFile=C:\Program Files\Google\Drive\googledrivesync.exe
+IconIndex=16
+    
+\ No newline at end of file
diff --git a/frob_norm.py b/frob_norm.py
new file mode 100644
index 000000000..8f8dc48bd
--- /dev/null
+++ b/frob_norm.py
@@ -0,0 +1,29 @@
+import math
+
+
+def frob_norm(array):
+    """
+    Calculates the frobenius norm of a given array or matrix.
+    array: array of data.
+    """
+
+    arr_sh_len = len(array.shape)
+    arr_range = range(len(array))
+    fn = 0.0
+
+    # If it is a 'vector'.
+    if arr_sh_len == 1:
+        for i in arr_range:
+            fn += array[i]*array[i]
+
+        return math.sqrt(fn)
+
+    # If it is a matrix.
+    elif arr_sh_len == 2:
+        for i in arr_range:
+            for j in arr_range:
+                fn += array[i, j]*array[i, j]
+
+        return math.sqrt(fn)
+    else:
+        print('Error. Array size greater than 2 ({}).'.format(arr_sh_len))
diff --git a/gauss_kernel.py b/gauss_kernel.py
new file mode 100644
index 000000000..3b0a5a198
--- /dev/null
+++ b/gauss_kernel.py
@@ -0,0 +1,27 @@
+import math
+import numpy as np
+from frob_norm import frob_norm
+
+
+def gauss_kernel(X_1, X_2, sigma):
+    """
+    Calculates the Gaussian Kernel.
+    X_1: first representations.
+    X_2: second representations.
+    sigma: kernel width.
+    """
+    x1_l = len(X_1)
+    x1_range = range(x1_l)
+    x2_l = len(X_2)
+    x2_range = range(x2_l)
+
+    inv_sigma = -0.5 / (sigma*sigma)
+
+    K = np.zeros((x1_l, x2_l))
+    for i in x1_range:
+        for j in x2_range:
+            f_norm = frob_norm(X_1[i] - X_2[j])
+            # print(f_norm)
+            K[i, j] = math.exp(inv_sigma * f_norm)
+
+    return K
diff --git a/lj_matrix.py b/lj_matrix.py
new file mode 100644
index 000000000..bb5506fdb
--- /dev/null
+++ b/lj_matrix.py
@@ -0,0 +1,142 @@
+import math
+import numpy as np
+from numpy.linalg import eig
+
+
+def lj_matrix(mol_data,
+              nc_data,
+              max_len=25,
+              as_eig=False,
+              bohr_radius_units=False):
+    """
+    Creates the coulomb matrix from the molecule data given.
+    mol_data: molecule data, matrix of atom coordinates.
+    nc_data: nuclear charge data, array of atom data.
+    max_len: maximum amount of atoms in molecule.
+    as_eig: if data should be returned as matrix or array of eigenvalues.
+    bohr_radius_units: if units should be in bohr's radius units.
+    """
+    if bohr_radius_units:
+        conversion_rate = 0.52917721067
+    else:
+        conversion_rate = 1
+
+    mol_n = len(mol_data)
+    mol_nr = range(mol_n)
+
+    if not mol_n == len(nc_data):
+        print(''.join(['Error. Molecule matrix dimension is different ',
+                       'than the nuclear charge array dimension.']))
+    else:
+        if max_len < mol_n:
+            print(''.join(['Error. Molecule matrix dimension (mol_n) is ',
+                           'greater than max_len. Using mol_n.']))
+            max_len = None
+
+        if max_len:
+            lj = np.zeros((max_len, max_len))
+            ml_r = range(max_len)
+
+            # Actual calculation of the coulomb matrix.
+            for i in ml_r:
+                if i < mol_n:
+                    x_i = mol_data[i, 0]
+                    y_i = mol_data[i, 1]
+                    z_i = mol_data[i, 2]
+                    Z_i = nc_data[i]
+                else:
+                    break
+
+                for j in ml_r:
+                    if j < mol_n:
+                        x_j = mol_data[j, 0]
+                        y_j = mol_data[j, 1]
+                        z_j = mol_data[j, 2]
+
+                        x = (x_i-x_j)**2
+                        y = (y_i-y_j)**2
+                        z = (z_i-z_j)**2
+
+                        if i == j:
+                            lj[i, j] = (0.5*Z_i**2.4)
+                        else:
+                            # Calculations are done after i==j is checked
+                            # so no division by zero is done.
+
+                            # A little play with r exponents
+                            # so no square root is calculated.
+                            # Conversion factor is included in r^2.
+
+                            # 1/r^2
+                            r_2 = 1/(conversion_rate**2*(x + y + z))
+
+                            r_6 = math.pow(r_2, 3)
+                            r_12 = math.pow(r_6, 2)
+                            lj[i, j] = (4*(r_12 - r_6))
+                    else:
+                        break
+
+            # Now the value will be returned.
+            if as_eig:
+                lj_sorted = np.sort(eig(lj)[0])[::-1]
+                # Thanks to SO for the following lines of code.
+                # https://stackoverflow.com/a/43011036
+
+                # Keep zeros at the end.
+                mask = lj_sorted != 0.
+                f_mask = mask.sum(0, keepdims=1) >\
+                    np.arange(lj_sorted.shape[0]-1, -1, -1)
+
+                f_mask = f_mask[::-1]
+                lj_sorted[f_mask] = lj_sorted[mask]
+                lj_sorted[~f_mask] = 0.
+
+                return lj_sorted
+
+            else:
+                return lj
+
+        else:
+            lj_temp = []
+            # Actual calculation of the coulomb matrix.
+            for i in mol_nr:
+                x_i = mol_data[i, 0]
+                y_i = mol_data[i, 1]
+                z_i = mol_data[i, 2]
+                Z_i = nc_data[i]
+
+                lj_row = []
+                for j in mol_nr:
+                    x_j = mol_data[j, 0]
+                    y_j = mol_data[j, 1]
+                    z_j = mol_data[j, 2]
+
+                    x = (x_i-x_j)**2
+                    y = (y_i-y_j)**2
+                    z = (z_i-z_j)**2
+
+                    if i == j:
+                        lj_row.append(0.5*Z_i**2.4)
+                    else:
+                        # Calculations are done after i==j is checked
+                        # so no division by zero is done.
+
+                        # A little play with r exponents
+                        # so no square root is calculated.
+                        # Conversion factor is included in r^2.
+
+                        # 1/r^2
+                        r_2 = 1/(conversion_rate**2*(x + y + z))
+
+                        r_6 = math.pow(r_2, 3)
+                        r_12 = math.pow(r_6, 2)
+                        lj_row.append(4*(r_12 - r_6))
+
+                lj_temp.append(np.array(lj_row))
+
+            lj = np.array(lj_temp)
+            # Now the value will be returned.
+            if as_eig:
+                return np.sort(eig(lj)[0])[::-1]
+            else:
+                return lj
diff --git a/main.py b/main.py
new file mode 100644
index 000000000..0a934e814
--- /dev/null
+++ b/main.py
@@ -0,0 +1,135 @@
+import os
+import time
+from colorama import init, Fore, Style
+# import math
+# import random
+import numpy as np
+# from numpy.linalg import cholesky, eig
+# import matplotlib.pyplot as plt
+from read_nc_data import read_nc_data
+from read_db_edata import read_db_edata
+from c_matrix import c_matrix
+from lj_matrix import lj_matrix
+# from frob_norm import frob_norm
+from gauss_kernel import gauss_kernel
+from cholesky_solve import cholesky_solve
+
+# Initialization.
+init_time = time.perf_counter()
+init()
+
+# Move to data folder.
+init_path = os.getcwd()
+os.chdir('../../data')
+data_path = os.getcwd()
+
+print(Fore.CYAN
+      + 'Data reading started.'
+      + Style.RESET_ALL)
+tic = time.perf_counter()
+
+# Read nuclear charge data.
+zi_data = read_nc_data(data_path)
+
+# Read molecule data.
+molecules, nuclear_charge, energy_pbe0, energy_delta = \
+    read_db_edata(zi_data, data_path)
+
+toc = time.perf_counter()
+print(Fore.GREEN
+      + '\tData reading took {:.4f} seconds.'.format(toc-tic)
+      + Style.RESET_ALL)
+
+# Go back to main folder.
+os.chdir(init_path)
+
+print(Fore.CYAN
+      + 'Coulomb Matrices calculation started.'
+      + Style.RESET_ALL)
+tic = time.perf_counter()
+
+cm_data = np.array([c_matrix(mol, nc, as_eig=True)
+                    for mol, nc in zip(molecules, nuclear_charge)])
+
+toc = time.perf_counter()
+print(Fore.GREEN
+      + '\tCoulomb matrices calculation took {:.4f} seconds.'.format(toc-tic)
+      + Style.RESET_ALL)
+
+print(Fore.CYAN
+      + 'L-J Matrices calculation started.'
+      + Style.RESET_ALL)
+tic = time.perf_counter()
+
+ljm_data = np.array([lj_matrix(mol, nc, as_eig=True)
+                     for mol, nc in zip(molecules, nuclear_charge)])
+
+toc = time.perf_counter()
+print(Fore.GREEN
+      + '\tL-J matrices calculation took {:.4f} seconds.'.format(toc-tic)
+      + Style.RESET_ALL)
+
+#
+# Problem solving with Coulomb Matrix.
+#
+print(Fore.CYAN
+      + 'CM ML started.'
+      + Style.RESET_ALL)
+tic = time.perf_counter()
+
+sigma = 1000.0
+
+Xcm_training = cm_data[:6000]
+Ycm_training = energy_pbe0[:6000]
+Kcm_training = gauss_kernel(Xcm_training, Xcm_training, sigma)
+alpha_cm = cholesky_solve(Kcm_training, Ycm_training)
+
+Xcm_test = cm_data[-1000:]
+Ycm_test = energy_pbe0[-1000:]
+Kcm_test = gauss_kernel(Xcm_test, Xcm_training, sigma)
+Ycm_predicted = np.dot(Kcm_test, alpha_cm)
+
+print('\tMean absolute error for CM: {}'.format(np.mean(np.abs(Ycm_predicted
+                                                               - Ycm_test))))
+
+toc = time.perf_counter()
+print(Fore.GREEN
+      + '\tCM ML took {:.4f} seconds.'.format(toc-tic)
+      + Style.RESET_ALL)
+
+
+#
+# Problem solving with L-J Matrix.
+#
+print(Fore.CYAN
+      + 'L-JM ML started.'
+      + Style.RESET_ALL)
+tic = time.perf_counter()
+
+sigma = 1000.0
+
+Xljm_training = ljm_data[:6000]
+Yljm_training = energy_pbe0[:6000]
+Kljm_training = gauss_kernel(Xljm_training, Xljm_training, sigma)
+alpha_ljm = cholesky_solve(Kljm_training, Yljm_training)
+
+Xljm_test = ljm_data[-1000:]
+Yljm_test = energy_pbe0[-1000:]
+Kljm_test = gauss_kernel(Xljm_test, Xljm_training, sigma)
+Yljm_predicted = np.dot(Kljm_test, alpha_ljm)
+
+print('\tMean absolute error for LJM: {}'.format(np.mean(np.abs(Yljm_predicted
+                                                                - Yljm_test))))
+
+toc = time.perf_counter()
+print(Fore.GREEN
+      + '\tL-JM ML took {:.4f} seconds.'.format(toc-tic)
+      + Style.RESET_ALL)
+
+
+# End of program
+end_time = time.perf_counter()
+print(Fore.CYAN
+      + 'The program took {:.4f} seconds of runtime.'.format(end_time
+                                                             - init_time)
+      + Style.RESET_ALL)
diff --git a/read_db_edata.py b/read_db_edata.py
new file mode 100644
index 000000000..bf006ded2
--- /dev/null
+++ b/read_db_edata.py
@@ -0,0 +1,76 @@
+import os
+import numpy as np
+import random
+
+
+# 'hof_qm7.txt.txt' retrieved from
+# https://github.com/qmlcode/tutorial
+def read_db_edata(zi_data,
+                  data_path,
+                  r_seed=111):
+    """
+    Reads molecule database and extracts
+    its contents as usable variables.
+    zi_data: dictionary containing nuclear charge data.
+    data_path: path to the data directory.
+    r_seed: random seed.
+    """
+    os.chdir(data_path)
+
+    fname = 'hof_qm7.txt'
+    with open(fname, 'r') as infile:
+        lines = infile.readlines()
+
+    # Temporary energy dictionary.
+    energy_temp = dict()
+
+    for line in lines:
+        xyz_data = line.split()
+
+        xyz_name = xyz_data[0]
+        hof = float(xyz_data[1])
+        dftb = float(xyz_data[2])
+        # print(xyz_name, hof, dftb)
+
+        energy_temp[xyz_name] = np.array([hof, hof - dftb])
+
+    # Use a random seed.
+    random.seed(r_seed)
+
+    et_keys = list(energy_temp.keys())
+    random.shuffle(et_keys)
+
+    # Temporary energy dictionary, shuffled.
+    energy_temp_shuffled = dict()
+    for key in et_keys:
+        energy_temp_shuffled.update({key: energy_temp[key]})
+
+    mol_data = []
+    mol_nc_data = []
+    # Actual reading of the xyz files.
+    for i, k in enumerate(energy_temp_shuffled.keys()):
+        with open(k, 'r') as xyz_file:
+            lines = xyz_file.readlines()
+
+        len_lines = len(lines)
+        mol_temp_data = []
+        mol_nc_temp_data = np.array(np.zeros(len_lines-2))
+        for j, line in enumerate(lines[2:len_lines]):
+            line_list = line.split()
+
+            mol_nc_temp_data[j] = float(zi_data[line_list[0]])
+            line_data = np.array(np.asarray(line_list[1:4], dtype=float))
+            mol_temp_data.append(line_data)
+
+        mol_data.append(mol_temp_data)
+        mol_nc_data.append(mol_nc_temp_data)
+
+    # Convert everything to a numpy array.
+    molecules = np.array([np.array(mol) for mol in mol_data])
+    nuclear_charge = np.array([nc_d for nc_d in mol_nc_data])
+    energy_pbe0 = np.array([energy_temp_shuffled[k][0]
+                            for k in energy_temp_shuffled.keys()])
+    energy_delta = np.array([energy_temp_shuffled[k][1]
+                             for k in energy_temp_shuffled.keys()])
+
+    return molecules, nuclear_charge, energy_pbe0, energy_delta
diff --git a/read_nc_data.py b/read_nc_data.py
new file mode 100644
index 000000000..b00cd4dba
--- /dev/null
+++ b/read_nc_data.py
@@ -0,0 +1,22 @@
+# 'periodic_table_of_elements.txt' retrieved from
+# https://gist.github.com/GoodmanSciences/c2dd862cd38f21b0ad36b8f96b4bf1ee
+
+
+def read_nc_data(data_path):
+    """
+    Reads nuclear charge data from file and returns a dictionary.
+    data_path: path to the data directory.
+    """
+    fname = 'periodic_table_of_elements.txt'
+    with open(''.join([data_path, '\\', fname]), 'r') as infile:
+        temp_lines = infile.readlines()
+
+    del temp_lines[0]
+
+    lines = []
+    for temp_line in temp_lines:
+        new_line = temp_line.split(sep=',')
+        lines.append(new_line)
+
+    # Dictionary of nuclear charge.
+    return {line[2]: int(line[0]) for line in lines}
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 000000000..fd676ab6a
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+colorama==0.4.1
+numpy==1.17.4
+termcolor==1.1.0
author	David Luevano <55825613+luevano@users.noreply.github.com>	2019-12-08 21:54:19 -0700
committer	David Luevano <55825613+luevano@users.noreply.github.com>	2019-12-08 21:54:19 -0700
commit	7c8b391dddd65ff58417df80dfc79cadddf4f4e6 (patch)
tree	0bad908f5b40848addc3307de4307b1dae4c6e7e