summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.gitignore113
-rw-r--r--c_matrix.py124
-rw-r--r--cholesky_solve.py42
-rw-r--r--desktop.ini5
-rw-r--r--frob_norm.py29
-rw-r--r--gauss_kernel.py27
-rw-r--r--lj_matrix.py142
-rw-r--r--main.py135
-rw-r--r--read_db_edata.py76
-rw-r--r--read_nc_data.py22
-rw-r--r--requirements.txt3
11 files changed, 718 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 000000000..440f0a8ae
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,113 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+#
+#
+# PERSONAL IGNORES.
+#
+#
+
+# vscode
+.vscode/
diff --git a/c_matrix.py b/c_matrix.py
new file mode 100644
index 000000000..b47e15aac
--- /dev/null
+++ b/c_matrix.py
@@ -0,0 +1,124 @@
+import math
+import numpy as np
+from numpy.linalg import eig
+
+
+def c_matrix(mol_data,
+ nc_data,
+ max_len=25,
+ as_eig=False,
+ bohr_radius_units=False):
+ """
+ Creates the coulomb matrix from the molecule data given.
+ mol_data: molecule data, matrix of atom coordinates.
+ nc_data: nuclear charge data, array of atom data.
+ max_len: maximum amount of atoms in molecule.
+ as_eig: if data should be returned as matrix or array of eigenvalues.
+ bohr_radius_units: if units should be in bohr's radius units.
+ """
+ if bohr_radius_units:
+ conversion_rate = 0.52917721067
+ else:
+ conversion_rate = 1
+
+ mol_n = len(mol_data)
+ mol_nr = range(mol_n)
+
+ if not mol_n == len(nc_data):
+ print(''.join(['Error. Molecule matrix dimension is different ',
+ 'than the nuclear charge array dimension.']))
+ else:
+ if max_len < mol_n:
+ print(''.join(['Error. Molecule matrix dimension (mol_n) is ',
+ 'greater than max_len. Using mol_n.']))
+ max_len = None
+
+ if max_len:
+ cm = np.zeros((max_len, max_len))
+ ml_r = range(max_len)
+
+ # Actual calculation of the coulomb matrix.
+ for i in ml_r:
+ if i < mol_n:
+ x_i = mol_data[i, 0]
+ y_i = mol_data[i, 1]
+ z_i = mol_data[i, 2]
+ Z_i = nc_data[i]
+ else:
+ break
+
+ for j in ml_r:
+ if j < mol_n:
+ x_j = mol_data[j, 0]
+ y_j = mol_data[j, 1]
+ z_j = mol_data[j, 2]
+ Z_j = nc_data[j]
+
+ x = (x_i-x_j)**2
+ y = (y_i-y_j)**2
+ z = (z_i-z_j)**2
+
+ if i == j:
+ cm[i, j] = (0.5*Z_i**2.4)
+ else:
+ cm[i, j] = (conversion_rate*Z_i*Z_j/math.sqrt(x
+ + y
+ + z))
+ else:
+ break
+
+ # Now the value will be returned.
+ if as_eig:
+ cm_sorted = np.sort(eig(cm)[0])[::-1]
+ # Thanks to SO for the following lines of code.
+ # https://stackoverflow.com/a/43011036
+
+ # Keep zeros at the end.
+ mask = cm_sorted != 0.
+ f_mask = mask.sum(0, keepdims=1) >\
+ np.arange(cm_sorted.shape[0]-1, -1, -1)
+
+ f_mask = f_mask[::-1]
+ cm_sorted[f_mask] = cm_sorted[mask]
+ cm_sorted[~f_mask] = 0.
+
+ return cm_sorted
+
+ else:
+ return cm
+
+ else:
+ cm_temp = []
+ # Actual calculation of the coulomb matrix.
+ for i in mol_nr:
+ x_i = mol_data[i, 0]
+ y_i = mol_data[i, 1]
+ z_i = mol_data[i, 2]
+ Z_i = nc_data[i]
+
+ cm_row = []
+ for j in mol_nr:
+ x_j = mol_data[j, 0]
+ y_j = mol_data[j, 1]
+ z_j = mol_data[j, 2]
+ Z_j = nc_data[j]
+
+ x = (x_i-x_j)**2
+ y = (y_i-y_j)**2
+ z = (z_i-z_j)**2
+
+ if i == j:
+ cm_row.append(0.5*Z_i**2.4)
+ else:
+ cm_row.append(conversion_rate*Z_i*Z_j/math.sqrt(x
+ + y
+ + z))
+
+ cm_temp.append(np.array(cm_row))
+
+ cm = np.array(cm_temp)
+ # Now the value will be returned.
+ if as_eig:
+ return np.sort(eig(cm)[0])[::-1]
+ else:
+ return cm
diff --git a/cholesky_solve.py b/cholesky_solve.py
new file mode 100644
index 000000000..b2addc01c
--- /dev/null
+++ b/cholesky_solve.py
@@ -0,0 +1,42 @@
+import numpy as np
+from numpy.linalg import cholesky
+
+
+def cholesky_solve(K, y):
+ """
+ Applies Cholesky decomposition to obtain the 'alpha coeficients'.
+ K: kernel.
+ y: known parameters.
+ """
+ # The initial mathematical problem is to solve Ka=y.
+
+ # First, add a small lambda value.
+ K[np.diag_indices_from(K)] += 1e-8
+
+ # Get the Cholesky decomposition of the kernel.
+ L = cholesky(K)
+ size = len(L)
+
+ # Solve Lx=y for x.
+ x = np.zeros(size)
+ x[0] = y[0] / L[0, 0]
+ for i in range(1, size):
+ temp_sum = 0.0
+ for j in range(i):
+ temp_sum += L[i, j] * x[j]
+ x[i] = (y[i] - temp_sum) / L[i, i]
+
+ # Now, solve LTa=x for a.
+ L2 = L.T
+ a = np.zeros(size)
+ a_ms = size - 1
+ a[a_ms] = x[a_ms] / L2[a_ms, a_ms]
+ # Because of the form of L2 (upper triangular matriz), an inversion of
+ # range() needs to be done.
+ for i in range(0, a_ms)[::-1]:
+ temp_sum = 0.0
+ for j in range(i, size)[::-1]:
+ temp_sum += L2[i, j] * a[j]
+ a[i] = (x[i] - temp_sum) / L2[i, i]
+
+ return a
diff --git a/desktop.ini b/desktop.ini
new file mode 100644
index 000000000..dce88a3ef
--- /dev/null
+++ b/desktop.ini
@@ -0,0 +1,5 @@
+[.ShellClassInfo]
+InfoTip=This folder is shared online.
+IconFile=C:\Program Files\Google\Drive\googledrivesync.exe
+IconIndex=16
+ \ No newline at end of file
diff --git a/frob_norm.py b/frob_norm.py
new file mode 100644
index 000000000..8f8dc48bd
--- /dev/null
+++ b/frob_norm.py
@@ -0,0 +1,29 @@
+import math
+
+
+def frob_norm(array):
+ """
+ Calculates the frobenius norm of a given array or matrix.
+ array: array of data.
+ """
+
+ arr_sh_len = len(array.shape)
+ arr_range = range(len(array))
+ fn = 0.0
+
+ # If it is a 'vector'.
+ if arr_sh_len == 1:
+ for i in arr_range:
+ fn += array[i]*array[i]
+
+ return math.sqrt(fn)
+
+ # If it is a matrix.
+ elif arr_sh_len == 2:
+ for i in arr_range:
+ for j in arr_range:
+ fn += array[i, j]*array[i, j]
+
+ return math.sqrt(fn)
+ else:
+ print('Error. Array size greater than 2 ({}).'.format(arr_sh_len))
diff --git a/gauss_kernel.py b/gauss_kernel.py
new file mode 100644
index 000000000..3b0a5a198
--- /dev/null
+++ b/gauss_kernel.py
@@ -0,0 +1,27 @@
+import math
+import numpy as np
+from frob_norm import frob_norm
+
+
+def gauss_kernel(X_1, X_2, sigma):
+ """
+ Calculates the Gaussian Kernel.
+ X_1: first representations.
+ X_2: second representations.
+ sigma: kernel width.
+ """
+ x1_l = len(X_1)
+ x1_range = range(x1_l)
+ x2_l = len(X_2)
+ x2_range = range(x2_l)
+
+ inv_sigma = -0.5 / (sigma*sigma)
+
+ K = np.zeros((x1_l, x2_l))
+ for i in x1_range:
+ for j in x2_range:
+ f_norm = frob_norm(X_1[i] - X_2[j])
+ # print(f_norm)
+ K[i, j] = math.exp(inv_sigma * f_norm)
+
+ return K
diff --git a/lj_matrix.py b/lj_matrix.py
new file mode 100644
index 000000000..bb5506fdb
--- /dev/null
+++ b/lj_matrix.py
@@ -0,0 +1,142 @@
+import math
+import numpy as np
+from numpy.linalg import eig
+
+
+def lj_matrix(mol_data,
+ nc_data,
+ max_len=25,
+ as_eig=False,
+ bohr_radius_units=False):
+ """
+ Creates the coulomb matrix from the molecule data given.
+ mol_data: molecule data, matrix of atom coordinates.
+ nc_data: nuclear charge data, array of atom data.
+ max_len: maximum amount of atoms in molecule.
+ as_eig: if data should be returned as matrix or array of eigenvalues.
+ bohr_radius_units: if units should be in bohr's radius units.
+ """
+ if bohr_radius_units:
+ conversion_rate = 0.52917721067
+ else:
+ conversion_rate = 1
+
+ mol_n = len(mol_data)
+ mol_nr = range(mol_n)
+
+ if not mol_n == len(nc_data):
+ print(''.join(['Error. Molecule matrix dimension is different ',
+ 'than the nuclear charge array dimension.']))
+ else:
+ if max_len < mol_n:
+ print(''.join(['Error. Molecule matrix dimension (mol_n) is ',
+ 'greater than max_len. Using mol_n.']))
+ max_len = None
+
+ if max_len:
+ lj = np.zeros((max_len, max_len))
+ ml_r = range(max_len)
+
+ # Actual calculation of the coulomb matrix.
+ for i in ml_r:
+ if i < mol_n:
+ x_i = mol_data[i, 0]
+ y_i = mol_data[i, 1]
+ z_i = mol_data[i, 2]
+ Z_i = nc_data[i]
+ else:
+ break
+
+ for j in ml_r:
+ if j < mol_n:
+ x_j = mol_data[j, 0]
+ y_j = mol_data[j, 1]
+ z_j = mol_data[j, 2]
+
+ x = (x_i-x_j)**2
+ y = (y_i-y_j)**2
+ z = (z_i-z_j)**2
+
+ if i == j:
+ lj[i, j] = (0.5*Z_i**2.4)
+ else:
+ # Calculations are done after i==j is checked
+ # so no division by zero is done.
+
+ # A little play with r exponents
+ # so no square root is calculated.
+ # Conversion factor is included in r^2.
+
+ # 1/r^2
+ r_2 = 1/(conversion_rate**2*(x + y + z))
+
+ r_6 = math.pow(r_2, 3)
+ r_12 = math.pow(r_6, 2)
+ lj[i, j] = (4*(r_12 - r_6))
+ else:
+ break
+
+ # Now the value will be returned.
+ if as_eig:
+ lj_sorted = np.sort(eig(lj)[0])[::-1]
+ # Thanks to SO for the following lines of code.
+ # https://stackoverflow.com/a/43011036
+
+ # Keep zeros at the end.
+ mask = lj_sorted != 0.
+ f_mask = mask.sum(0, keepdims=1) >\
+ np.arange(lj_sorted.shape[0]-1, -1, -1)
+
+ f_mask = f_mask[::-1]
+ lj_sorted[f_mask] = lj_sorted[mask]
+ lj_sorted[~f_mask] = 0.
+
+ return lj_sorted
+
+ else:
+ return lj
+
+ else:
+ lj_temp = []
+ # Actual calculation of the coulomb matrix.
+ for i in mol_nr:
+ x_i = mol_data[i, 0]
+ y_i = mol_data[i, 1]
+ z_i = mol_data[i, 2]
+ Z_i = nc_data[i]
+
+ lj_row = []
+ for j in mol_nr:
+ x_j = mol_data[j, 0]
+ y_j = mol_data[j, 1]
+ z_j = mol_data[j, 2]
+
+ x = (x_i-x_j)**2
+ y = (y_i-y_j)**2
+ z = (z_i-z_j)**2
+
+ if i == j:
+ lj_row.append(0.5*Z_i**2.4)
+ else:
+ # Calculations are done after i==j is checked
+ # so no division by zero is done.
+
+ # A little play with r exponents
+ # so no square root is calculated.
+ # Conversion factor is included in r^2.
+
+ # 1/r^2
+ r_2 = 1/(conversion_rate**2*(x + y + z))
+
+ r_6 = math.pow(r_2, 3)
+ r_12 = math.pow(r_6, 2)
+ lj_row.append(4*(r_12 - r_6))
+
+ lj_temp.append(np.array(lj_row))
+
+ lj = np.array(lj_temp)
+ # Now the value will be returned.
+ if as_eig:
+ return np.sort(eig(lj)[0])[::-1]
+ else:
+ return lj
diff --git a/main.py b/main.py
new file mode 100644
index 000000000..0a934e814
--- /dev/null
+++ b/main.py
@@ -0,0 +1,135 @@
+import os
+import time
+from colorama import init, Fore, Style
+# import math
+# import random
+import numpy as np
+# from numpy.linalg import cholesky, eig
+# import matplotlib.pyplot as plt
+from read_nc_data import read_nc_data
+from read_db_edata import read_db_edata
+from c_matrix import c_matrix
+from lj_matrix import lj_matrix
+# from frob_norm import frob_norm
+from gauss_kernel import gauss_kernel
+from cholesky_solve import cholesky_solve
+
+# Initialization.
+init_time = time.perf_counter()
+init()
+
+# Move to data folder.
+init_path = os.getcwd()
+os.chdir('../../data')
+data_path = os.getcwd()
+
+print(Fore.CYAN
+ + 'Data reading started.'
+ + Style.RESET_ALL)
+tic = time.perf_counter()
+
+# Read nuclear charge data.
+zi_data = read_nc_data(data_path)
+
+# Read molecule data.
+molecules, nuclear_charge, energy_pbe0, energy_delta = \
+ read_db_edata(zi_data, data_path)
+
+toc = time.perf_counter()
+print(Fore.GREEN
+ + '\tData reading took {:.4f} seconds.'.format(toc-tic)
+ + Style.RESET_ALL)
+
+# Go back to main folder.
+os.chdir(init_path)
+
+print(Fore.CYAN
+ + 'Coulomb Matrices calculation started.'
+ + Style.RESET_ALL)
+tic = time.perf_counter()
+
+cm_data = np.array([c_matrix(mol, nc, as_eig=True)
+ for mol, nc in zip(molecules, nuclear_charge)])
+
+toc = time.perf_counter()
+print(Fore.GREEN
+ + '\tCoulomb matrices calculation took {:.4f} seconds.'.format(toc-tic)
+ + Style.RESET_ALL)
+
+print(Fore.CYAN
+ + 'L-J Matrices calculation started.'
+ + Style.RESET_ALL)
+tic = time.perf_counter()
+
+ljm_data = np.array([lj_matrix(mol, nc, as_eig=True)
+ for mol, nc in zip(molecules, nuclear_charge)])
+
+toc = time.perf_counter()
+print(Fore.GREEN
+ + '\tL-J matrices calculation took {:.4f} seconds.'.format(toc-tic)
+ + Style.RESET_ALL)
+
+#
+# Problem solving with Coulomb Matrix.
+#
+print(Fore.CYAN
+ + 'CM ML started.'
+ + Style.RESET_ALL)
+tic = time.perf_counter()
+
+sigma = 1000.0
+
+Xcm_training = cm_data[:6000]
+Ycm_training = energy_pbe0[:6000]
+Kcm_training = gauss_kernel(Xcm_training, Xcm_training, sigma)
+alpha_cm = cholesky_solve(Kcm_training, Ycm_training)
+
+Xcm_test = cm_data[-1000:]
+Ycm_test = energy_pbe0[-1000:]
+Kcm_test = gauss_kernel(Xcm_test, Xcm_training, sigma)
+Ycm_predicted = np.dot(Kcm_test, alpha_cm)
+
+print('\tMean absolute error for CM: {}'.format(np.mean(np.abs(Ycm_predicted
+ - Ycm_test))))
+
+toc = time.perf_counter()
+print(Fore.GREEN
+ + '\tCM ML took {:.4f} seconds.'.format(toc-tic)
+ + Style.RESET_ALL)
+
+
+#
+# Problem solving with L-J Matrix.
+#
+print(Fore.CYAN
+ + 'L-JM ML started.'
+ + Style.RESET_ALL)
+tic = time.perf_counter()
+
+sigma = 1000.0
+
+Xljm_training = ljm_data[:6000]
+Yljm_training = energy_pbe0[:6000]
+Kljm_training = gauss_kernel(Xljm_training, Xljm_training, sigma)
+alpha_ljm = cholesky_solve(Kljm_training, Yljm_training)
+
+Xljm_test = ljm_data[-1000:]
+Yljm_test = energy_pbe0[-1000:]
+Kljm_test = gauss_kernel(Xljm_test, Xljm_training, sigma)
+Yljm_predicted = np.dot(Kljm_test, alpha_ljm)
+
+print('\tMean absolute error for LJM: {}'.format(np.mean(np.abs(Yljm_predicted
+ - Yljm_test))))
+
+toc = time.perf_counter()
+print(Fore.GREEN
+ + '\tL-JM ML took {:.4f} seconds.'.format(toc-tic)
+ + Style.RESET_ALL)
+
+
+# End of program
+end_time = time.perf_counter()
+print(Fore.CYAN
+ + 'The program took {:.4f} seconds of runtime.'.format(end_time
+ - init_time)
+ + Style.RESET_ALL)
diff --git a/read_db_edata.py b/read_db_edata.py
new file mode 100644
index 000000000..bf006ded2
--- /dev/null
+++ b/read_db_edata.py
@@ -0,0 +1,76 @@
+import os
+import numpy as np
+import random
+
+
+# 'hof_qm7.txt.txt' retrieved from
+# https://github.com/qmlcode/tutorial
+def read_db_edata(zi_data,
+ data_path,
+ r_seed=111):
+ """
+ Reads molecule database and extracts
+ its contents as usable variables.
+ zi_data: dictionary containing nuclear charge data.
+ data_path: path to the data directory.
+ r_seed: random seed.
+ """
+ os.chdir(data_path)
+
+ fname = 'hof_qm7.txt'
+ with open(fname, 'r') as infile:
+ lines = infile.readlines()
+
+ # Temporary energy dictionary.
+ energy_temp = dict()
+
+ for line in lines:
+ xyz_data = line.split()
+
+ xyz_name = xyz_data[0]
+ hof = float(xyz_data[1])
+ dftb = float(xyz_data[2])
+ # print(xyz_name, hof, dftb)
+
+ energy_temp[xyz_name] = np.array([hof, hof - dftb])
+
+ # Use a random seed.
+ random.seed(r_seed)
+
+ et_keys = list(energy_temp.keys())
+ random.shuffle(et_keys)
+
+ # Temporary energy dictionary, shuffled.
+ energy_temp_shuffled = dict()
+ for key in et_keys:
+ energy_temp_shuffled.update({key: energy_temp[key]})
+
+ mol_data = []
+ mol_nc_data = []
+ # Actual reading of the xyz files.
+ for i, k in enumerate(energy_temp_shuffled.keys()):
+ with open(k, 'r') as xyz_file:
+ lines = xyz_file.readlines()
+
+ len_lines = len(lines)
+ mol_temp_data = []
+ mol_nc_temp_data = np.array(np.zeros(len_lines-2))
+ for j, line in enumerate(lines[2:len_lines]):
+ line_list = line.split()
+
+ mol_nc_temp_data[j] = float(zi_data[line_list[0]])
+ line_data = np.array(np.asarray(line_list[1:4], dtype=float))
+ mol_temp_data.append(line_data)
+
+ mol_data.append(mol_temp_data)
+ mol_nc_data.append(mol_nc_temp_data)
+
+ # Convert everything to a numpy array.
+ molecules = np.array([np.array(mol) for mol in mol_data])
+ nuclear_charge = np.array([nc_d for nc_d in mol_nc_data])
+ energy_pbe0 = np.array([energy_temp_shuffled[k][0]
+ for k in energy_temp_shuffled.keys()])
+ energy_delta = np.array([energy_temp_shuffled[k][1]
+ for k in energy_temp_shuffled.keys()])
+
+ return molecules, nuclear_charge, energy_pbe0, energy_delta
diff --git a/read_nc_data.py b/read_nc_data.py
new file mode 100644
index 000000000..b00cd4dba
--- /dev/null
+++ b/read_nc_data.py
@@ -0,0 +1,22 @@
+# 'periodic_table_of_elements.txt' retrieved from
+# https://gist.github.com/GoodmanSciences/c2dd862cd38f21b0ad36b8f96b4bf1ee
+
+
+def read_nc_data(data_path):
+ """
+ Reads nuclear charge data from file and returns a dictionary.
+ data_path: path to the data directory.
+ """
+ fname = 'periodic_table_of_elements.txt'
+ with open(''.join([data_path, '\\', fname]), 'r') as infile:
+ temp_lines = infile.readlines()
+
+ del temp_lines[0]
+
+ lines = []
+ for temp_line in temp_lines:
+ new_line = temp_line.split(sep=',')
+ lines.append(new_line)
+
+ # Dictionary of nuclear charge.
+ return {line[2]: int(line[0]) for line in lines}
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 000000000..fd676ab6a
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,3 @@
+colorama==0.4.1
+numpy==1.17.4
+termcolor==1.1.0