From 7c8b391dddd65ff58417df80dfc79cadddf4f4e6 Mon Sep 17 00:00:00 2001 From: David Luevano <55825613+luevano@users.noreply.github.com> Date: Sun, 8 Dec 2019 21:54:19 -0700 Subject: First commit --- .gitignore | 113 +++++++++++++++++++++++++++++++++++++++++++ c_matrix.py | 124 +++++++++++++++++++++++++++++++++++++++++++++++ cholesky_solve.py | 42 ++++++++++++++++ desktop.ini | 5 ++ frob_norm.py | 29 +++++++++++ gauss_kernel.py | 27 +++++++++++ lj_matrix.py | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ main.py | 135 +++++++++++++++++++++++++++++++++++++++++++++++++++ read_db_edata.py | 76 +++++++++++++++++++++++++++++ read_nc_data.py | 22 +++++++++ requirements.txt | 3 ++ 11 files changed, 718 insertions(+) create mode 100644 .gitignore create mode 100644 c_matrix.py create mode 100644 cholesky_solve.py create mode 100644 desktop.ini create mode 100644 frob_norm.py create mode 100644 gauss_kernel.py create mode 100644 lj_matrix.py create mode 100644 main.py create mode 100644 read_db_edata.py create mode 100644 read_nc_data.py create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..440f0a8ae --- /dev/null +++ b/.gitignore @@ -0,0 +1,113 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +# +# +# PERSONAL IGNORES. +# +# + +# vscode +.vscode/ diff --git a/c_matrix.py b/c_matrix.py new file mode 100644 index 000000000..b47e15aac --- /dev/null +++ b/c_matrix.py @@ -0,0 +1,124 @@ +import math +import numpy as np +from numpy.linalg import eig + + +def c_matrix(mol_data, + nc_data, + max_len=25, + as_eig=False, + bohr_radius_units=False): + """ + Creates the coulomb matrix from the molecule data given. + mol_data: molecule data, matrix of atom coordinates. + nc_data: nuclear charge data, array of atom data. + max_len: maximum amount of atoms in molecule. + as_eig: if data should be returned as matrix or array of eigenvalues. + bohr_radius_units: if units should be in bohr's radius units. + """ + if bohr_radius_units: + conversion_rate = 0.52917721067 + else: + conversion_rate = 1 + + mol_n = len(mol_data) + mol_nr = range(mol_n) + + if not mol_n == len(nc_data): + print(''.join(['Error. Molecule matrix dimension is different ', + 'than the nuclear charge array dimension.'])) + else: + if max_len < mol_n: + print(''.join(['Error. Molecule matrix dimension (mol_n) is ', + 'greater than max_len. Using mol_n.'])) + max_len = None + + if max_len: + cm = np.zeros((max_len, max_len)) + ml_r = range(max_len) + + # Actual calculation of the coulomb matrix. + for i in ml_r: + if i < mol_n: + x_i = mol_data[i, 0] + y_i = mol_data[i, 1] + z_i = mol_data[i, 2] + Z_i = nc_data[i] + else: + break + + for j in ml_r: + if j < mol_n: + x_j = mol_data[j, 0] + y_j = mol_data[j, 1] + z_j = mol_data[j, 2] + Z_j = nc_data[j] + + x = (x_i-x_j)**2 + y = (y_i-y_j)**2 + z = (z_i-z_j)**2 + + if i == j: + cm[i, j] = (0.5*Z_i**2.4) + else: + cm[i, j] = (conversion_rate*Z_i*Z_j/math.sqrt(x + + y + + z)) + else: + break + + # Now the value will be returned. + if as_eig: + cm_sorted = np.sort(eig(cm)[0])[::-1] + # Thanks to SO for the following lines of code. + # https://stackoverflow.com/a/43011036 + + # Keep zeros at the end. + mask = cm_sorted != 0. + f_mask = mask.sum(0, keepdims=1) >\ + np.arange(cm_sorted.shape[0]-1, -1, -1) + + f_mask = f_mask[::-1] + cm_sorted[f_mask] = cm_sorted[mask] + cm_sorted[~f_mask] = 0. + + return cm_sorted + + else: + return cm + + else: + cm_temp = [] + # Actual calculation of the coulomb matrix. + for i in mol_nr: + x_i = mol_data[i, 0] + y_i = mol_data[i, 1] + z_i = mol_data[i, 2] + Z_i = nc_data[i] + + cm_row = [] + for j in mol_nr: + x_j = mol_data[j, 0] + y_j = mol_data[j, 1] + z_j = mol_data[j, 2] + Z_j = nc_data[j] + + x = (x_i-x_j)**2 + y = (y_i-y_j)**2 + z = (z_i-z_j)**2 + + if i == j: + cm_row.append(0.5*Z_i**2.4) + else: + cm_row.append(conversion_rate*Z_i*Z_j/math.sqrt(x + + y + + z)) + + cm_temp.append(np.array(cm_row)) + + cm = np.array(cm_temp) + # Now the value will be returned. + if as_eig: + return np.sort(eig(cm)[0])[::-1] + else: + return cm diff --git a/cholesky_solve.py b/cholesky_solve.py new file mode 100644 index 000000000..b2addc01c --- /dev/null +++ b/cholesky_solve.py @@ -0,0 +1,42 @@ +import numpy as np +from numpy.linalg import cholesky + + +def cholesky_solve(K, y): + """ + Applies Cholesky decomposition to obtain the 'alpha coeficients'. + K: kernel. + y: known parameters. + """ + # The initial mathematical problem is to solve Ka=y. + + # First, add a small lambda value. + K[np.diag_indices_from(K)] += 1e-8 + + # Get the Cholesky decomposition of the kernel. + L = cholesky(K) + size = len(L) + + # Solve Lx=y for x. + x = np.zeros(size) + x[0] = y[0] / L[0, 0] + for i in range(1, size): + temp_sum = 0.0 + for j in range(i): + temp_sum += L[i, j] * x[j] + x[i] = (y[i] - temp_sum) / L[i, i] + + # Now, solve LTa=x for a. + L2 = L.T + a = np.zeros(size) + a_ms = size - 1 + a[a_ms] = x[a_ms] / L2[a_ms, a_ms] + # Because of the form of L2 (upper triangular matriz), an inversion of + # range() needs to be done. + for i in range(0, a_ms)[::-1]: + temp_sum = 0.0 + for j in range(i, size)[::-1]: + temp_sum += L2[i, j] * a[j] + a[i] = (x[i] - temp_sum) / L2[i, i] + + return a diff --git a/desktop.ini b/desktop.ini new file mode 100644 index 000000000..dce88a3ef --- /dev/null +++ b/desktop.ini @@ -0,0 +1,5 @@ +[.ShellClassInfo] +InfoTip=This folder is shared online. +IconFile=C:\Program Files\Google\Drive\googledrivesync.exe +IconIndex=16 + \ No newline at end of file diff --git a/frob_norm.py b/frob_norm.py new file mode 100644 index 000000000..8f8dc48bd --- /dev/null +++ b/frob_norm.py @@ -0,0 +1,29 @@ +import math + + +def frob_norm(array): + """ + Calculates the frobenius norm of a given array or matrix. + array: array of data. + """ + + arr_sh_len = len(array.shape) + arr_range = range(len(array)) + fn = 0.0 + + # If it is a 'vector'. + if arr_sh_len == 1: + for i in arr_range: + fn += array[i]*array[i] + + return math.sqrt(fn) + + # If it is a matrix. + elif arr_sh_len == 2: + for i in arr_range: + for j in arr_range: + fn += array[i, j]*array[i, j] + + return math.sqrt(fn) + else: + print('Error. Array size greater than 2 ({}).'.format(arr_sh_len)) diff --git a/gauss_kernel.py b/gauss_kernel.py new file mode 100644 index 000000000..3b0a5a198 --- /dev/null +++ b/gauss_kernel.py @@ -0,0 +1,27 @@ +import math +import numpy as np +from frob_norm import frob_norm + + +def gauss_kernel(X_1, X_2, sigma): + """ + Calculates the Gaussian Kernel. + X_1: first representations. + X_2: second representations. + sigma: kernel width. + """ + x1_l = len(X_1) + x1_range = range(x1_l) + x2_l = len(X_2) + x2_range = range(x2_l) + + inv_sigma = -0.5 / (sigma*sigma) + + K = np.zeros((x1_l, x2_l)) + for i in x1_range: + for j in x2_range: + f_norm = frob_norm(X_1[i] - X_2[j]) + # print(f_norm) + K[i, j] = math.exp(inv_sigma * f_norm) + + return K diff --git a/lj_matrix.py b/lj_matrix.py new file mode 100644 index 000000000..bb5506fdb --- /dev/null +++ b/lj_matrix.py @@ -0,0 +1,142 @@ +import math +import numpy as np +from numpy.linalg import eig + + +def lj_matrix(mol_data, + nc_data, + max_len=25, + as_eig=False, + bohr_radius_units=False): + """ + Creates the coulomb matrix from the molecule data given. + mol_data: molecule data, matrix of atom coordinates. + nc_data: nuclear charge data, array of atom data. + max_len: maximum amount of atoms in molecule. + as_eig: if data should be returned as matrix or array of eigenvalues. + bohr_radius_units: if units should be in bohr's radius units. + """ + if bohr_radius_units: + conversion_rate = 0.52917721067 + else: + conversion_rate = 1 + + mol_n = len(mol_data) + mol_nr = range(mol_n) + + if not mol_n == len(nc_data): + print(''.join(['Error. Molecule matrix dimension is different ', + 'than the nuclear charge array dimension.'])) + else: + if max_len < mol_n: + print(''.join(['Error. Molecule matrix dimension (mol_n) is ', + 'greater than max_len. Using mol_n.'])) + max_len = None + + if max_len: + lj = np.zeros((max_len, max_len)) + ml_r = range(max_len) + + # Actual calculation of the coulomb matrix. + for i in ml_r: + if i < mol_n: + x_i = mol_data[i, 0] + y_i = mol_data[i, 1] + z_i = mol_data[i, 2] + Z_i = nc_data[i] + else: + break + + for j in ml_r: + if j < mol_n: + x_j = mol_data[j, 0] + y_j = mol_data[j, 1] + z_j = mol_data[j, 2] + + x = (x_i-x_j)**2 + y = (y_i-y_j)**2 + z = (z_i-z_j)**2 + + if i == j: + lj[i, j] = (0.5*Z_i**2.4) + else: + # Calculations are done after i==j is checked + # so no division by zero is done. + + # A little play with r exponents + # so no square root is calculated. + # Conversion factor is included in r^2. + + # 1/r^2 + r_2 = 1/(conversion_rate**2*(x + y + z)) + + r_6 = math.pow(r_2, 3) + r_12 = math.pow(r_6, 2) + lj[i, j] = (4*(r_12 - r_6)) + else: + break + + # Now the value will be returned. + if as_eig: + lj_sorted = np.sort(eig(lj)[0])[::-1] + # Thanks to SO for the following lines of code. + # https://stackoverflow.com/a/43011036 + + # Keep zeros at the end. + mask = lj_sorted != 0. + f_mask = mask.sum(0, keepdims=1) >\ + np.arange(lj_sorted.shape[0]-1, -1, -1) + + f_mask = f_mask[::-1] + lj_sorted[f_mask] = lj_sorted[mask] + lj_sorted[~f_mask] = 0. + + return lj_sorted + + else: + return lj + + else: + lj_temp = [] + # Actual calculation of the coulomb matrix. + for i in mol_nr: + x_i = mol_data[i, 0] + y_i = mol_data[i, 1] + z_i = mol_data[i, 2] + Z_i = nc_data[i] + + lj_row = [] + for j in mol_nr: + x_j = mol_data[j, 0] + y_j = mol_data[j, 1] + z_j = mol_data[j, 2] + + x = (x_i-x_j)**2 + y = (y_i-y_j)**2 + z = (z_i-z_j)**2 + + if i == j: + lj_row.append(0.5*Z_i**2.4) + else: + # Calculations are done after i==j is checked + # so no division by zero is done. + + # A little play with r exponents + # so no square root is calculated. + # Conversion factor is included in r^2. + + # 1/r^2 + r_2 = 1/(conversion_rate**2*(x + y + z)) + + r_6 = math.pow(r_2, 3) + r_12 = math.pow(r_6, 2) + lj_row.append(4*(r_12 - r_6)) + + lj_temp.append(np.array(lj_row)) + + lj = np.array(lj_temp) + # Now the value will be returned. + if as_eig: + return np.sort(eig(lj)[0])[::-1] + else: + return lj diff --git a/main.py b/main.py new file mode 100644 index 000000000..0a934e814 --- /dev/null +++ b/main.py @@ -0,0 +1,135 @@ +import os +import time +from colorama import init, Fore, Style +# import math +# import random +import numpy as np +# from numpy.linalg import cholesky, eig +# import matplotlib.pyplot as plt +from read_nc_data import read_nc_data +from read_db_edata import read_db_edata +from c_matrix import c_matrix +from lj_matrix import lj_matrix +# from frob_norm import frob_norm +from gauss_kernel import gauss_kernel +from cholesky_solve import cholesky_solve + +# Initialization. +init_time = time.perf_counter() +init() + +# Move to data folder. +init_path = os.getcwd() +os.chdir('../../data') +data_path = os.getcwd() + +print(Fore.CYAN + + 'Data reading started.' + + Style.RESET_ALL) +tic = time.perf_counter() + +# Read nuclear charge data. +zi_data = read_nc_data(data_path) + +# Read molecule data. +molecules, nuclear_charge, energy_pbe0, energy_delta = \ + read_db_edata(zi_data, data_path) + +toc = time.perf_counter() +print(Fore.GREEN + + '\tData reading took {:.4f} seconds.'.format(toc-tic) + + Style.RESET_ALL) + +# Go back to main folder. +os.chdir(init_path) + +print(Fore.CYAN + + 'Coulomb Matrices calculation started.' + + Style.RESET_ALL) +tic = time.perf_counter() + +cm_data = np.array([c_matrix(mol, nc, as_eig=True) + for mol, nc in zip(molecules, nuclear_charge)]) + +toc = time.perf_counter() +print(Fore.GREEN + + '\tCoulomb matrices calculation took {:.4f} seconds.'.format(toc-tic) + + Style.RESET_ALL) + +print(Fore.CYAN + + 'L-J Matrices calculation started.' + + Style.RESET_ALL) +tic = time.perf_counter() + +ljm_data = np.array([lj_matrix(mol, nc, as_eig=True) + for mol, nc in zip(molecules, nuclear_charge)]) + +toc = time.perf_counter() +print(Fore.GREEN + + '\tL-J matrices calculation took {:.4f} seconds.'.format(toc-tic) + + Style.RESET_ALL) + +# +# Problem solving with Coulomb Matrix. +# +print(Fore.CYAN + + 'CM ML started.' + + Style.RESET_ALL) +tic = time.perf_counter() + +sigma = 1000.0 + +Xcm_training = cm_data[:6000] +Ycm_training = energy_pbe0[:6000] +Kcm_training = gauss_kernel(Xcm_training, Xcm_training, sigma) +alpha_cm = cholesky_solve(Kcm_training, Ycm_training) + +Xcm_test = cm_data[-1000:] +Ycm_test = energy_pbe0[-1000:] +Kcm_test = gauss_kernel(Xcm_test, Xcm_training, sigma) +Ycm_predicted = np.dot(Kcm_test, alpha_cm) + +print('\tMean absolute error for CM: {}'.format(np.mean(np.abs(Ycm_predicted + - Ycm_test)))) + +toc = time.perf_counter() +print(Fore.GREEN + + '\tCM ML took {:.4f} seconds.'.format(toc-tic) + + Style.RESET_ALL) + + +# +# Problem solving with L-J Matrix. +# +print(Fore.CYAN + + 'L-JM ML started.' + + Style.RESET_ALL) +tic = time.perf_counter() + +sigma = 1000.0 + +Xljm_training = ljm_data[:6000] +Yljm_training = energy_pbe0[:6000] +Kljm_training = gauss_kernel(Xljm_training, Xljm_training, sigma) +alpha_ljm = cholesky_solve(Kljm_training, Yljm_training) + +Xljm_test = ljm_data[-1000:] +Yljm_test = energy_pbe0[-1000:] +Kljm_test = gauss_kernel(Xljm_test, Xljm_training, sigma) +Yljm_predicted = np.dot(Kljm_test, alpha_ljm) + +print('\tMean absolute error for LJM: {}'.format(np.mean(np.abs(Yljm_predicted + - Yljm_test)))) + +toc = time.perf_counter() +print(Fore.GREEN + + '\tL-JM ML took {:.4f} seconds.'.format(toc-tic) + + Style.RESET_ALL) + + +# End of program +end_time = time.perf_counter() +print(Fore.CYAN + + 'The program took {:.4f} seconds of runtime.'.format(end_time + - init_time) + + Style.RESET_ALL) diff --git a/read_db_edata.py b/read_db_edata.py new file mode 100644 index 000000000..bf006ded2 --- /dev/null +++ b/read_db_edata.py @@ -0,0 +1,76 @@ +import os +import numpy as np +import random + + +# 'hof_qm7.txt.txt' retrieved from +# https://github.com/qmlcode/tutorial +def read_db_edata(zi_data, + data_path, + r_seed=111): + """ + Reads molecule database and extracts + its contents as usable variables. + zi_data: dictionary containing nuclear charge data. + data_path: path to the data directory. + r_seed: random seed. + """ + os.chdir(data_path) + + fname = 'hof_qm7.txt' + with open(fname, 'r') as infile: + lines = infile.readlines() + + # Temporary energy dictionary. + energy_temp = dict() + + for line in lines: + xyz_data = line.split() + + xyz_name = xyz_data[0] + hof = float(xyz_data[1]) + dftb = float(xyz_data[2]) + # print(xyz_name, hof, dftb) + + energy_temp[xyz_name] = np.array([hof, hof - dftb]) + + # Use a random seed. + random.seed(r_seed) + + et_keys = list(energy_temp.keys()) + random.shuffle(et_keys) + + # Temporary energy dictionary, shuffled. + energy_temp_shuffled = dict() + for key in et_keys: + energy_temp_shuffled.update({key: energy_temp[key]}) + + mol_data = [] + mol_nc_data = [] + # Actual reading of the xyz files. + for i, k in enumerate(energy_temp_shuffled.keys()): + with open(k, 'r') as xyz_file: + lines = xyz_file.readlines() + + len_lines = len(lines) + mol_temp_data = [] + mol_nc_temp_data = np.array(np.zeros(len_lines-2)) + for j, line in enumerate(lines[2:len_lines]): + line_list = line.split() + + mol_nc_temp_data[j] = float(zi_data[line_list[0]]) + line_data = np.array(np.asarray(line_list[1:4], dtype=float)) + mol_temp_data.append(line_data) + + mol_data.append(mol_temp_data) + mol_nc_data.append(mol_nc_temp_data) + + # Convert everything to a numpy array. + molecules = np.array([np.array(mol) for mol in mol_data]) + nuclear_charge = np.array([nc_d for nc_d in mol_nc_data]) + energy_pbe0 = np.array([energy_temp_shuffled[k][0] + for k in energy_temp_shuffled.keys()]) + energy_delta = np.array([energy_temp_shuffled[k][1] + for k in energy_temp_shuffled.keys()]) + + return molecules, nuclear_charge, energy_pbe0, energy_delta diff --git a/read_nc_data.py b/read_nc_data.py new file mode 100644 index 000000000..b00cd4dba --- /dev/null +++ b/read_nc_data.py @@ -0,0 +1,22 @@ +# 'periodic_table_of_elements.txt' retrieved from +# https://gist.github.com/GoodmanSciences/c2dd862cd38f21b0ad36b8f96b4bf1ee + + +def read_nc_data(data_path): + """ + Reads nuclear charge data from file and returns a dictionary. + data_path: path to the data directory. + """ + fname = 'periodic_table_of_elements.txt' + with open(''.join([data_path, '\\', fname]), 'r') as infile: + temp_lines = infile.readlines() + + del temp_lines[0] + + lines = [] + for temp_line in temp_lines: + new_line = temp_line.split(sep=',') + lines.append(new_line) + + # Dictionary of nuclear charge. + return {line[2]: int(line[0]) for line in lines} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 000000000..fd676ab6a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +colorama==0.4.1 +numpy==1.17.4 +termcolor==1.1.0 -- cgit v1.2.3-54-g00ecf