From c2b15a51e80a051fe9dbc2e558937f54bf4a459c Mon Sep 17 00:00:00 2001
From: David Luevano <55825613+luevano@users.noreply.github.com>
Date: Thu, 12 Dec 2019 19:38:23 -0700
Subject: Reformat data reading

---
 main.py          |  20 ++------
 read_db_edata.py |  98 -------------------------------------
 read_nc_data.py  |  44 -----------------
 read_qm7_data.py | 144 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 147 insertions(+), 159 deletions(-)
 delete mode 100644 read_db_edata.py
 delete mode 100644 read_nc_data.py
 create mode 100644 read_qm7_data.py

diff --git a/main.py b/main.py
index 88734d57f..f37054b3b 100644
--- a/main.py
+++ b/main.py
@@ -20,34 +20,20 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
 """
-import os
 import time
 from misc import printc
 # import matplotlib.pyplot as plt
-from read_nc_data import read_nc_data
-from read_db_edata import read_db_edata
 from c_matrix import c_matrix_multiple
 from lj_matrix import lj_matrix_multiple
 from do_ml import do_ml
+from read_qm7_data import read_qm7_data
 
 # Initialization time.
 init_time = time.perf_counter()
 
 # Data reading.
-tic = time.perf_counter()
-printc('Data reading started.', 'CYAN')
-
-init_path = os.getcwd()
-os.chdir('data')
-data_path = os.getcwd()
-
-zi_data = read_nc_data(data_path)
-molecules, nuclear_charge, energy_pbe0, energy_delta = \
-    read_db_edata(zi_data, data_path)
-
-os.chdir(init_path)
-toc = time.perf_counter()
-printc('\tData reading took {:.4f} seconds.'.format(toc-tic), 'GREEN')
+zi_data, molecules, nuclear_charge, energy_pbe0, energy_delta =\
+    read_qm7_data()
 
 # Matrices calculation.
 cm_data = c_matrix_multiple(molecules, nuclear_charge, as_eig=True)
diff --git a/read_db_edata.py b/read_db_edata.py
deleted file mode 100644
index 893edf26e..000000000
--- a/read_db_edata.py
+++ /dev/null
@@ -1,98 +0,0 @@
-"""MIT License
-
-Copyright (c) 2019 David Luevano Alvarado
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-"""
-import os
-import numpy as np
-import random
-
-
-# 'hof_qm7.txt.txt' retrieved from
-# https://github.com/qmlcode/tutorial
-def read_db_edata(zi_data,
-                  data_path,
-                  r_seed=111):
-    """
-    Reads molecule database and extracts
-    its contents as usable variables.
-    zi_data: dictionary containing nuclear charge data.
-    data_path: path to the data directory.
-    r_seed: random seed.
-    """
-    os.chdir(data_path)
-
-    fname = 'hof_qm7.txt'
-    with open(fname, 'r') as infile:
-        lines = infile.readlines()
-
-    # Temporary energy dictionary.
-    energy_temp = dict()
-
-    for line in lines:
-        xyz_data = line.split()
-
-        xyz_name = xyz_data[0]
-        hof = float(xyz_data[1])
-        dftb = float(xyz_data[2])
-        # print(xyz_name, hof, dftb)
-
-        energy_temp[xyz_name] = np.array([hof, hof - dftb])
-
-    # Use a random seed.
-    random.seed(r_seed)
-
-    et_keys = list(energy_temp.keys())
-    random.shuffle(et_keys)
-
-    # Temporary energy dictionary, shuffled.
-    energy_temp_shuffled = dict()
-    for key in et_keys:
-        energy_temp_shuffled.update({key: energy_temp[key]})
-
-    mol_data = []
-    mol_nc_data = []
-    # Actual reading of the xyz files.
-    for i, k in enumerate(energy_temp_shuffled.keys()):
-        with open(k, 'r') as xyz_file:
-            lines = xyz_file.readlines()
-
-        len_lines = len(lines)
-        mol_temp_data = []
-        mol_nc_temp_data = np.array(np.zeros(len_lines-2))
-        for j, line in enumerate(lines[2:len_lines]):
-            line_list = line.split()
-
-            mol_nc_temp_data[j] = float(zi_data[line_list[0]])
-            line_data = np.array(np.asarray(line_list[1:4], dtype=float))
-            mol_temp_data.append(line_data)
-
-        mol_data.append(mol_temp_data)
-        mol_nc_data.append(mol_nc_temp_data)
-
-    # Convert everything to a numpy array.
-    molecules = np.array([np.array(mol) for mol in mol_data])
-    nuclear_charge = np.array([nc_d for nc_d in mol_nc_data])
-    energy_pbe0 = np.array([energy_temp_shuffled[k][0]
-                            for k in energy_temp_shuffled.keys()])
-    energy_delta = np.array([energy_temp_shuffled[k][1]
-                             for k in energy_temp_shuffled.keys()])
-
-    return molecules, nuclear_charge, energy_pbe0, energy_delta
diff --git a/read_nc_data.py b/read_nc_data.py
deleted file mode 100644
index d7891f8f6..000000000
--- a/read_nc_data.py
+++ /dev/null
@@ -1,44 +0,0 @@
-"""MIT License
-
-Copyright (c) 2019 David Luevano Alvarado
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-"""
-# 'periodic_table_of_elements.txt' retrieved from
-# https://gist.github.com/GoodmanSciences/c2dd862cd38f21b0ad36b8f96b4bf1ee
-
-
-def read_nc_data(data_path):
-    """
-    Reads nuclear charge data from file and returns a dictionary.
-    data_path: path to the data directory.
-    """
-    fname = 'periodic_table_of_elements.txt'
-    with open(''.join([data_path, '\\', fname]), 'r') as infile:
-        temp_lines = infile.readlines()
-
-    del temp_lines[0]
-
-    lines = []
-    for temp_line in temp_lines:
-        new_line = temp_line.split(sep=',')
-        lines.append(new_line)
-
-    # Dictionary of nuclear charge.
-    return {line[2]: int(line[0]) for line in lines}
diff --git a/read_qm7_data.py b/read_qm7_data.py
new file mode 100644
index 000000000..0c0cc88aa
--- /dev/null
+++ b/read_qm7_data.py
@@ -0,0 +1,144 @@
+"""MIT License
+
+Copyright (c) 2019 David Luevano Alvarado
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+"""
+import os
+import time
+import numpy as np
+import random
+from misc import printc
+# 'periodic_table_of_elements.txt' retrieved from
+# https://gist.github.com/GoodmanSciences/c2dd862cd38f21b0ad36b8f96b4bf1ee
+
+
+def read_nc_data(data_path):
+    """
+    Reads nuclear charge data from file and returns a dictionary that
+    maps column 2 of every comma separated row to int(column 0).
+    data_path: path to the data directory.
+    """
+    fname = 'periodic_table_of_elements.txt'
+    # os.path.join uses the platform separator; the previously hard-coded
+    # '\\' is only a path separator on Windows.
+    with open(os.path.join(data_path, fname), 'r') as infile:
+        temp_lines = infile.readlines()
+
+    # The first line is dropped (presumably the column header row).
+    rows = [temp_line.split(sep=',') for temp_line in temp_lines[1:]]
+
+    # Dictionary of nuclear charge.
+    return {row[2]: int(row[0]) for row in rows}
+
+
+
+# 'hof_qm7.txt' retrieved from
+# https://github.com/qmlcode/tutorial
+def reas_db_data(zi_data,
+                 data_path,
+                 r_seed=111):
+    """
+    Reads 'hof_qm7.txt' and the xyz files it lists, in shuffled order.
+    zi_data: dictionary containing nuclear charge data.
+    data_path: data directory; the process chdir's into it (not undone).
+    r_seed: random seed used to shuffle the molecule order.
+    Returns: molecules, nuclear_charge, energy_pbe0, energy_delta.
+    """
+    os.chdir(data_path)
+
+    fname = 'hof_qm7.txt'
+    with open(fname, 'r') as infile:
+        lines = infile.readlines()
+
+    # Maps every xyz file name to the array [hof, hof - dftb].
+    energy_temp = dict()
+
+    for line in lines:
+        xyz_data = line.split()
+
+        xyz_name = xyz_data[0]
+        hof = float(xyz_data[1])
+        dftb = float(xyz_data[2])
+        # Whitespace separated columns: xyz file name, hof, dftb.
+
+        energy_temp[xyz_name] = np.array([hof, hof - dftb])
+
+    # Seed the shuffle so that the same r_seed gives the same order.
+    random.seed(r_seed)
+
+    et_keys = list(energy_temp.keys())
+    random.shuffle(et_keys)
+
+    # Same entries re-inserted in shuffled order (dicts keep that order).
+    energy_temp_shuffled = dict()
+    for key in et_keys:
+        energy_temp_shuffled.update({key: energy_temp[key]})
+
+    mol_data = []
+    mol_nc_data = []
+    # Read each xyz file: skip 2 lines, then 'label x y z' for each atom.
+    for i, k in enumerate(energy_temp_shuffled.keys()):
+        with open(k, 'r') as xyz_file:
+            lines = xyz_file.readlines()
+
+        len_lines = len(lines)
+        mol_temp_data = []
+        mol_nc_temp_data = np.array(np.zeros(len_lines-2))
+        for j, line in enumerate(lines[2:len_lines]):
+            line_list = line.split()
+
+            mol_nc_temp_data[j] = float(zi_data[line_list[0]])
+            line_data = np.array(np.asarray(line_list[1:4], dtype=float))
+            mol_temp_data.append(line_data)
+
+        mol_data.append(mol_temp_data)
+        mol_nc_data.append(mol_nc_temp_data)
+
+    # Convert everything to numpy arrays, one entry per molecule.
+    molecules = np.array([np.array(mol) for mol in mol_data])
+    nuclear_charge = np.array([nc_d for nc_d in mol_nc_data])
+    energy_pbe0 = np.array([energy_temp_shuffled[k][0]
+                            for k in energy_temp_shuffled.keys()])
+    energy_delta = np.array([energy_temp_shuffled[k][1]
+                             for k in energy_temp_shuffled.keys()])
+
+    return molecules, nuclear_charge, energy_pbe0, energy_delta
+
+
+def read_qm7_data():
+    """
+    Reads all the qm7 data from the 'data' directory under the cwd.
+    """
+    tic = time.perf_counter()
+    printc('Data reading started.', 'CYAN')
+
+    init_path = os.getcwd()
+    os.chdir('data')
+    try:
+        data_path = os.getcwd()
+        zi_data = read_nc_data(data_path)
+        molecules, nuclear_charge, energy_pbe0, energy_delta = \
+            reas_db_data(zi_data, data_path)
+    finally:
+        os.chdir(init_path)  # Restore the cwd even if reading fails.
+    toc = time.perf_counter()
+    printc('\tData reading took {:.4f} seconds.'.format(toc-tic), 'GREEN')
+
+    return zi_data, molecules, nuclear_charge, energy_pbe0, energy_delta
-- 
cgit v1.2.3-70-g09d2