summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author David Luevano Alvarado <david@luevano.xyz> 2022-04-23 20:47:19 -0600
committer David Luevano Alvarado <david@luevano.xyz> 2022-04-23 20:47:19 -0600
commit dfc3e6db921815416b8edc5892b2a7adfc677a25 (patch)
tree 545c66fa2102c10d3b85c5318e76e446aee30ef6
parent 7ecd3c9501a16da6c1872ed7521f93df9a8da7f5 (diff)
add checksum checking for mod files instead of timestamp (tag: v0.7.0)
-rw-r--r-- ChangeLog 6
-rw-r--r-- README.md 6
-rw-r--r-- src/pyssg/arg_parser.py 3
-rw-r--r-- src/pyssg/builder.py 3
-rw-r--r-- src/pyssg/database.py 165
-rw-r--r-- src/pyssg/pyssg.py 13
-rw-r--r-- src/pyssg/utils.py 18
7 files changed, 156 insertions, 58 deletions
diff --git a/ChangeLog b/ChangeLog
index aca14d5..e480159 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,12 @@
CHANGES
=======
+v0.6.2
+------
+
+* major bugfix in the database writer
+* minor refactoring
+
v0.6.1
------
diff --git a/README.md b/README.md
index 8a56246..3aee523 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,8 @@ Inspired (initially) by Roman Zolotarev's [`ssg5`](https://rgz.ee/bin/ssg5) and
## Features and to-do
+**Please note that since this is a WIP, there will be changes that will break your site setup (the database management, for example). Read the tag notes for any possible break between the version you're using and the one you're updating to.**
+
- [x] Build static site parsing `markdown` files ( `*.md` -> `*.html`)
- [x] ~~Using plain `*.html` files for templates.~~ Changed to Jinja templates.
- [x] Would like to change to something more flexible and easier to manage ([`jinja`](https://jinja.palletsprojects.com/en/3.0.x/), for example).
@@ -23,7 +25,8 @@ Inspired (initially) by Roman Zolotarev's [`ssg5`](https://rgz.ee/bin/ssg5) and
- [x] Avoid the program to freak out when there are directories created in advance.
- [x] Provide more meaningful error messages when you are missing mandatory metadata in your `*.md` files.
- [ ] More complex directory structure to support multiple subdomains and different types of pages.
-- [ ] Add option/change to using an SQL database instead of the custom solution.
+- [ ] Option/change to using an SQL database instead of the custom solution.
+- [x] Checksum checking because the timestamp of the file is not enough.
### Markdown features
@@ -131,6 +134,7 @@ rss_date=%%a, %%d %%b %%Y %%H:%%M:%%S GMT # fixed
sitemap_date=%%Y-%%m-%%d # fixed
[info]
version= # current 'pyssg' version (0.5.1.dev16, for example)
+debug=True/False # depending if --debug was used when executing
rss_run_date= # date the program was run, formatted with 'rss_date'
sitemap_run_date= # date the program was run, formatted with 'sitemap_date'
```
diff --git a/src/pyssg/arg_parser.py b/src/pyssg/arg_parser.py
index ec150fb..2fc6853 100644
--- a/src/pyssg/arg_parser.py
+++ b/src/pyssg/arg_parser.py
@@ -37,6 +37,9 @@ def get_parsed_arguments() -> Namespace:
parser.add_argument('--debug',
action='store_true',
help='''change logging level from info to debug''')
+ parser.add_argument('--add-checksum-to-db',
+ action='store_true',
+ help='''add checksum column to db entries''')
# really not needed, too much bloat and case scenarios to check for,
# instead, just read from config file or default config file
"""
diff --git a/src/pyssg/builder.py b/src/pyssg/builder.py
index 35502b0..6d65187 100644
--- a/src/pyssg/builder.py
+++ b/src/pyssg/builder.py
@@ -83,7 +83,8 @@ class Builder:
dir_path: str = None
for d in self.dirs:
dir_path = os.path.join(self.config.get('path', 'dst'), d)
- create_dir(dir_path, True)
+ # using silent=True to not print the info create dir msgs for this
+ create_dir(dir_path, True, True)
def __copy_html_files(self) -> None:
diff --git a/src/pyssg/database.py b/src/pyssg/database.py
index 66c7087..290ba51 100644
--- a/src/pyssg/database.py
+++ b/src/pyssg/database.py
@@ -2,7 +2,6 @@ import os
import sys
from logging import Logger, getLogger
from configparser import ConfigParser
-from tabnanny import check
from .utils import get_checksum
@@ -11,14 +10,15 @@ log: Logger = getLogger(__name__)
# db class that works for both html and md files
class Database:
- __COLUMN_NUM: int = 4
+ __OLD_COLUMN_NUM: int = 4
+ __COLUMN_NUM: int = 5
def __init__(self, db_path: str,
config: ConfigParser):
log.debug('initializing the page db on path "%s"', db_path)
self.db_path: str = db_path
self.config: ConfigParser = config
- self.e: dict[str, tuple[float, float, list[str]]] = dict()
+ self.e: dict[str, tuple[float, float, str, list[str]]] = dict()
# updates the tags for a specific entry (file)
@@ -27,12 +27,12 @@ class Database:
tags: list[str]) -> None:
if file_name in self.e:
log.debug('updating tags for entry "%s"', file_name)
- cts, mts, old_tags = self.e[file_name]
- log.debug('entry "%s" old content: (%s, %s, (%s))',
- file_name, cts, mts, ', '.join(old_tags))
- self.e[file_name] = (cts, mts, tags)
- log.debug('entry "%s" new content: (%s, %s, (%s))',
- file_name, cts, mts, ', '.join(tags))
+ cts, mts, checksum, old_tags = self.e[file_name]
+ log.debug('entry "%s" old content: (%s, %s, %s, (%s))',
+ file_name, cts, mts, checksum, ', '.join(old_tags))
+ self.e[file_name] = (cts, mts, checksum, tags)
+ log.debug('entry "%s" new content: (%s, %s, %s, (%s))',
+ file_name, cts, mts, checksum, ', '.join(tags))
else:
log.error('can\'t update tags for entry "%s",'
' as it is not present in db', file_name)
@@ -51,44 +51,42 @@ class Database:
f = file_name.replace(remove, '')
log.debug('removed "%s" from "%s": "%s"', remove, file_name, f)
-
# get current time, needs actual file name
time: float = os.stat(file_name).st_mtime
log.debug('modified time for "%s": %s', file_name, time)
- # three cases, 1) entry didn't exist,
- # 2) entry hasn't been mod and,
- # 3) entry has been mod
+ # calculate current checksum, also needs actual file name
+ checksum: str = get_checksum(file_name)
+ log.debug('current checksum for "%s": "%s"', file_name, checksum)
+
+ # three cases, 1) entry didn't exist,
+ # 2) entry has been mod and,
+ # 3) entry hasn't been mod
#1)
if f not in self.e:
log.debug('entry "%s" didn\'t exist, adding with defaults', f)
- self.e[f] = (time, 0.0, tags)
+ self.e[f] = (time, 0.0, checksum, tags)
return True
- old_time, old_mod_time, tags = self.e[f]
- log.debug('entry "%s" old content: (%s, %s, (%s))',
- f, old_time, old_mod_time, ', '.join(tags))
+ old_time, old_mod_time, old_checksum, tags = self.e[f]
+ log.debug('entry "%s" old content: (%s, %s, %s, (%s))',
+ f, old_time, old_mod_time, old_checksum, ', '.join(tags))
# 2)
- if old_mod_time == 0.0:
- if time > old_time:
+ if checksum != old_checksum:
+ if old_mod_time == 0.0:
log.debug('entry "%s" has been modified for the first'
' time, updating', f)
- self.e[f] = (old_time, time, tags)
- log.debug('entry "%s" new content: (%s, %s, (%s))',
- f, old_time, time, ', '.join(tags))
- return True
+ else:
+ log.debug('entry "%s" has been modified, updating', f)
+ self.e[f] = (old_time, time, checksum, tags)
+ log.debug('entry "%s" new content: (%s, %s, %s, (%s))',
+ f, old_time, time, checksum, ', '.join(tags))
+ return True
# 3)
else:
- if time > old_mod_time:
- log.debug('entry "%s" has been modified, updating', f)
- self.e[f] = (old_time, time, tags)
- log.debug('entry "%s" new content: (%s, %s, (%s))',
- f, old_time, time, ', '.join(tags))
- return True
-
- log.debug('entry "%s" hasn\'t been modified', f)
- return False
+ log.debug('entry "%s" hasn\'t been modified', f)
+ return False
def write(self) -> None:
@@ -98,54 +96,117 @@ class Database:
log.debug('parsing row for page "%s"', k)
t: str = None
row: str = None
- if len(v[2]) == 0:
+ if len(v[3]) == 0:
t = '-'
else:
- t = ','.join(v[2])
+ t = ','.join(v[3])
- row = f'{k} {v[0]} {v[1]} {t}'
+ row = f'{k} {v[0]} {v[1]} {v[2]} {t}'
log.debug('writing row: "%s\\n"', row)
file.write(f'{row}\n')
- def read(self) -> None:
- log.debug('reading db')
+ def _db_path_exists(self) -> bool:
+ log.debug('checking that "%s" exists or is a file', self.db_path)
if not os.path.exists(self.db_path):
log.warning('"%s" doesn\'t exist, will be'
' created once process finishes,'
' ignore if it\'s the first run', self.db_path)
- return
+ return False
- if os.path.exists(self.db_path) and not os.path.isfile(self.db_path):
+ if not os.path.isfile(self.db_path):
log.error('"%s" is not a file"', self.db_path)
sys.exit(1)
+ return True
+
+
+ def _read_raw(self) -> list[str]:
rows: list[str] = None
with open(self.db_path, 'r') as file:
rows = file.readlines()
- log.info('db contains %d rows', len(rows))
+ log.debug('db contains %d rows', len(rows))
+
+ return rows
+
+
+ def read_old(self) -> None:
+ log.debug('reading db with old schema (%d columns)', self.__OLD_COLUMN_NUM)
+ if not self._db_path_exists():
+ log.error('db path "%s" desn\'t exist, --add-checksum-to-db should'
+ 'only be used when updating the old db schema', self.db_path)
+ sys.exit(1)
+
+ rows: list[str] = self._read_raw()
+ cols: list[str] = None
+ # l=list of values in entry
+ log.debug('parsing rows from db')
+ for it, row in enumerate(rows):
+ i: int = it + 1
+ r: str = row.strip()
+ log.debug('row %d content: "%s"', i, r)
+ # (file_name, ctimestamp, mtimestamp, [tags])
+ cols: tuple[str, float, float, list[str]] = tuple(r.split())
+ col_num: int = len(cols)
+ if col_num != self.__OLD_COLUMN_NUM:
+ log.critical('row %d doesn\'t contain %s columns, contains %d'
+ ' columns: "%s"',
+ i, self.__OLD_COLUMN_NUM, col_num, r)
+ sys.exit(1)
+
+ t: list[str] = None
+ if cols[3] == '-':
+ t = []
+ else:
+ t = cols[3].split(',')
+ log.debug('tag content: (%s)', ', '.join(t))
+ file_path: str = os.path.join(self.config.get('path', 'src'), cols[0])
+ checksum: str = get_checksum(file_path)
+ log.debug('checksum for "%s": "%s"', file_path, checksum)
- # parse each entry and populate accordingly
- l: list[str] = None
+ self.e[cols[0]] = (float(cols[1]), float(cols[2]), checksum, t)
+
+
+
+ def read(self) -> None:
+ log.debug('reading db')
+ if not self._db_path_exists():
+ return
+
+ rows: list[str] = self._read_raw()
+ cols: list[str] = None
# l=list of values in entry
log.debug('parsing rows from db')
for it, row in enumerate(rows):
- i = it + 1
- r = row.strip()
+ i: int = it + 1
+ r: str = row.strip()
log.debug('row %d content: "%s"', i, r)
- l = tuple(r.split())
- if len(l) != self.__COLUMN_NUM:
- log.critical('row %d doesn\'t contain %s columns,'
- ' contains %d elements; row %d content: "%s"',
- i, self.__COLUMN_NUM, len(l), i, r)
+ # (file_name, ctimestamp, mtimestamp, checksum, [tags])
+ cols: tuple[str, float, float, str, list[str]] = tuple(r.split())
+ col_num: int = len(cols)
+ if col_num == self.__OLD_COLUMN_NUM:
+ log.error('row %d contains %d columns: "%s"; this is probably'
+ ' because of missing checksum column, which is used'
+ ' now to also check if a file has changed. Rerun'
+ ' with flag --add-checksum-to-db to add the checksum'
+ ' column to the current db; if you did any changes'
+ ' since last timestamp in db, it won\'t update'
+ ' modification timestamp',
+ i, self.__OLD_COLUMN_NUM, r)
+ sys.exit(1)
+
+ if col_num != self.__COLUMN_NUM:
+ log.critical('row %d doesn\'t contain %s columns, contains %d'
+ ' columns: "%s"',
+ i, self.__COLUMN_NUM, col_num, r)
sys.exit(1)
t: list[str] = None
- if l[3] == '-':
+ if cols[4] == '-':
t = []
else:
- t = l[3].split(',')
+ t = cols[4].split(',')
log.debug('tag content: (%s)', ', '.join(t))
- self.e[l[0]] = (float(l[1]), float(l[2]), t)
+ self.e[cols[0]] = (float(cols[1]), float(cols[2]), cols[3], t)
diff --git a/src/pyssg/pyssg.py b/src/pyssg/pyssg.py
index af7b166..598bf41 100644
--- a/src/pyssg/pyssg.py
+++ b/src/pyssg/pyssg.py
@@ -56,6 +56,7 @@ def main() -> None:
sys.exit(1)
config: ConfigParser = get_parsed_config(config_path)
+ config.set('info', 'debug', str(args['debug']))
if args['init']:
log.info('initializing the directory structure and copying over templates')
@@ -74,8 +75,18 @@ def main() -> None:
copy_file(p, plt_file)
sys.exit(0)
+ if args['add_checksum_to_db']:
+ log.info('adding checksum column to existing db')
+ db_path: str = os.path.join(config.get('path', 'src'), '.files')
+ db: Database = Database(db_path, config)
+ # needs to be read_old instead of read
+ db.read_old()
+ db.write()
+
+ sys.exit(0)
+
if args['build']:
- log.debug('building the html files')
+ log.info('building the html files')
db_path: str = os.path.join(config.get('path', 'src'), '.files')
db: Database = Database(db_path, config)
db.read()
diff --git a/src/pyssg/utils.py b/src/pyssg/utils.py
index ffaf8ba..a41249a 100644
--- a/src/pyssg/utils.py
+++ b/src/pyssg/utils.py
@@ -1,6 +1,7 @@
import os
import sys
import shutil
+from hashlib import md5
from logging import Logger, getLogger
log: Logger = getLogger(__name__)
@@ -54,15 +55,15 @@ def get_dir_structure(path: str,
return [o.replace(path, '')[1:] for o in out]
-def create_dir(path: str, p: bool=False) -> None:
+def create_dir(path: str, p: bool=False, silent=False) -> None:
try:
if p:
os.makedirs(path)
else:
os.mkdir(path)
- log.info('created directory "%s"', path)
+ if not silent: log.info('created directory "%s"', path)
except FileExistsError:
- log.info('directory "%s" already exists, ignoring', path)
+ if not silent: log.info('directory "%s" already exists, ignoring', path)
def copy_file(src: str, dst: str) -> None:
@@ -78,3 +79,14 @@ def sanity_check_path(path: str) -> None:
log.error('"$" character found in path "%s";'
' could be due to non-existant env var.', path)
sys.exit(1)
+
+
+# as seen in SO: https://stackoverflow.com/a/1131238
+def get_checksum(path: str) -> str:
+ log.debug('calculating md5 checksum for "%s"', path)
+ file_hash = md5()
+ with open(path, "rb") as f:
+ while chunk := f.read(4096):
+ file_hash.update(chunk)
+
+ return file_hash.hexdigest() \ No newline at end of file