src/pyssg/md_parser.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113

import os
from operator import itemgetter
from markdown import Markdown
from logging import Logger, getLogger

from markdown import Markdown
from yafg import YafgExtension
from markdown_checklist.extension import ChecklistExtension

from .database import Database
from .page import Page

log: Logger = getLogger(__name__)


def _get_md_obj() -> Markdown:
    exts: list = ['extra',
                  'meta',
                  'sane_lists',
                  'smarty',
                  'toc',
                  'wikilinks',
                  # stripTitle generates an error when True,
                  # if there is no title attr
                  YafgExtension(stripTitle=False,
                                figureClass="",
                                figcaptionClass="",
                                figureNumbering=False,
                                figureNumberClass="number",
                                figureNumberText="Figure"),
                  ChecklistExtension(),
                  'pymdownx.mark',
                  'pymdownx.caret',
                  'pymdownx.tilde']
    log.debug('list of md extensions: (%s)',
              ', '.join([e if isinstance(e, str) else type(e).__name__
                         for e in exts]))
    # for some reason, the definition for output_format doesn't include html5
    #   even though it is listed in the documentation, ignoring
    return Markdown(extensions=exts, output_format='html5')  # type: ignore


# page and file is basically a synonym
class MDParser:
    def __init__(self, files: list[str],
                 config: dict,
                 dir_config: dict,
                 db: Database):
        log.debug('initializing the md parser with %d files', len(files))
        self.files: list[str] = files
        self.config: dict = config
        self.dir_config: dict = dir_config
        self.db: Database = db
        self.md: Markdown = _get_md_obj()

        self.all_files: list[Page] = []
        self.all_tags: list[tuple[str, str]] = []

    def parse_files(self) -> None:
        log.debug('parsing all files')
        for i, f in enumerate(self.files):
            log.debug('parsing file "%s"', f)
            src_file: str = os.path.join(self.dir_config['src'], f)
            log.debug('path "%s"', src_file)
            self.db.update(src_file, remove=f'{self.dir_config["src"]}/')

            log.debug('parsing md into html')
            content: str = self.md.reset().convert(open(src_file).read())
            # ignoring md.Meta type as it is not yet defined (because it is from an extension)
            page: Page = Page(f,
                              self.db.e[f].ctimestamp,
                              self.db.e[f].mtimestamp,
                              content,
                              self.md.toc, # type: ignore
                              self.md.toc_tokens, # type: ignore
                              self.md.Meta,  # type: ignore
                              self.config,
                              self.dir_config)
            page.parse_metadata()

            log.debug('adding to file list')
            self.all_files.append(page)

            if self.dir_config['tags'] and page.tags is not None:
                log.debug('parsing tags for "%s"', f)
                self.db.update_tags(f, list(map(itemgetter(0), page.tags)))

                log.debug('add all tags to tag list')
                for t in page.tags:
                    if t[0] not in list(map(itemgetter(0), self.all_tags)):
                        log.debug('adding tag "%s"', t[0])
                        self.all_tags.append(t)
                    else:
                        log.debug('ignoring tag "%s"; already present', t[0])
            else:
                log.debug('no tags to parse')

        log.debug('sorting all lists for consistency')
        self.all_files.sort(reverse=True)
        self.all_tags.sort(key=itemgetter(0))

        pages_amount: int = len(self.all_files)
        # note that prev and next are switched because of the
        # reverse ordering of all_pages
        log.debug('update next and prev attributes')
        for i, p in enumerate(self.all_files):
            if i != 0:
                next_page: Page = self.all_files[i - 1]
                p.next = next_page

            if i != pages_amount - 1:
                prev_page: Page = self.all_files[i + 1]
                p.previous = prev_page