Initial import

Signed-off-by: Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
author: Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org> 2023-12-26 16:01:56 +0100
committer: Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org> 2023-12-31 13:11:42 +0100
commit: dac22e627f3c716201556537849743387464c73d (patch)
tree: a79c1eaeadc0697f72075a2f88a0ab6188bd90cf /convert.py
download: haunt-blog-dac22e627f3c716201556537849743387464c73d.tar.gz
haunt-blog-dac22e627f3c716201556537849743387464c73d.tar.bz2
haunt-blog-dac22e627f3c716201556537849743387464c73d.zip
1 files changed, 226 insertions, 0 deletions
diff --git a/convert.py b/convert.py
new file mode 100755
index 0000000..776fe5b
--- /dev/null
+++ b/convert.py
@@ -0,0 +1,226 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# Copyright (C) 2020-2022 Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+from bs4 import BeautifulSoup
+from html2text import config, HTML2Text
+
+try:
+    # wrapwrite() has been removed in more recent
+    # versions of python-html2text. See commit
+    # b361467894fb277563b4547ec9d4df49f5e0c6e3
+    # (b361467 Remove support for Python ≤ 3.4)
+    # in https://github.com/Alir3z4/html2text.git
+    from html2text.utils import wrapwrite
+except ImportError:
+    # Only a missing wrapwrite is expected here. main() copes with the name
+    # being undefined and writes to sys.stdout directly in that case.
+    pass
+
+import os
+import re
+import sh
+import sys
+
+def usage(progname):
+    """Print the expected invocation of progname, then exit with status 1."""
+    synopsis = "{} path/to/file.html".format(progname)
+    print(synopsis)
+    raise SystemExit(1)
+
+# A "[1]" in the html becomes "[[1]][6]" in text.
+# As we already use references at the end, a "[6]" is enough.
+def fix_wordpress_references_link(string):
+    """Collapse WordPress footnote links such as "[[1]][6]" into "[6]".
+
+    Whitespace is tolerated between every bracket and number. Only the
+    leading "[[n]]" part is dropped; the trailing "[m]" reference and any
+    whitespace following it are kept as they are.
+
+    string: the text to clean up.
+    Returns the cleaned-up text.
+    """
+    # Raw strings keep \s and \d as regex escapes instead of relying on
+    # Python leaving unknown string escapes untouched, which recent Python
+    # versions warn about.
+    # [ [ 1 ] ]
+    wordpress_part = r'\[\s*\[\s*\d+\s*\]\s*\]\s*'
+    # [ 6 ]
+    reference_part = r'\[\s*\d+\s*\]\s*'
+
+    # One substitution pass: match the whole "[[1]][6]" construct and keep
+    # only the captured reference.
+    wordpress_link_regex = wordpress_part + '(' + reference_part + ')'
+    return re.sub(wordpress_link_regex, r'\1', string)
+
+def fix_alignment(string):
+    """Remove the two-space indentation at the start of each line of string.
+
+    Every line, including the empty one that follows a final newline, is
+    written back with a newline appended, so the result always contains one
+    more newline than string.
+
+    string: the text to realign.
+    Returns the realigned text.
+    """
+    # Text held in memory uses "\n" whatever the platform is, so that is what
+    # lines are split on and terminated with; os.linesep would not match on
+    # platforms where it is "\r\n".
+    return ''.join(
+        re.sub('^  ', '', line) + '\n'
+        for line in string.split('\n'))
+
+# Emacs breaks lists when doing a fill-paragraph to adjust a paragraph to the
+# maximum width so we make sure that there is at least one blank line before
+# the '*'
+def fix_lists(string):
+    """Insert a blank line before each '*' that follows exactly one newline.
+
+    A '*' that is already preceded by a blank line (two or more newlines),
+    or that is not the first character of a line, is left alone.
+
+    string: the text to fix.
+    Returns the fixed text.
+    """
+    # (?<!\n)\n matches a newline that is not itself preceded by a newline,
+    # and (?=\*) requires the next character to be a '*' without consuming
+    # it. "\n" is used rather than os.linesep because text held in memory
+    # uses "\n" on every platform.
+    return re.sub(r'(?<!\n)\n(?=\*)', '\n\n', string)
+
+def convert(html_file_path):
+    """Convert the content of the article in html_file_path to text.
+
+    html_file_path: path to the HTML file to convert.
+    Returns the text of the <div class="entry-content"> found inside the
+    <article> of the page, post-processed by
+    fix_wordpress_references_link(), fix_alignment() and fix_lists().
+    Exits with status 1 if neither the html5lib nor the lxml parser is
+    available, or if the page has no such <article> or <div>.
+    """
+    # bs4 is already imported at the top of this file; FeatureNotFound is the
+    # exception BeautifulSoup raises when the requested parser is missing.
+    from bs4 import FeatureNotFound
+
+    with open(html_file_path) as html_file:
+        # html5lib is tried first. For some reason the lxml parser isn't
+        # found with python-beautifulsoup4 4.9.3-3.0 on Parabola. It's
+        # probably better to use an html5 parser anyway as the Replicant
+        # blog (now?) uses the html doctype and the theme seems to include
+        # an html5.js file for the IE 9 browser.
+        try:
+            soup = BeautifulSoup(html_file, features="html5lib").article
+        except FeatureNotFound:
+            try:
+                soup = BeautifulSoup(html_file, features="lxml").article
+            except FeatureNotFound:
+                print("Cannot find html5lib or lxml parsers",
+                      file=sys.stderr)
+                sys.exit(1)
+
+    # Fail with a message instead of an AttributeError on None when the page
+    # doesn't have the expected structure.
+    article = None
+    if soup is not None:
+        article = soup.find('div', class_='entry-content')
+    if article is None:
+        print("Cannot find the article content in {}".format(html_file_path),
+              file=sys.stderr)
+        sys.exit(1)
+
+    # Format the output to be compatible with mail conventions but make sure
+    # that the links are not split between two lines.
+    # The options are set on the parser itself rather than on the global
+    # html2text.config module, so they apply to this parser only.
+    # NOTE(review): html2text appears to bind config.BODY_WIDTH as a default
+    # argument when it is imported, in which case assigning it afterwards
+    # had no effect on the wrapping width; confirm against the installed
+    # html2text version.
+    parser = HTML2Text()
+    parser.inline_links = False
+    parser.protect_links = True
+    parser.wrap_list_items = True
+    parser.body_width = 70
+
+    text = parser.handle(article.decode())
+
+    text = fix_wordpress_references_link(text)
+    text = fix_alignment(text)
+    text = fix_lists(text)
+
+    return text
+
+def _get_metadata(html_file_path, func):
+    """Parse html_file_path and return the result of func(soup).
+
+    html_file_path: path to the HTML file to parse.
+    func: callable taking the BeautifulSoup object of the whole document.
+    Returns whatever func returns.
+    Exits with status 1 if neither the html5lib nor the lxml parser is
+    available.
+    """
+    # bs4 is already imported at the top of this file; FeatureNotFound is the
+    # exception BeautifulSoup raises when the requested parser is missing.
+    from bs4 import FeatureNotFound
+
+    with open(html_file_path) as html_file:
+        try:
+            soup = BeautifulSoup(html_file, features="html5lib")
+        except FeatureNotFound:
+            try:
+                # For some reason the lxml parser isn't found with
+                # python-beautifulsoup4 4.9.3-3.0 on Parabola. It's
+                # probably better to use an html5 parser anyway as the
+                # Replicant blog (now?) uses the html doctype and the
+                # theme seems to include an html5.js file for the IE 9
+                # browser.
+                #
+                # The whole document is kept here, as with html5lib: func
+                # receives the same kind of object whichever parser was
+                # used, and can look up soup.article or soup.title itself.
+                soup = BeautifulSoup(html_file, features="lxml")
+            except FeatureNotFound:
+                print("Cannot find html5lib or lxml parsers",
+                      file=sys.stderr)
+                sys.exit(1)
+        return func(soup)
+
+def get_metadata(html_file_path):
+    """Build the metadata header of the post stored in html_file_path.
+
+    The header is made of a "date: ..." line, a "title: ..." line and, when
+    the post has tags, a "tags: ..." line, followed by a "---" line. Each
+    line is terminated by os.linesep.
+
+    html_file_path: path to the HTML file of the post.
+    Returns the header as a string.
+    Raises ValueError if the post has no date, has several different dates,
+    or doesn't have exactly one <footer class="entry-meta">.
+    """
+    def get_date(soup):
+        # The date is the datetime attribute of the <time class="entry-date">
+        # elements found inside the links of the article. There may be
+        # several of them but they must all agree.
+        date_metadata = None
+        for entry in soup.article.find_all('a'):
+            for date_element in entry.find_all('time', class_='entry-date'):
+                new_date = date_element.get('datetime', None)
+                if not new_date:
+                    continue
+                if date_metadata is not None and date_metadata != new_date:
+                    raise ValueError(
+                        "Conflicting dates in {}: {} and {}".format(
+                            html_file_path, date_metadata, new_date))
+                date_metadata = new_date
+        return date_metadata
+
+    def get_tags(soup):
+        footers = soup.article.find_all('footer', class_='entry-meta')
+        if len(footers) != 1:
+            raise ValueError(
+                "Expected one entry-meta footer in {}, found {}".format(
+                    html_file_path, len(footers)))
+        # Every link of the footer but the permalink is a tag. link.string
+        # is None when the link holds more than a single string; such links
+        # are skipped as join() cannot handle None.
+        texts = [link.string for link in footers[0].find_all('a')]
+        return ', '.join(
+            text for text in texts
+            if text is not None and text != 'permalink')
+
+    def get_title(soup):
+        title = soup.title.string
+        title = title.replace(os.linesep, '')
+        title = title.replace('\t', '')
+        # Drop everything from the first '|' on.
+        title = re.sub(r'\|.*', '', title)
+        return title.strip()
+
+    # Parse the file once and run the three extractors on the same document.
+    date_metadata, title_metadata, tags_metadata = _get_metadata(
+        html_file_path,
+        lambda soup: (get_date(soup), get_title(soup), get_tags(soup)))
+
+    if date_metadata is None:
+        raise ValueError("No date found in {}".format(html_file_path))
+
+    metadata = "date: {}".format(date_metadata) + os.linesep
+    metadata += "title: {}".format(title_metadata) + os.linesep
+    if tags_metadata:
+        # The tags belong to the returned header like the other fields;
+        # they are not printed from here.
+        metadata += "tags: {}".format(tags_metadata) + os.linesep
+    metadata += "---" + os.linesep
+
+    return metadata
+
+def main():
+    """Convert the HTML file given on the command line and write the result.
+
+    The metadata header followed by the converted text is written to the
+    standard output. The usage is printed instead when the command line
+    doesn't consist of exactly one argument.
+    """
+    if len(sys.argv) != 2:
+        usage(sys.argv[0])
+
+    html_file_path = sys.argv[1]
+
+    text = get_metadata(html_file_path)
+    text += convert(html_file_path)
+
+    # wrapwrite only exists when the installed html2text still provides it.
+    # Looking the name up, instead of catching whatever a failed call raises,
+    # avoids hiding real write errors and writing the text a second time.
+    write = globals().get('wrapwrite', sys.stdout.write)
+    write(text)
+
+if __name__ == '__main__':
+    main()
author	Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>	2023-12-26 16:01:56 +0100
committer	Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>	2023-12-31 13:11:42 +0100
commit	dac22e627f3c716201556537849743387464c73d (patch)
tree	a79c1eaeadc0697f72075a2f88a0ab6188bd90cf /convert.py
download	haunt-blog-dac22e627f3c716201556537849743387464c73d.tar.gz haunt-blog-dac22e627f3c716201556537849743387464c73d.tar.bz2 haunt-blog-dac22e627f3c716201556537849743387464c73d.zip