aboutsummaryrefslogtreecommitdiffstats
path: root/convert.py
diff options
context:
space:
mode:
authorDenis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>2023-12-26 16:01:56 +0100
committerDenis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>2023-12-31 13:11:42 +0100
commitdac22e627f3c716201556537849743387464c73d (patch)
treea79c1eaeadc0697f72075a2f88a0ab6188bd90cf /convert.py
downloadhaunt-blog-dac22e627f3c716201556537849743387464c73d.tar.gz
haunt-blog-dac22e627f3c716201556537849743387464c73d.tar.bz2
haunt-blog-dac22e627f3c716201556537849743387464c73d.zip
Initial import
Signed-off-by: Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
Diffstat (limited to 'convert.py')
-rwxr-xr-xconvert.py226
1 files changed, 226 insertions, 0 deletions
diff --git a/convert.py b/convert.py
new file mode 100755
index 0000000..776fe5b
--- /dev/null
+++ b/convert.py
@@ -0,0 +1,226 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# Copyright (C) 2020-2022 Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+from bs4 import BeautifulSoup
+from html2text import config, HTML2Text
+
try:
    # This has been removed in more recent
    # versions of python-html2text. See commit
    # b361467894fb277563b4547ec9d4df49f5e0c6e3
    # (b361467 Remove support for Python ≤ 3.4)
    # in https://github.com/Alir3z4/html2text.git
    from html2text.utils import wrapwrite
except ImportError:
    # Only a missing helper is expected here; any other failure should
    # surface instead of being silently discarded.
    pass
+
+import os
+import re
+import sh
+import sys
+
def usage(progname):
    """Print the command-line synopsis and terminate with exit status 1.

    Args:
        progname: Name the program was invoked as (normally sys.argv[0]).

    Raises:
        SystemExit: Always, with exit code 1.
    """
    # The converted document is written to stdout, so the usage error goes
    # to stderr to keep it out of redirected output.
    print("{} path/to/file.html".format(progname), file=sys.stderr)
    sys.exit(1)
+
+# A "[1]" in the html becomes "[[1]][6]" in text.
+# As we already uses references at the end a [6] would
+# be enough.
def fix_wordpress_references_link(string):
    """Collapse "[[1]][6]" style links into the bare "[6]" reference.

    Whitespace is tolerated around every bracket and number. The leading
    "[[N]]" part and the whitespace following it are dropped; the trailing
    "[M]" reference and any whitespace after it are kept unchanged.

    Args:
        string: Text in which to rewrite the links.

    Returns:
        The text with every such link reduced to its trailing reference.
    """
    # Raw strings: '\s' and '\d' are not valid escapes in a normal literal.
    wordpress_link_regex = (
        r'\[\s*\[\s*\d+\s*\]\s*\]\s*'  # [ [ 1 ] ]
        r'(\[\s*\d+\s*\]\s*)'          # [ 6 ]
    )
    return re.sub(wordpress_link_regex, r'\1', string)
+
def fix_alignment(string):
    """Remove one leading space from every line of the text.

    Args:
        string: Text whose lines are separated by "\\n".

    Returns:
        The re-aligned text. Every line, including the last one, is
        terminated by "\\n", so the result always ends with a newline.
    """
    # In-memory text uses "\n" whatever the platform; os.linesep is only
    # meaningful for files opened in binary mode.
    # NOTE(review): exactly one space is stripped, as in the code being
    # replaced -- confirm that this matches the indentation html2text emits.
    return ''.join(
        (line[1:] if line.startswith(' ') else line) + '\n'
        for line in string.split('\n')
    )
+
+# Emacs breaks lists when doing a fill-paragraph to adjust a paragraph to the
+# maximum width so we make sure that there is at least one blank line before
+# the '*'
def fix_lists(string):
    """Ensure a blank line precedes every '*' that starts a line.

    A '*' preceded by exactly one newline gets a second newline inserted
    in front of it. A '*' at the very start of the text, in the middle of
    a line, or already preceded by two or more newlines is left alone.

    Args:
        string: Text whose lines are separated by "\\n".

    Returns:
        The text with the extra newlines inserted.
    """
    # Match on "\n" rather than os.linesep: a single character can never
    # equal a two-character separator such as "\r\n".
    return re.sub(r'(?<!\n)\n(?=\*)', '\n\n', string)
+
def convert(html_file_path):
    """Convert the article body of a saved blog page to plain text.

    The 'entry-content' div inside the page's <article> element is rendered
    with html2text and then post-processed by
    fix_wordpress_references_link(), fix_alignment() and fix_lists().

    Args:
        html_file_path: Path of the HTML file to convert.

    Returns:
        The converted text.

    Raises:
        SystemExit: With status 1 when neither the html5lib nor the lxml
            parser is available, or when the page has no article content.
    """
    # Imported locally so this function does not depend on a change to the
    # file-level imports.
    from bs4 import FeatureNotFound

    # Read the markup once so that the fallback parser sees the complete
    # document instead of an already consumed file object.
    with open(html_file_path) as html_file:
        markup = html_file.read()

    # html5lib is tried first: the lxml parser isn't found with
    # python-beautifulsoup4 4.9.3-3.0 on Parabola, and an html5 parser is
    # probably the better match anyway as the Replicant blog uses the html
    # doctype.
    for features in ("html5lib", "lxml"):
        try:
            soup = BeautifulSoup(markup, features=features)
            break
        except FeatureNotFound:
            continue
    else:
        print("Cannot find html5lib or lxml parsers", file=sys.stderr)
        sys.exit(1)

    article = soup.article
    content = None
    if article is not None:
        content = article.find('div', class_='entry-content')
    if content is None:
        print("Cannot find the article content in {}".format(html_file_path),
              file=sys.stderr)
        sys.exit(1)

    # Format the output to be compatible with mail conventions but make sure
    # that the links are not split between two lines.
    # The options are set on the instance: HTML2Text binds its default body
    # width when the module is imported, so assigning config.BODY_WIDTH
    # afterwards does not change the wrapping width.
    parser = HTML2Text()
    parser.inline_links = False
    parser.protect_links = True
    parser.wrap_list_items = True
    parser.body_width = 70

    text = parser.handle(content.decode())

    text = fix_wordpress_references_link(text)
    text = fix_alignment(text)
    text = fix_lists(text)

    return text
+
def _get_metadata(html_file_path, func):
    """Parse an HTML file and return the result of func on the parsed tree.

    Args:
        html_file_path: Path of the HTML file to parse.
        func: Callable taking the whole parsed document (a BeautifulSoup
            object) and returning the extracted value.

    Returns:
        Whatever func returns.

    Raises:
        SystemExit: With status 1 when neither the html5lib nor the lxml
            parser is available.
    """
    # Imported locally so this function does not depend on a change to the
    # file-level imports.
    from bs4 import FeatureNotFound

    # Read the markup once so that the fallback parser sees the complete
    # document instead of an already consumed file object.
    with open(html_file_path) as html_file:
        markup = html_file.read()

    # html5lib is tried first: the lxml parser isn't found with
    # python-beautifulsoup4 4.9.3-3.0 on Parabola, and an html5 parser is
    # probably the better match anyway as the Replicant blog uses the html
    # doctype.
    # Both parsers hand the whole document to func, so callbacks can rely on
    # soup.article and soup.title whichever parser was used.
    for features in ("html5lib", "lxml"):
        try:
            soup = BeautifulSoup(markup, features=features)
            break
        except FeatureNotFound:
            continue
    else:
        print("Cannot find html5lib or lxml parsers", file=sys.stderr)
        sys.exit(1)

    return func(soup)
+
def get_metadata(html_file_path):
    """Build the metadata header for a saved blog page.

    The header consists of a "date:" line, a "title:" line, an optional
    "tags:" line and a closing "---" line, each terminated by a newline.

    Args:
        html_file_path: Path of the HTML file to read.

    Returns:
        The metadata header as a single string.

    Raises:
        ValueError: If the article has no date, carries conflicting dates,
            or does not have exactly one 'entry-meta' footer.
    """

    def get_date(soup):
        # The date is the 'datetime' attribute of the <time class="entry-date">
        # elements found inside the article's links; all of them must agree.
        date_metadata = None
        for entry in soup.article.find_all('a'):
            for date_element in entry.find_all('time', class_='entry-date'):
                new_date = date_element.get('datetime', None)
                if not new_date:
                    continue
                if date_metadata is not None and date_metadata != new_date:
                    raise ValueError(
                        "Conflicting dates in {}: {} and {}".format(
                            html_file_path, date_metadata, new_date))
                date_metadata = new_date
        return date_metadata

    def get_tags(soup):
        # Every link of the article footer is a tag, except the permalink.
        footers = soup.article.find_all('footer', class_='entry-meta')
        if len(footers) != 1:
            raise ValueError(
                "Expected exactly one entry-meta footer in {}, found {}".format(
                    html_file_path, len(footers)))

        results = []
        for link in footers[0].find_all('a'):
            text = link.string
            if text is None:
                # .string is None when the link has several children;
                # ', '.join() would fail on it.
                text = link.get_text()
            if text != 'permalink':
                results.append(text)

        return ', '.join(results)

    def get_title(soup):
        # Keep only what precedes the first '|' of the <title> text.
        title = soup.title.string
        title = title.replace('\n', '').replace('\t', '')
        return title.partition('|')[0].strip()

    def get_all(soup):
        return get_date(soup), get_title(soup), get_tags(soup)

    # A single parse serves all three extractors.
    date_metadata, title_metadata, tags_metadata = _get_metadata(
        html_file_path, get_all)

    if date_metadata is None:
        raise ValueError("No date found in {}".format(html_file_path))

    lines = [
        "date: {}".format(date_metadata),
        "title: {}".format(title_metadata),
    ]
    if tags_metadata:
        # Returned as part of the header rather than printed, so the tags
        # cannot be separated from the rest of the metadata in the output.
        lines.append("tags: {}".format(tags_metadata))
    lines.append("---")

    return "\n".join(lines) + "\n"
+
def main():
    """Convert the HTML file named on the command line and write the result
    (metadata header followed by the article text) to standard output.

    Exits through usage() unless exactly one argument is given.
    """
    if len(sys.argv) != 2:
        usage(sys.argv[0])

    html_file_path = sys.argv[1]

    text = get_metadata(html_file_path)
    text += convert(html_file_path)

    # wrapwrite only exists when the optional import at the top of the file
    # succeeded. Only that lookup is guarded, so an error raised while
    # writing cannot make the text be written a second time.
    try:
        write = wrapwrite
    except NameError:
        write = sys.stdout.write
    write(text)


if __name__ == '__main__':
    main()