aboutsummaryrefslogtreecommitdiffstats
path: root/convert.py
diff options
context:
space:
mode:
authorDenis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>2024-02-28 17:09:10 +0100
committerDenis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>2025-06-15 01:30:21 +0200
commitbbea73d1c21e11e35b252edf41c107536be8e98e (patch)
tree72e9ac4a2e4b87b0877a4d481177860316b60c66 /convert.py
parent048bb1c8ecc552fb4e91d361f108ede65cd7a9b9 (diff)
downloadhaunt-blog-bbea73d1c21e11e35b252edf41c107536be8e98e.tar.gz
haunt-blog-bbea73d1c21e11e35b252edf41c107536be8e98e.tar.bz2
haunt-blog-bbea73d1c21e11e35b252edf41c107536be8e98e.zip
convert.py: remove non-determinism.
Introducing non-deterministic code paths in the conversion is a bad idea because we want the new blog to be as close as possible to the old one. So if humans verify that the old and new websites are similar, the verification needs to be done on top of a deterministic process to be effective. Signed-off-by: Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
Diffstat (limited to 'convert.py')
-rwxr-xr-xconvert.py47
1 files changed, 4 insertions, 43 deletions
diff --git a/convert.py b/convert.py
index 776fe5b..a2eeb20 100755
--- a/convert.py
+++ b/convert.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# encoding: utf-8
#
-# Copyright (C) 2020-2022 Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
+# Copyright (C) 2020-2024 Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -19,16 +19,6 @@
from bs4 import BeautifulSoup
from html2text import config, HTML2Text
-try:
- # This has been removed in more recent
- # versions of python-html2text. See commit
- # b361467894fb277563b4547ec9d4df49f5e0c6e3
- # (b361467 Remove support for Python ≤ 3.4)
- # in https://github.com/Alir3z4/html2text.git
- from html2text.utils import wrapwrite
-except:
- pass
-
import os
import re
import sh
@@ -103,20 +93,7 @@ def fix_lists(string):
def convert(html_file_path):
with open(html_file_path) as html_file:
- try:
- soup = BeautifulSoup(html_file, features="html5lib").article
- except:
- try:
- # For some reason the lxml parser isn't found with
- # python-beautifulsoup4 4.9.3-3.0 on Parabola. It's
- # probably better to use an html5 parser anyway as the
- # Replicant blog (now?) uses the html doctype and the
- # theme seems to include an html5.js file for the IE 9
- # browser.
- soup = BeautifulSoup(html_file, features="lxml").article
- except:
- print("Cannot find html5lib or lxml parsers")
- sys.exit(1)
+ soup = BeautifulSoup(html_file, features="html5lib").article
# Format the output to be compatible with mail conventions but make sure
# that the links are not split between two lines
@@ -138,20 +115,7 @@ def convert(html_file_path):
def _get_metadata(html_file_path, func):
with open(html_file_path) as html_file:
- try:
- soup = BeautifulSoup(html_file, features="html5lib")
- except:
- try:
- # For some reason the lxml parser isn't found with
- # python-beautifulsoup4 4.9.3-3.0 on Parabola. It's
- # probably better to use an html5 parser anyway as the
- # Replicant blog (now?) uses the html doctype and the
- # theme seems to include an html5.js file for the IE 9
- # browser.
- soup = BeautifulSoup(html_file, features="lxml").article
- except:
- print("Cannot find html5lib or lxml parsers")
- sys.exit(1)
+ soup = BeautifulSoup(html_file, features="html5lib")
return func(soup)
def get_metadata(html_file_path):
@@ -217,10 +181,7 @@ def main():
text = get_metadata(html_file_path)
text += convert(html_file_path)
- try:
- wrapwrite(text)
- except:
- sys.stdout.write(text)
+ sys.stdout.write(text)
if __name__ == '__main__':
main()