diff options
| author | Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org> | 2024-02-28 17:09:10 +0100 |
|---|---|---|
| committer | Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org> | 2025-06-15 01:30:21 +0200 |
| commit | bbea73d1c21e11e35b252edf41c107536be8e98e (patch) | |
| tree | 72e9ac4a2e4b87b0877a4d481177860316b60c66 /convert.py | |
| parent | 048bb1c8ecc552fb4e91d361f108ede65cd7a9b9 (diff) | |
| download | haunt-blog-bbea73d1c21e11e35b252edf41c107536be8e98e.tar.gz haunt-blog-bbea73d1c21e11e35b252edf41c107536be8e98e.tar.bz2 haunt-blog-bbea73d1c21e11e35b252edf41c107536be8e98e.zip | |
convert.py: remove non-determinism.
Introducing non-deterministic code paths in the conversion is a bad
idea because we want the new blog to be as close as possible to the
old one.
So if humans verify that the old and new websites are similar, the
verification needs to be done on top of a deterministic process to be
effective.
Signed-off-by: Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
Diffstat (limited to 'convert.py')
| -rwxr-xr-x | convert.py | 47 |
1 files changed, 4 insertions, 43 deletions
@@ -1,7 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # -# Copyright (C) 2020-2022 Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org> +# Copyright (C) 2020-2024 Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org> # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -19,16 +19,6 @@ from bs4 import BeautifulSoup from html2text import config, HTML2Text -try: - # This has been removed in more recent - # versions of python-html2text. See commit - # b361467894fb277563b4547ec9d4df49f5e0c6e3 - # (b361467 Remove support for Python ≤ 3.4) - # in https://github.com/Alir3z4/html2text.git - from html2text.utils import wrapwrite -except: - pass - import os import re import sh @@ -103,20 +93,7 @@ def fix_lists(string): def convert(html_file_path): with open(html_file_path) as html_file: - try: - soup = BeautifulSoup(html_file, features="html5lib").article - except: - try: - # For some reason the lxml parser isn't found with - # python-beautifulsoup4 4.9.3-3.0 on Parabola. It's - # probably better to use an html5 parser anyway as the - # Replicant blog (now?) uses the html doctype and the - # theme seems to include an html5.js file for the IE 9 - # browser. - soup = BeautifulSoup(html_file, features="lxml").article - except: - print("Cannot find html5lib or lxml parsers") - sys.exit(1) + soup = BeautifulSoup(html_file, features="html5lib").article # Format the output to be compatible with mail conventions but make sure # that the links are not split between two lines @@ -138,20 +115,7 @@ def convert(html_file_path): def _get_metadata(html_file_path, func): with open(html_file_path) as html_file: - try: - soup = BeautifulSoup(html_file, features="html5lib") - except: - try: - # For some reason the lxml parser isn't found with - # python-beautifulsoup4 4.9.3-3.0 on Parabola. It's - # probably better to use an html5 parser anyway as the - # Replicant blog (now?) 
uses the html doctype and the - # theme seems to include an html5.js file for the IE 9 - # browser. - soup = BeautifulSoup(html_file, features="lxml").article - except: - print("Cannot find html5lib or lxml parsers") - sys.exit(1) + soup = BeautifulSoup(html_file, features="html5lib") return func(soup) def get_metadata(html_file_path): @@ -217,10 +181,7 @@ def main(): text = get_metadata(html_file_path) text += convert(html_file_path) - try: - wrapwrite(text) - except: - sys.stdout.write(text) + sys.stdout.write(text) if __name__ == '__main__': main() |
