diff options
| author | Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org> | 2023-12-26 16:01:56 +0100 |
|---|---|---|
| committer | Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org> | 2023-12-31 13:11:42 +0100 |
| commit | dac22e627f3c716201556537849743387464c73d (patch) | |
| tree | a79c1eaeadc0697f72075a2f88a0ab6188bd90cf /convert.py | |
| download | haunt-blog-dac22e627f3c716201556537849743387464c73d.tar.gz haunt-blog-dac22e627f3c716201556537849743387464c73d.tar.bz2 haunt-blog-dac22e627f3c716201556537849743387464c73d.zip | |
Initial import
Signed-off-by: Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
Diffstat (limited to 'convert.py')
| -rwxr-xr-x | convert.py | 226 |
1 files changed, 226 insertions, 0 deletions
#!/usr/bin/env python3
# encoding: utf-8
#
# Copyright (C) 2020-2022 Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""Convert a saved blog post (HTML) to text with a metadata header.

The script reads one HTML file, extracts the ``div.entry-content`` element
found inside the page's ``<article>``, converts it to text with html2text,
and writes the result to standard output preceded by a ``date:`` /
``title:`` / ``tags:`` header terminated by a ``---`` line.
"""

import os
import re
import sys

# Not referenced by the code in this file; the import is kept so that the
# module's dependencies stay exactly as they were.
import sh
from bs4 import BeautifulSoup, FeatureNotFound
from html2text import config, HTML2Text

try:
    # This has been removed in more recent
    # versions of python-html2text. See commit
    # b361467894fb277563b4547ec9d4df49f5e0c6e3
    # (b361467 Remove support for Python ≤ 3.4)
    # in https://github.com/Alir3z4/html2text.git
    from html2text.utils import wrapwrite
except ImportError:
    # main() falls back to sys.stdout.write() when this is None.
    wrapwrite = None

# Matches "[[1]][6]" (optionally with whitespace between the tokens).
# Group 1 is the trailing "[6]" reference together with the whitespace
# that follows it.
_WORDPRESS_REFERENCE_RE = re.compile(
    r'\[\s*\[\s*\d+\s*\]\s*\]\s*'
    r'(\[\s*\d+\s*\]\s*)'
)


def usage(progname):
    """Print the command line synopsis and exit with status 1."""
    print("{} path/to/file.html".format(progname))
    sys.exit(1)


# A "[1]" in the html becomes "[[1]][6]" in text.
# As we already use references at the end, a [6] would
# be enough.
def fix_wordpress_references_link(string):
    """Return *string* with every "[[N]][M]" shortened to "[M]".

    Whitespace following the "[M]" part is preserved.
    """
    return _WORDPRESS_REFERENCE_RE.sub(r'\1', string)


def fix_alignment(string):
    """Strip the leading space of every line of *string*.

    The input is split on ``os.linesep`` and every resulting line is
    terminated with ``os.linesep`` in the output, so the result always
    ends with a line separator.
    """
    # NOTE(review): the pattern below removes a single leading space, as
    # visible in the captured source; whitespace may have been collapsed
    # by the capture, so confirm the intended width against upstream.
    return ''.join(
        re.sub(r'^ ', '', line) + os.linesep
        for line in string.split(os.linesep)
    )


# Emacs breaks lists when doing a fill-paragraph to adjust a paragraph to the
# maximum width so we make sure that there is at least one blank line before
# the '*'
def fix_lists(string):
    """Insert a blank line before a '*' that follows exactly one newline.

    The text is scanned character by character; a '*' that is preceded by
    exactly one ``os.linesep`` character gets an extra ``os.linesep``
    inserted in front of it.
    """
    pieces = []
    nr_lineseps_before_star = 0
    for c in string:
        if c == '*' and nr_lineseps_before_star == 1:
            pieces.append(os.linesep)

        if c == os.linesep:
            nr_lineseps_before_star += 1
        else:
            nr_lineseps_before_star = 0

        pieces.append(c)
    return ''.join(pieces)


def _parse_html(html_file):
    """Parse *html_file* and return the whole BeautifulSoup document.

    html5lib is tried first, then lxml.  For some reason the lxml parser
    isn't found with python-beautifulsoup4 4.9.3-3.0 on Parabola.  It's
    probably better to use an html5 parser anyway as the Replicant blog
    (now?) uses the html doctype and the theme seems to include an
    html5.js file for the IE 9 browser.

    Only a missing parser triggers the fallback; any other error raised
    while parsing is propagated.  If neither parser is available a message
    is printed and the process exits with status 1.
    """
    for parser_name in ("html5lib", "lxml"):
        try:
            return BeautifulSoup(html_file, features=parser_name)
        except FeatureNotFound:
            continue

    print("Cannot find html5lib or lxml parsers")
    sys.exit(1)


def convert(html_file_path):
    """Return the post body of *html_file_path* converted to text.

    The body is the ``div`` with class ``entry-content`` inside the page's
    ``<article>`` element.
    """
    with open(html_file_path) as html_file:
        soup = _parse_html(html_file).article

    # Format the output to be compatible with mail conventions but make sure
    # that the links are not split between two lines
    config.INLINE_LINKS = False
    config.PROTECT_LINKS = True
    config.WRAP_LIST_ITEMS = True
    config.BODY_WIDTH = 70

    parser = HTML2Text()
    # In html2text releases where the width is a default argument of
    # HTML2Text.__init__ (bodywidth=config.BODY_WIDTH), that default is
    # bound at import time, so the assignment above alone has no effect.
    # Setting the instance attribute applies the width in every case.
    parser.body_width = config.BODY_WIDTH

    article = soup.find('div', class_='entry-content')
    text = parser.handle(article.decode())

    text = fix_wordpress_references_link(text)
    text = fix_alignment(text)
    text = fix_lists(text)

    return text


def _get_metadata(html_file_path, func):
    """Parse *html_file_path* and return ``func(soup)``.

    *func* always receives the whole parsed document, whichever parser was
    used, so it can access both ``soup.article`` and ``soup.title``.
    """
    with open(html_file_path) as html_file:
        soup = _parse_html(html_file)
    return func(soup)


def get_metadata(html_file_path):
    """Return the metadata header for the post in *html_file_path*.

    The header is made of a ``date:`` line, a ``title:`` line, a ``tags:``
    line (only when the post has tags) and a closing ``---`` line, each
    terminated by ``os.linesep``.
    """

    def get_date(soup):
        # All the <time class="entry-date"> elements found inside the
        # article's links are expected to carry the same datetime.
        date_metadata = None
        for entry in soup.article.find_all('a'):
            for date_element in entry.find_all('time', class_='entry-date'):
                if date_element.get('datetime', None):
                    new_date = date_element['datetime']
                    assert date_metadata is None or date_metadata == new_date
                    date_metadata = new_date
        return date_metadata

    def get_tags(soup):
        tags = soup.article.find_all('footer', class_='entry-meta')
        assert len(tags) == 1
        results = []
        for link in tags[0].find_all('a'):
            text = link.string
            # .string is None when the link has no single text child;
            # such links cannot be joined below, so they are skipped.
            if text is None or text == 'permalink':
                continue
            results.append(text)

        return ', '.join(results)

    def get_title(soup):
        title = soup.title.string
        title = title.replace(os.linesep, '')
        title = title.replace('\t', '')
        # Drop everything from the first '|' onwards.
        title = re.sub(r'\|.*', '', title)
        return title.strip()

    def collect(soup):
        # Extract everything from one parsed document instead of parsing
        # the file once per field.
        return get_date(soup), get_title(soup), get_tags(soup)

    date_metadata, title_metadata, tags_metadata = _get_metadata(
        html_file_path, collect)
    assert date_metadata is not None

    metadata = "date: {}".format(date_metadata) + os.linesep
    metadata += "title: {}".format(title_metadata) + os.linesep
    if tags_metadata:
        # The tags belong to the returned header: printing them separately
        # could place them after the "---" line when the rest of the text
        # is written through a different output layer.
        metadata += "tags: {}".format(tags_metadata) + os.linesep
    metadata += "---" + os.linesep

    return metadata


def main():
    """Convert the HTML file given on the command line and print the result."""
    if len(sys.argv) != 2:
        usage(sys.argv[0])

    html_file_path = sys.argv[1]

    text = get_metadata(html_file_path)
    text += convert(html_file_path)

    if wrapwrite is not None:
        wrapwrite(text)
    else:
        sys.stdout.write(text)


if __name__ == '__main__':
    main()
