release_notes.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138

#!/usr/bin/env python3
# encoding: utf-8
# Copyright (C) 2020 Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

# How to use:
# - Download the release notes from WordPress. If they are not published yet,
#   you can still click on "Preview" and save the resulting page.
# - Point this program at the resulting HTML file; it will generate a text
#   version of it.

from bs4 import BeautifulSoup
from html2text import config, HTML2Text

try:
    # wrapwrite has been removed in more recent
    # versions of python-html2text. See commit
    # b361467894fb277563b4547ec9d4df49f5e0c6e3
    # (b361467 Remove support for Python ≤ 3.4)
    # in https://github.com/Alir3z4/html2text.git
    from html2text.utils import wrapwrite
except ImportError:
    # Expected with recent html2text versions: wrapwrite is then simply
    # left undefined and main() writes to sys.stdout instead. Only
    # ImportError is caught so that unrelated failures are not hidden.
    pass

import os
import re
import sys

def usage(progname):
    """Print the command line synopsis and exit with status 1.

    progname: name to display for this program (normally sys.argv[0]).
    """
    synopsis = "{} path/to/file.html".format(progname)
    print(synopsis)
    raise SystemExit(1)

# A "[1]" in the html becomes "[[1]][6]" in text.
# As we already use references at the end, a "[6]" is
# enough.
def fix_wordpress_references_link(string):
    """Collapse "[[1]][6]" style links into "[6]".

    Every "[[N]]" footnote label that is directly followed by a "[M]"
    reference is dropped, keeping only the "[M]" reference (and the
    whitespace that follows it). Whitespace, including newlines, is
    tolerated between all the brackets and numbers.

    string: the text to clean up.

    Returns the cleaned up text; text without such links is returned
    unchanged.
    """
    # Raw strings: '\s' and '\d' are not valid Python string escapes.
    whitespaces = r'\s*'
    numbers = r'\d+'
    open_square_bracket = re.escape('[') + whitespaces
    close_square_bracket = re.escape(']') + whitespaces

    # [ [ 1 ] ]
    footnote_label = (
        open_square_bracket + open_square_bracket
        + numbers + whitespaces
        + close_square_bracket + close_square_bracket
    )

    # [ 6 ]
    reference = (
        open_square_bracket
        + numbers + whitespaces
        + close_square_bracket
    )

    # A single substitution pass only rewrites the text that actually
    # matched, and only once.
    return re.sub(footnote_label + '(' + reference + ')', r'\1', string)

def fix_alignment(string):
    """Remove two leading spaces from every line that starts with them.

    string: the text to realign.

    Returns the realigned text. Every line, including the last one, is
    terminated by a newline, so the result always ends with one more
    newline than the input had.
    """
    # In-memory text uses "\n" as its line separator on every platform;
    # os.linesep is only meant for data written in binary mode.
    realigned = [
        line[2:] if line.startswith('  ') else line
        for line in string.split('\n')
    ]

    return '\n'.join(realigned) + '\n'

def convert(html_file_path):
    """Convert a saved release notes page to plain text.

    The <div class="entry-content"> element found inside the <article>
    element of the page is rendered with html2text using reference-style
    links, then cleaned up with fix_wordpress_references_link() and
    fix_alignment().

    html_file_path: path to the saved HTML page.

    Returns the resulting text. Prints an error and exits with status 1
    if neither the html5lib nor the lxml parser is available, or if the
    page does not contain the expected elements.
    """
    # Only needed here; bs4 is already a dependency of this file.
    from bs4 import FeatureNotFound

    with open(html_file_path) as html_file:
        html = html_file.read()

    # html5lib is tried first: for some reason the lxml parser isn't
    # found with python-beautifulsoup4 4.9.3-3.0 on Parabola. It's
    # probably better to use an html5 parser anyway as the Replicant
    # blog (now?) uses the html doctype and the theme seems to include
    # an html5.js file for the IE 9 browser.
    soup = None
    for features in ("html5lib", "lxml"):
        try:
            soup = BeautifulSoup(html, features=features)
        except FeatureNotFound:
            # Only a missing parser triggers the fallback; any other
            # error is a real problem and must not be hidden.
            continue
        break

    if soup is None:
        print("Cannot find html5lib or lxml parsers")
        sys.exit(1)

    entry = soup.article
    article = None
    if entry is not None:
        article = entry.find('div', class_='entry-content')
    if article is None:
        print("Cannot find the article content in {}".format(html_file_path))
        sys.exit(1)

    # Format the output to be compatible with mail conventions but make sure
    # that the links are not split between two lines
    config.INLINE_LINKS = False
    config.PROTECT_LINKS = True
    config.WRAP_LIST_ITEMS = True
    config.BODY_WIDTH = 70

    # The default value of the bodywidth argument was bound when html2text
    # was imported, so the width has to be passed explicitly to be used.
    parser = HTML2Text(bodywidth=config.BODY_WIDTH)

    text = parser.handle(article.decode())

    text = fix_wordpress_references_link(text)
    text = fix_alignment(text)

    return text

def main():
    """Convert the HTML file given on the command line and print the text.

    Exactly one argument (the path to the HTML file) is expected;
    otherwise usage() is called, which exits the program.
    """
    if len(sys.argv) != 2:
        usage(sys.argv[0])

    html_file_path = sys.argv[1]

    text = convert(html_file_path)

    # wrapwrite is only defined when the installed html2text still
    # provides it. Pick the writer first and call it outside of the try
    # block, so that an error or an interruption while writing is not
    # swallowed and the text is never written twice.
    try:
        write = wrapwrite
    except NameError:
        write = sys.stdout.write

    write(text)

if __name__ == '__main__':
    main()