diff options
author | Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org> | 2021-04-26 16:44:02 +0200 |
---|---|---|
committer | Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org> | 2021-04-26 19:27:19 +0200 |
commit | 7fa8e944615f9e38d22fcf1afe007d7ca9c92bb9 (patch) | |
tree | 7ae8927ab400dfb6bf8b3d559d8facc1e1bdca48 /release_notes.py | |
parent | 11150894031d0d4135cf7e94b5701491e75f0d51 (diff) | |
download | vendor_replicant-release-scripts-7fa8e944615f9e38d22fcf1afe007d7ca9c92bb9.tar.gz vendor_replicant-release-scripts-7fa8e944615f9e38d22fcf1afe007d7ca9c92bb9.tar.bz2 vendor_replicant-release-scripts-7fa8e944615f9e38d22fcf1afe007d7ca9c92bb9.zip |
release_notes: Handle reference links without names like [1]
Some links in WordPress have text like [1], [2], etc.
When they get converted into text the "[1]" is kept and a
reference is added so we have something like "[[1]][2]" in the
final text.
The fix is to completely remove the [[1]] part as it is not
necessary anymore.
Signed-off-by: Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
Diffstat (limited to 'release_notes.py')
-rwxr-xr-x | release_notes.py | 41 |
1 files changed, 41 insertions, 0 deletions
diff --git a/release_notes.py b/release_notes.py
index 95ae737..85ab84c 100755
--- a/release_notes.py
+++ b/release_notes.py
@@ -34,12 +34,51 @@ try:
 except:
     pass
 
+import os
+import re
 import sys
 
 def usage(progname):
     print("{} path/to/file.html".format(progname))
     sys.exit(1)
 
+# A "[1]" in the html becomes "[[1]][6]" in text.
+# As we already uses references at the end a [6] would
+# be enough.
+def fix_wordpress_references_link(string):
+    open_square_bracket = re.escape('[')
+    close_square_bracket = re.escape(']')
+    whitespaces = '\s*'
+    numbers = '\d+'
+
+    # [ [ 1 ] ] [ 6 ]
+    wordpress_link_regex = \
+        \
+        open_square_bracket + whitespaces \
+        + open_square_bracket + whitespaces \
+        + numbers + whitespaces \
+        + close_square_bracket + whitespaces \
+        + close_square_bracket + whitespaces \
+        \
+        + open_square_bracket + whitespaces \
+        + numbers + whitespaces \
+        + close_square_bracket + whitespaces \
+
+    results = re.findall(wordpress_link_regex, string)
+
+    part_to_remove = '^' \
+        + open_square_bracket + whitespaces \
+        + open_square_bracket + whitespaces \
+        + numbers + whitespaces \
+        + close_square_bracket + whitespaces \
+        + close_square_bracket + whitespaces \
+
+    for result in results:
+        replacement = re.sub(part_to_remove, '', result)
+        string = string.replace(result, replacement)
+
+    return string
+
 def convert(html_file_path):
     with open(html_file_path) as html_file:
         try:
@@ -67,6 +106,8 @@ def convert(html_file_path):
     parser = HTML2Text()
     text = parser.handle(soup.decode())
 
+    text = fix_wordpress_references_link(text)
+
     return text
 
 def main():