aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDenis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>2021-04-26 16:44:02 +0200
committerDenis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>2021-04-26 19:27:19 +0200
commit7fa8e944615f9e38d22fcf1afe007d7ca9c92bb9 (patch)
tree7ae8927ab400dfb6bf8b3d559d8facc1e1bdca48
parent11150894031d0d4135cf7e94b5701491e75f0d51 (diff)
downloadvendor_replicant-release-scripts-7fa8e944615f9e38d22fcf1afe007d7ca9c92bb9.tar.gz
vendor_replicant-release-scripts-7fa8e944615f9e38d22fcf1afe007d7ca9c92bb9.tar.bz2
vendor_replicant-release-scripts-7fa8e944615f9e38d22fcf1afe007d7ca9c92bb9.zip
release_notes: Handle reference links without names like [1]
Some links in WordPress have text like [1], [2], etc. When they are converted to text, the "[1]" is kept and a reference is added, so we end up with something like "[[1]][2]" in the final text. The fix is to remove the "[[1]]" part completely, as it is no longer necessary. Signed-off-by: Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
-rwxr-xr-xrelease_notes.py41
1 files changed, 41 insertions, 0 deletions
diff --git a/release_notes.py b/release_notes.py
index 95ae737..85ab84c 100755
--- a/release_notes.py
+++ b/release_notes.py
@@ -34,12 +34,51 @@ try:
except:
pass
+import os
+import re
import sys
def usage(progname):
    """Print the command-line usage for *progname* and exit with status 1.

    The message is written to standard output; the function never returns
    because it terminates the interpreter through ``sys.exit``.
    """
    print("{} path/to/file.html".format(progname))
    sys.exit(1)
+# A "[1]" in the html becomes "[[1]][6]" in text.
+# As we already use references at the end, a [6] would
+# be enough.
def fix_wordpress_references_link(string):
    """Collapse "[[1]][6]"-style links in *string* down to "[6]".

    A double-bracketed number that is directly followed by a bracketed
    reference number (whitespace is tolerated between every token) has its
    double-bracketed part removed; the trailing reference is kept verbatim.
    Text that does not match this shape is returned unchanged.

    Args:
        string: The text to clean up.

    Returns:
        The text with every "[[N]]" prefix of a "[[N]][M]" link removed.
    """
    # Raw strings keep "\s" and "\d" as regex escapes instead of (invalid)
    # Python string escapes.
    wordpress_link_regex = re.compile(
        r'\[\s*\[\s*\d+\s*\]\s*\]\s*'  # the redundant "[[1]]" part
        r'(\[\s*\d+\s*\])'             # the "[6]" reference we keep
    )

    # One pass over the text: replace each match by its captured reference.
    return wordpress_link_regex.sub(r'\1', string)
+
def convert(html_file_path):
with open(html_file_path) as html_file:
try:
@@ -67,6 +106,8 @@ def convert(html_file_path):
parser = HTML2Text()
text = parser.handle(soup.decode())
+ text = fix_wordpress_references_link(text)
+
return text
def main():