about summary refs log tree commit diff stats
path: root/convert.py
diff options
context:
space:
mode:
Diffstat (limited to 'convert.py')
-rwxr-xr-x convert.py 25
1 files changed, 13 insertions, 12 deletions
diff --git a/convert.py b/convert.py
index dd51a60..ab8b7e2 100755
--- a/convert.py
+++ b/convert.py
@@ -93,16 +93,6 @@ def fix_lists(string):
def fix_right_single_quotation_mark(string):
return string.replace('’', '\'')
-def fix_dashes(string):
- string = string.replace('—', '-') # em dash
- string = string.replace('–', '-') # en dash
- return string
-
-def fix_quotes(string):
- string = string.replace('“', '"')
- string = string.replace('”', '"')
- return string
-
# Some links are broken: they start in one line (like with
# '<http://ftp-') and the next line
# ('osl.osuosl.org/pub/replicant/images/replicant-4.0/0004/infos/changelog.txt>')
@@ -128,6 +118,18 @@ def fix_broken_links(string, protocol):
prev_line = line
return os.linesep.join(lines)
+def replace_invalid_characters(string):
+ # Without that, haunt fails with the following error:
+ # ERROR: In procedure substring:
+ # Value out of range 146 to< 152: 154
+ string = string.replace('…', '...')
+
+ # Without that, haunt fails with the following error:
+ # ERROR: In procedure substring:
+ # Value out of range 0 to< 45: 46
+ string = string.replace('—', '-')
+ return string
+
def convert(html_file_path):
with open(html_file_path) as html_file:
soup = BeautifulSoup(html_file, features="html5lib").article
@@ -148,10 +150,9 @@ def convert(html_file_path):
text = fix_alignment(text)
text = fix_lists(text)
text = fix_right_single_quotation_mark(text)
- text = fix_dashes(text)
text = fix_broken_links(text, 'http')
text = fix_broken_links(text, 'https')
- text = fix_quotes(text)
+ text = replace_invalid_characters(text)
return text