diff options
Diffstat (limited to 'convert.py')
| -rwxr-xr-x | convert.py | 25 |
1 files changed, 13 insertions, 12 deletions
@@ -93,16 +93,6 @@ def fix_lists(string):
 def fix_right_single_quotation_mark(string):
     return string.replace('’', '\'')
 
-def fix_dashes(string):
-    string = string.replace('—', '-') # em dash
-    string = string.replace('–', '-') # en dash
-    return string
-
-def fix_quotes(string):
-    string = string.replace('“', '"')
-    string = string.replace('”', '"')
-    return string
-
 # Some links are broken: they start in one line (like with
 # '<http://ftp-') and the next line
 # ('osl.osuosl.org/pub/replicant/images/replicant-4.0/0004/infos/changelog.txt>')
@@ -128,6 +118,18 @@ def fix_broken_links(string, protocol):
         prev_line = line
     return os.linesep.join(lines)
 
+def replace_invalid_characters(string):
+    # Without that, haunt fails with the following error:
+    # ERROR: In procedure substring:
+    # Value out of range 146 to< 152: 154
+    string = string.replace('…', '...')
+
+    # Without that, haunt fails with the following error:
+    # ERROR: In procedure substring:
+    # Value out of range 0 to< 45: 46
+    string = string.replace('—', '-')
+    return string
+
 def convert(html_file_path):
     with open(html_file_path) as html_file:
         soup = BeautifulSoup(html_file, features="html5lib").article
@@ -148,10 +150,9 @@ def convert(html_file_path):
     text = fix_alignment(text)
     text = fix_lists(text)
     text = fix_right_single_quotation_mark(text)
-    text = fix_dashes(text)
     text = fix_broken_links(text, 'http')
     text = fix_broken_links(text, 'https')
-    text = fix_quotes(text)
+    text = replace_invalid_characters(text)
 
     return text
 
