#!/usr/bin/env python3
# encoding: utf-8
# Copyright (C) 2020, 2024 Denis 'GNUtoo' Carikli
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Print the title and the post content of an HTML file.

The title is the text of the element whose id is 'title'; the post content
is the pretty-printed inner HTML of the element whose id is 'post-content',
with the line breaks inside <p> paragraphs replaced by spaces.
"""

import enum
import os
import re
import sys

try:
    import bs4
except ImportError:
    # Reported by main(), so that the helpers below stay importable (and
    # testable) on a machine without BeautifulSoup.
    bs4 = None

# Matches an opening or closing <p> tag, with or without attributes, but not
# other tags starting with 'p' such as <pre>. Group 1 is '/' for a closing
# tag and the empty string for an opening one.
_P_TAG_RE = re.compile(r'<(/?)p(?:\s[^>]*)?>')


def usage(progname):
    """Print the command line synopsis and exit with status 1."""
    print("{} path/to/file.html".format(progname))
    sys.exit(1)


def _join_paragraph_lines(html: str) -> str:
    """Return html with the newlines inside <p> elements turned into spaces.

    Newlines located outside of any <p> element are left untouched.
    """
    pieces = []
    depth = 0  # Number of <p> elements currently open.
    pos = 0

    for tag in _P_TAG_RE.finditer(html):
        # The text before a tag, and the tag itself, belong to whatever was
        # open before that tag: the depth only changes once the tag is over.
        chunk = html[pos:tag.end()]
        pieces.append(chunk.replace('\n', ' ') if depth > 0 else chunk)

        if tag.group(1):
            # Never go below zero: an unmatched </p> must not prevent the
            # paragraphs that follow from being joined.
            depth = max(depth - 1, 0)
        else:
            depth += 1
        pos = tag.end()

    tail = html[pos:]
    pieces.append(tail.replace('\n', ' ') if depth > 0 else tail)

    return ''.join(pieces)


def fixup_html(html):
    """Print html on stdout, with each <p> paragraph joined on one line.

    Every newline found between an opening <p> tag (with or without
    attributes) and its closing </p> is printed as a single space. The rest
    of html is printed as-is and no final newline is added.
    """
    print(_join_paragraph_lines(html), end='')


def _find_by_id(soup, element_id):
    """Return the element of soup with the given id, or exit with status 1."""
    element = soup.find(id=element_id)
    if element is None:
        print("error: no element with id '{}' found".format(element_id),
              file=sys.stderr)
        sys.exit(1)
    return element


def _strip_outer_tag(pretty_html: str) -> str:
    """Drop the first and last lines of pretty_html.

    The result always ends with a newline. prettify() separates its lines
    with '\\n' on every platform, so os.linesep must not be used here.
    """
    # A trailing newline, if any, would otherwise count as an empty last
    # line and the closing tag would be kept.
    lines = pretty_html.rstrip('\n').split('\n')
    return '\n'.join(lines[1:-1]) + '\n'


def main():
    """Parse the HTML file given on the command line and print its content.

    Exits with status 1 when the arguments are wrong, when BeautifulSoup is
    not installed or when the file lacks one of the expected elements.
    """
    if len(sys.argv) != 2:
        usage(sys.argv[0])

    if bs4 is None:
        print("error: the 'bs4' (BeautifulSoup) module is required",
              file=sys.stderr)
        sys.exit(1)

    html_file_path = sys.argv[1]

    # NOTE(review): the file is decoded with the locale's default encoding;
    # pass encoding= here if the input files are known to use a fixed one.
    with open(html_file_path) as html_file:
        soup = bs4.BeautifulSoup(html_file, 'html.parser')

    #################
    # Article title #
    #################
    article_title = _find_by_id(soup, 'title').text
    print("Title: {}".format(article_title))

    ################
    # Post content #
    ################
    post_content = _find_by_id(soup, 'post-content').extract()
    pretty_html = post_content.prettify()

    # Remove the outer <div> and </div>
    div_content = _strip_outer_tag(pretty_html)

    print("Post content:")
    print('-' * 38 + ' %< ' + '-' * 38)
    fixup_html(div_content)
    print('-' * 38 + ' >% ' + '-' * 38)


if __name__ == '__main__':
    main()