#!/usr/bin/env python3
# encoding: utf-8
#
# Copyright (C) 2020-2023 Denis 'GNUtoo' Carikli
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see https://www.gnu.org/licenses/.
"""Print the links found in the article titles of an HTML file."""

import os
import re
import sys

from bs4 import BeautifulSoup
# from html2text import config, HTML2Text


def usage(progname):
    """Print a one-line usage message and exit with status 1.

    Args:
        progname: Name of the program, shown at the start of the message.
    """
    print("Usage: {} path/to/file.html".format(progname))
    sys.exit(1)


def get_article_links(html_file_path):
    """Print the target of every link found in article titles.

    The file is parsed with Beautiful Soup's 'html.parser'. For each
    <article> element, every <h1 class="entry-title"> inside it is
    searched for <a> elements, and the 'href' of each one is printed on
    its own line, in document order.

    Args:
        html_file_path: Path of the HTML file to read.

    Returns:
        None. The links are written to standard output.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Read raw bytes so that Beautiful Soup detects the document encoding
    # itself instead of depending on the locale's default text encoding.
    with open(html_file_path, 'rb') as html_file:
        soup = BeautifulSoup(html_file, 'html.parser')

    for entry in soup.find_all('article'):
        for title in entry.find_all('h1', class_='entry-title'):
            for link in title.find_all('a'):
                # Anchors without an href (e.g. named anchors) have no
                # target to print; skip them rather than raise KeyError.
                href = link.get('href')
                if href is not None:
                    print(href)


def main():
    """Check the command line and print the article links of the given file.

    Exactly one argument (the path of the HTML file) is expected;
    otherwise the usage message is printed and the program exits.
    """
    if len(sys.argv) != 2:
        usage(sys.argv[0])

    get_article_links(sys.argv[1])


if __name__ == '__main__':
    main()