diff options
| author | Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org> | 2023-12-26 16:01:56 +0100 |
|---|---|---|
| committer | Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org> | 2023-12-31 13:11:42 +0100 |
| commit | dac22e627f3c716201556537849743387464c73d (patch) | |
| tree | a79c1eaeadc0697f72075a2f88a0ab6188bd90cf /get_links.py | |
| download | haunt-blog-dac22e627f3c716201556537849743387464c73d.tar.gz haunt-blog-dac22e627f3c716201556537849743387464c73d.tar.bz2 haunt-blog-dac22e627f3c716201556537849743387464c73d.zip | |
Initial import
Signed-off-by: Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
Diffstat (limited to 'get_links.py')
| -rwxr-xr-x | get_links.py | 52 |
1 files changed, 52 insertions, 0 deletions
#!/usr/bin/env python3
# encoding: utf-8
#
# Copyright (C) 2020-2023 Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.

from bs4 import BeautifulSoup
# from html2text import config, HTML2Text

import os
import re
import sys


def usage(progname):
    """Print a one-line usage message for *progname* and exit with status 1."""
    print("Usage: {} path/to/file.html".format(progname))
    sys.exit(1)


def get_article_links(html_file_path):
    """Print the link target of each article title found in an HTML file.

    The file is parsed with BeautifulSoup's 'html.parser'. For every
    <article> element, each <h1 class="entry-title"> inside it is searched
    for <a> elements, and the 'href' value of each one is printed on its
    own line, in document order.

    Args:
        html_file_path: Path of the HTML file to read.

    Returns:
        None. The links are written to standard output.
    """
    with open(html_file_path) as html_file:
        # BeautifulSoup consumes the whole file here, so the tree can be
        # walked after the file has been closed.
        soup = BeautifulSoup(html_file, 'html.parser')

    for entry in soup.find_all('article'):
        for title in entry.find_all('h1', class_='entry-title'):
            # href=True keeps only anchors that carry an href attribute;
            # indexing link['href'] on one without it raises KeyError.
            for link in title.find_all('a', href=True):
                print(link['href'])
    return None


def main():
    """Check the command line and print the article links of the given file."""
    # Exactly one argument (the HTML file path) is expected; usage() exits.
    if len(sys.argv) != 2:
        usage(sys.argv[0])

    get_article_links(sys.argv[1])


if __name__ == '__main__':
    main()
