Initial import

Signed-off-by: Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
author: Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org> 2023-12-26 16:01:56 +0100
committer: Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org> 2023-12-31 13:11:42 +0100
commit: dac22e627f3c716201556537849743387464c73d (patch)
tree: a79c1eaeadc0697f72075a2f88a0ab6188bd90cf /get_links.py
download: haunt-blog-dac22e627f3c716201556537849743387464c73d.tar.gz
haunt-blog-dac22e627f3c716201556537849743387464c73d.tar.bz2
haunt-blog-dac22e627f3c716201556537849743387464c73d.zip
1 files changed, 52 insertions, 0 deletions
diff --git a/get_links.py b/get_links.py
new file mode 100755
index 0000000..c605b39
--- /dev/null
+++ b/get_links.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# Copyright (C) 2020-2023 Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+from bs4 import BeautifulSoup
+# from html2text import config, HTML2Text
+
+import os
+import re
+import sys
+
+def usage(progname):
+    print("Usage: {} path/to/file.html".format(progname))
+    sys.exit(1)
+
+def get_article_links(html_file_path):
+    with open(html_file_path) as html_file:
+        soup = BeautifulSoup(html_file, 'html.parser')
+        # print(soup.prettify())
+        entries = soup.find_all('article') # class_='entry-title')
+        for entry in entries:
+            titles = entry.find_all('h1', class_='entry-title')
+            for title in titles:
+                links = title.find_all('a')
+                for link in links:
+                    print(link['href'])
+    return None
+
+def main():
+    if len(sys.argv) != 2:
+        usage(sys.argv[0])
+
+    html_file_path = sys.argv[1]
+
+    text = get_article_links(html_file_path)
+
+if __name__ == '__main__':  # run only when executed as a script, not on import
+    main()
author	Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>	2023-12-26 16:01:56 +0100
committer	Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>	2023-12-31 13:11:42 +0100
commit	dac22e627f3c716201556537849743387464c73d (patch)
tree	a79c1eaeadc0697f72075a2f88a0ab6188bd90cf /get_links.py
download	haunt-blog-dac22e627f3c716201556537849743387464c73d.tar.gz haunt-blog-dac22e627f3c716201556537849743387464c73d.tar.bz2 haunt-blog-dac22e627f3c716201556537849743387464c73d.zip