aboutsummaryrefslogtreecommitdiffstats
path: root/get_links.py
diff options
context:
space:
mode:
author Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org> 2023-12-26 16:01:56 +0100
committer Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org> 2023-12-31 13:11:42 +0100
commit dac22e627f3c716201556537849743387464c73d (patch)
tree a79c1eaeadc0697f72075a2f88a0ab6188bd90cf /get_links.py
download haunt-blog-dac22e627f3c716201556537849743387464c73d.tar.gz
haunt-blog-dac22e627f3c716201556537849743387464c73d.tar.bz2
haunt-blog-dac22e627f3c716201556537849743387464c73d.zip
Initial import
Signed-off-by: Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
Diffstat (limited to 'get_links.py')
-rwxr-xr-x get_links.py 52
1 files changed, 52 insertions, 0 deletions
diff --git a/get_links.py b/get_links.py
new file mode 100755
index 0000000..c605b39
--- /dev/null
+++ b/get_links.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# Copyright (C) 2020-2023 Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+from bs4 import BeautifulSoup
+# from html2text import config, HTML2Text
+
+import os
+import re
+import sys
+
+def usage(progname):
+    """Print the command-line synopsis and exit with status 1.
+
+    progname is the program name shown at the start of the usage line.
+    """
+    message = f"Usage: {progname} path/to/file.html"
+    print(message)
+    sys.exit(1)
+
+def get_article_links(html_file_path):
+    """Print the target of every article title link found in an HTML file.
+
+    The file at html_file_path is parsed with BeautifulSoup's 'html.parser'.
+    For each <article> element, every <a> located inside an
+    <h1 class="entry-title"> has its href attribute printed on its own line.
+    Anchors that have no href attribute are skipped instead of raising
+    KeyError.
+
+    Returns None; the links are only written to standard output.
+    """
+    with open(html_file_path) as html_file:
+        soup = BeautifulSoup(html_file, 'html.parser')
+
+    for entry in soup.find_all('article'):
+        for title in entry.find_all('h1', class_='entry-title'):
+            # href=True keeps only the anchors that carry an href attribute.
+            for link in title.find_all('a', href=True):
+                print(link['href'])
+    return None
+
+def main():
+    """Command-line entry point: print the article links of one HTML file.
+
+    Expects exactly one command-line argument, the path to the HTML file.
+    With any other argument count, usage() is called, which prints the
+    usage line and exits with status 1.
+    """
+    if len(sys.argv) != 2:
+        usage(sys.argv[0])
+
+    # get_article_links() prints the links itself and returns None, so
+    # there is no result worth keeping.
+    get_article_links(sys.argv[1])
+
+# Run main() only when this file is executed as a script, not when imported.
+if __name__ == '__main__':
+ main()