From dac22e627f3c716201556537849743387464c73d Mon Sep 17 00:00:00 2001
From: Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
Date: Tue, 26 Dec 2023 16:01:56 +0100
Subject: Initial import

Signed-off-by: Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
---
 .gitignore    |  27 +++++++
 Makefile      | 108 ++++++++++++++++++++++++++++
 README        |  47 ++++++++++++
 convert.py    | 226 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fix-dates.sh  |  60 ++++++++++++++++
 get_links.py  |  52 ++++++++++++++
 haunt.scm     |  27 +++++++
 lighttpd.conf |  31 ++++++++
 8 files changed, 578 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Makefile
 create mode 100644 README
 create mode 100755 convert.py
 create mode 100755 fix-dates.sh
 create mode 100755 get_links.py
 create mode 100644 haunt.scm
 create mode 100644 lighttpd.conf

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..94d3025
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,27 @@
+# Copyright (C) 2023 Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+html/**
+markdown/**
+site/**
+links.txt
+replicant_blog_page_0.html
+replicant_blog_page_1.html
+replicant_blog_page_2.html
+replicant_blog_page_3.html
+replicant_blog_page_4.html
+replicant_blog_page_5.html
+replicant_blog_page_6.html
+replicant_blog_page_7.html
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..4d50541
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,108 @@
+# Copyright (C) 2023 Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+CURL ?= curl
+BLOG_URL ?= https://blog.replicant.us/page
+CONVERT ?= guix shell python python-beautifulsoup4 python-html2text -- python3 convert.py
+
+SENTINEL =
+
+.PHONY: all build help serve
+
+PAGES = \
+	replicant_blog_page_7.html \
+	replicant_blog_page_6.html \
+	replicant_blog_page_5.html \
+	replicant_blog_page_4.html \
+	replicant_blog_page_3.html \
+	replicant_blog_page_2.html \
+	replicant_blog_page_1.html \
+	replicant_blog_page_0.html \
+	$(SENTINEL)
+
+all: help
+
+replicant_blog_page_%.html: # the stem $* is the page number; --fail rejects HTTP errors
+	$(CURL) --fail $(BLOG_URL)/$*/ -o $@
+
+links.txt: $(PAGES) # start from an empty file; drop it if any page fails to parse
+	@: > $@
+	@for f in $(PAGES) ; do \
+		echo "Processing $$f" >&2 ; \
+		guix shell -C python python-beautifulsoup4 -- \
+		python3 get_links.py $$f >> $@ || { rm -f $@ ; exit 1 ; } ; \
+	done
+
+# Download every post listed in links.txt into html/, skipping files that
+# already exist. The file name is the URL path with '/' replaced by '_'.
+# A failed download aborts the recipe instead of being silently ignored.
+html: # do not depend on links.txt as a human is supposed to review it manually
+	mkdir -p $@ && \
+	for url in `cat links.txt` ; do \
+		name=`echo "$${url}" | \
+			sed 's#https://blog.replicant.us/##' | \
+			sed 's#/$$##'| \
+			sed 's#/#_#g'` ; \
+		if [ ! -f "html/$${name}.html" ] ; then \
+			$(CURL) --fail "$${url}" -o "html/$${name}.html" || exit 1 ; \
+		fi ; \
+	done
+	# TODO: erase html directory if that fails for some reason.
+
+# Convert every post listed in links.txt from html/<name>.html to
+# markdown/<name>.md, where <name> is the URL path with the trailing '/'
+# removed and the remaining '/' replaced by '_'. The URL path is echoed
+# before each conversion. A failed conversion removes the partial .md
+# file and aborts the recipe instead of leaving a truncated post behind.
+# The names are quoted so that an odd URL cannot split into several words.
+markdown: html
+	mkdir -p $@ && \
+	for url in `cat links.txt` ; do \
+		path=`echo "$${url}" | \
+			sed 's#https://blog.replicant.us/##' | \
+			sed 's#/$$##'` ; \
+		name=`echo "$${path}" | sed 's#/#_#g'` ; \
+		echo "$${path}" ; \
+		$(CONVERT) "html/$${name}.html" > "markdown/$${name}.md" || \
+			{ rm -f "markdown/$${name}.md" ; exit 1 ; } ; \
+	done
+
+build:
+	haunt build
+
+help: # print a short description of the main targets
+	@printf "%s\n\t%s\n\t%s\n\t%s\n\t%s\n" \
+	"Available commands:" \
+	"links.txt # Create links.txt with the blog post links. For security" \
+	"          # reasons this file must be reviewed manually before use." \
+	"html      # Create html pages. To be done after reviewing links.txt." \
+	"markdown  # Convert the downloaded html pages to markdown."
+
+serve: # HAUNT_PORT may be set on the command line; falls back to port 8080
+	haunt serve -w -p $(or $(HAUNT_PORT),8080)
+
+# Pack site/ (renamed to web/), pages/img/ (to web/img/) and index.html.
+website.tar.gz: build
+	tar \
+		--exclude-vcs \
+		--format=gnu \
+		--owner=0 \
+		--group=0 \
+		--numeric-owner \
+		--sort=name \
+		-czf $@ \
+		site pages/img index.html \
+		--transform="s#^site#web#" \
+		--transform="s#^pages/img/#web/img/#"
diff --git a/README b/README
new file mode 100644
index 0000000..1d9e14f
--- /dev/null
+++ b/README
@@ -0,0 +1,47 @@
+Introduction
+============
+This is a set of scripts that can be used to do a semi-automatic
+migration from WordPress to haunt, a static website generator.
+
+The code is a bit fragile and parses untrusted data, so a human is
+required to check for suspicious data at several steps in the
+migration process.
+
+Usage
+=====
+
+First run 'make links.txt' manually to produce the links.txt file.
+Then you need to manually inspect it to see if it contains dangerous
+or problematic characters / text.
+
+Then you need to run 'make markdown' to download all the blog
+posts in html/ and convert them to markdown (and store them in
+markdown/).
+
+At this point the dates inside the files need to be fixed because at
+the time of writing haunt doesn't support the same date format as
+WordPress yet.
+
+Since the dates are again untrusted data, a human needs to review them
+with the './fix-dates.sh preview markdown/*' command.
+
+If no strange data was found they then need to be converted with
+'./fix-dates.sh fix markdown/*'.
+
+At this stage the website is ready to be used with haunt. You can
+then build it with 'make build' or serve it with 'make serve'.
+
+License
+=======
+This project is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or (at
+your option) any later version.
+
+This project is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program.  If not, see <https://www.gnu.org/licenses/>.
\ No newline at end of file
diff --git a/convert.py b/convert.py
new file mode 100755
index 0000000..776fe5b
--- /dev/null
+++ b/convert.py
@@ -0,0 +1,226 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# Copyright (C) 2020-2022 Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+from bs4 import BeautifulSoup
+from html2text import config, HTML2Text
+
+try:
+    # This has been removed in more recent
+    # versions of python-html2text. See commit
+    # b361467894fb277563b4547ec9d4df49f5e0c6e3
+    # (b361467 Remove support for Python ≤ 3.4)
+    # in https://github.com/Alir3z4/html2text.git
+    from html2text.utils import wrapwrite
+except:
+    pass
+
+import os
+import re
+import sh
+import sys
+
+def usage(progname):
+    print("{} path/to/file.html".format(progname))
+    sys.exit(1)
+
+# A "[1]" in the html becomes "[[1]][6]" in text.
+# As we already use references at the end, a "[6]" would
+# be enough.
+def fix_wordpress_references_link(string):
+    """Collapse WordPress footnote links such as "[[1]][6]" into "[6]".
+
+    Every doubly bracketed number that is immediately followed by a
+    bracketed number (optional whitespace is allowed after each token)
+    is removed; the bracketed number that follows it is kept. The
+    rest of the string is returned unchanged.
+    """
+    # Raw strings are required here: '\s' and '\d' are invalid escape
+    # sequences in a normal string literal and trigger a warning.
+    whitespaces = r'\s*'
+    numbers = r'\d+'
+    open_square_bracket = re.escape('[') + whitespaces
+    close_square_bracket = re.escape(']') + whitespaces
+
+    # [ [ 1 ] ]
+    part_to_remove = \
+          open_square_bracket \
+        + open_square_bracket \
+        + numbers + whitespaces \
+        + close_square_bracket \
+        + close_square_bracket
+
+    # [ 6 ]
+    part_to_keep = \
+          open_square_bracket \
+        + numbers + whitespaces \
+        + close_square_bracket
+
+    wordpress_link_regex = part_to_remove + '(' + part_to_keep + ')'
+
+    # One pass: each match is replaced by its captured "[ 6 ]" part.
+    return re.sub(wordpress_link_regex, r'\1', string)
+
+def fix_alignment(string):
+    """Strip two leading spaces from each line; every line gets a linesep."""
+    def unindent(line):
+        return line[2:] if line.startswith('  ') else line
+
+    lines = string.split(os.linesep)
+    return ''.join(unindent(line) + os.linesep for line in lines)
+
+# Emacs breaks lists when doing a fill-paragraph to adjust a paragraph to the
+# maximum width so we make sure that there is at least one blank line before
+# the '*'
+def fix_lists(string):
+    new_string = ''
+    nr_lineseps_before_star = 0  # consecutive line separators seen just before c
+    for c in string:
+        if c == '*' and nr_lineseps_before_star == 1:  # '*' after exactly one separator
+            new_string += os.linesep
+
+        if c == os.linesep:  # NOTE(review): can only match if os.linesep is one character
+            nr_lineseps_before_star += 1
+        else:
+            nr_lineseps_before_star = 0
+
+        new_string += c
+    return new_string
+
+def convert(html_file_path):
+    """Return the 'entry-content' div of a WordPress html file as markdown."""
+    with open(html_file_path) as html_file:
+        # Read once so that the fallback parser gets the same input.
+        markup = html_file.read()
+
+    try:
+        soup = BeautifulSoup(markup, features="html5lib").article
+    except Exception:
+        try:
+            # For some reason the lxml parser isn't found with
+            # python-beautifulsoup4 4.9.3-3.0 on Parabola. It's
+            # probably better to use an html5 parser anyway as the
+            # Replicant blog (now?) uses the html doctype and the
+            # theme seems to include an html5.js file for the IE 9
+            # browser.
+            soup = BeautifulSoup(markup, features="lxml").article
+        except Exception:
+            # stdout carries the generated markdown, so report on stderr.
+            print("Cannot find html5lib or lxml parsers", file=sys.stderr)
+            sys.exit(1)
+
+    # Format the output to be compatible with mail conventions but make sure
+    # that the links are not split between two lines
+    config.INLINE_LINKS = False
+    config.PROTECT_LINKS = True
+    config.WRAP_LIST_ITEMS = True
+    config.BODY_WIDTH = 70
+
+    article = soup.find('div', class_='entry-content')
+    text = HTML2Text().handle(article.decode())
+    text = fix_wordpress_references_link(text)
+    text = fix_alignment(text)
+    return fix_lists(text)
+
+def _get_metadata(html_file_path, func):
+    """Parse html_file_path and return func(soup) for the whole document."""
+    with open(html_file_path) as html_file:
+        try:
+            soup = BeautifulSoup(html_file, features="html5lib")
+        except Exception:
+            try:
+                # Fall back to lxml when html5lib is not usable. The
+                # whole document is passed on (not just its <article>),
+                # like in the html5lib case: the callbacks look up
+                # soup.article and soup.title themselves.
+                soup = BeautifulSoup(html_file, features="lxml")
+            except Exception:
+                # stdout carries the generated markdown: use stderr.
+                print("Cannot find html5lib or lxml parsers", file=sys.stderr)
+                sys.exit(1)
+        return func(soup)
+
+def get_metadata(html_file_path):
+    """Return the "date:", "title:" and "tags:" header, ended by "---"."""
+    metadata = ""
+
+    def get_date(soup):
+        date_metadata = None
+        entries = soup.article.find_all('a')
+        for entry in entries:
+            date_elements = entry.find_all('time', class_='entry-date')
+            for date_element in date_elements:
+                if date_element.get('datetime', None):
+                    new_date = date_element['datetime']
+                    assert(date_metadata is None or date_metadata == new_date)
+                    date_metadata = new_date
+        return date_metadata
+
+    def get_tags(soup):
+        results = []
+        tags = soup.article.find_all('footer', class_='entry-meta')
+        assert(len(tags) == 1)
+        links = tags[0].find_all('a')
+        for link in links:
+            text = link.string
+            # link.string may be None (e.g. nested markup); join() needs str.
+            if text is not None and text != 'permalink':
+                results.append(text)
+
+        return ', '.join(results)
+
+    def get_title(soup):
+        title = soup.title.string
+        title = title.replace(os.linesep, '')
+        title = title.replace('\t', '')
+        # Drop everything from the first '|' on.
+        title = re.sub(r'\|.*', '', title)
+        return title.strip()
+
+    date_metadata = _get_metadata(html_file_path, get_date)
+    assert(date_metadata is not None)
+    metadata += "date: {}".format(date_metadata) + os.linesep
+
+    title_metadata = _get_metadata(html_file_path, get_title)
+    metadata += "title: {}".format(title_metadata) + os.linesep
+
+    tags_metadata = _get_metadata(html_file_path, get_tags)
+    if tags_metadata:
+        # Tags are part of the returned header: printing them separately
+        # could put them out of order with the text written by the caller.
+        metadata += "tags: {}".format(tags_metadata) + os.linesep
+
+    metadata += "---" + os.linesep
+
+    return metadata
+
+def main():
+    """Convert the html file given on the command line to stdout."""
+    if len(sys.argv) != 2:
+        usage(sys.argv[0])
+
+    html_file_path = sys.argv[1]
+    text = get_metadata(html_file_path) + convert(html_file_path)
+
+    try:
+        wrapwrite(text)
+    except NameError:
+        # wrapwrite could not be imported (recent html2text versions).
+        sys.stdout.write(text)
+
+if __name__ == '__main__':
+    main()
diff --git a/fix-dates.sh b/fix-dates.sh
new file mode 100755
index 0000000..74a5bd6
--- /dev/null
+++ b/fix-dates.sh
@@ -0,0 +1,60 @@
+#!/bin/sh
+
+progname="fix-dates.sh"
+
+usage()
+{
+    progname="$1"
+    printf "Usage: %s <preview|fix> [path-to-file [path-to-file [...]]]\n" \
+	   "${progname}"
+}
+
+get_date()
+{
+    file_path="$1"
+
+    # Print the value of each "date:" line of the metadata header, which
+    # ends at the first "---" line. Reading only the header avoids
+    # running wc on all of markdown/ for every file, works for files
+    # outside of markdown/ and ignores "date:" lines of the post body.
+    # \{0,1\} is the POSIX spelling of the GNU-only \? operator.
+    sed -n -e '/^---$/q' -e 's/^date: \{0,1\}//p' "${file_path}"
+}
+
+preview_file()
+{
+    file_path="$1"
+    date="$(get_date "${file_path}")"
+    echo "${file_path}":"${date}"
+}
+
+fix_file()
+{
+    file_path="$1"
+    date="$(get_date "${file_path}")"
+    [ -n "${date}" ] || { echo "${file_path}: no date found" >&2 ; return 1 ; }
+    new_date_format=$(date --date="${date}" '+%F %R') || return 1
+    # Only rewrite the header's date line; untrusted text never enters the sed script.
+    sed -i "1,/^---\$/ s/^date:.*/date: ${new_date_format}/" "${file_path}"
+}
+
+if [ $# -lt 2 ] ; then
+    usage "${progname}"
+    exit 64
+fi
+
+command="$1"
+shift 1
+
+if [ "${command}" = "preview" ] ; then
+    for file in "$@" ; do  # quoted so that paths with spaces stay intact
+	preview_file "${file}"
+    done
+elif [ "${command}" = "fix" ] ; then
+    for file in "$@" ; do  # quoted so that paths with spaces stay intact
+	fix_file "${file}"
+    done
+else
+    usage "${progname}"
+    exit 64
+fi
diff --git a/get_links.py b/get_links.py
new file mode 100755
index 0000000..c605b39
--- /dev/null
+++ b/get_links.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# Copyright (C) 2020-2023 Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+from bs4 import BeautifulSoup
+# from html2text import config, HTML2Text
+
+import os
+import re
+import sys
+
+def usage(progname):
+    print("Usage: {} path/to/file.html".format(progname))
+    sys.exit(1)
+
+def get_article_links(html_file_path):
+    """Print the href of each link found in an article's entry-title h1."""
+    with open(html_file_path) as html_file:
+        soup = BeautifulSoup(html_file, 'html.parser')
+
+    for entry in soup.find_all('article'):
+        for title in entry.find_all('h1', class_='entry-title'):
+            # href=True skips anchors without a href attribute, which
+            # would otherwise raise a KeyError below.
+            for link in title.find_all('a', href=True):
+                print(link['href'])
+    return None
+
+def main():
+    """Print the article links of the html file given as sole argument."""
+    if len(sys.argv) != 2:
+        usage(sys.argv[0])
+
+    # get_article_links() prints the links itself and returns None.
+    get_article_links(sys.argv[1])
+
+if __name__ == '__main__':
+    main()
diff --git a/haunt.scm b/haunt.scm
new file mode 100644
index 0000000..9577759
--- /dev/null
+++ b/haunt.scm
@@ -0,0 +1,27 @@
+;;; Copyright © 2023 Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
+;;;
+;;; This file is free software; you can redistribute it and/or modify
+;;; it under the terms of the GNU General Public License as published
+;;; by the Free Software Foundation; either version 3 of the License,
+;;; or (at your option) any later version.
+;;;
+;;; Haunt is distributed in the hope that it will be useful, but
+;;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;;; General Public License for more details.
+;;;
+;;; You should have received a copy of the GNU General Public License
+;;; along with this file.  If not, see <http://www.gnu.org/licenses/>.
+
+(use-modules (haunt builder blog)
+             (haunt site)
+             (haunt reader commonmark))
+
+(site #:title "Replicant"
+      #:domain "blog.replicant.us"
+      #:default-metadata
+      '((author . "Replicant contributors"))
+;;      #:file-filter untitled-file-filter
+      #:posts-directory "markdown"
+      #:readers (list commonmark-reader)
+      #:builders (list (blog)))
diff --git a/lighttpd.conf b/lighttpd.conf
new file mode 100644
index 0000000..b0917fb
--- /dev/null
+++ b/lighttpd.conf
@@ -0,0 +1,31 @@
+# Copyright (C) 2023 Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+server.bind             = "localhost"
+server.port		= 8086
+server.document-root	= "/home/gnutoo/work/projects/replicant/infrastructure/haunt/site/"
+dir-listing.activate	= "enable"
+index-file.names	= ( "index.html" )
+mimetype.assign		= (
+				".html" => "text/html",
+				".txt" => "text/plain",
+				".css" => "text/css",
+				".js" => "application/x-javascript",
+				".jpg" => "image/jpeg",
+				".jpeg" => "image/jpeg",
+				".gif" => "image/gif",
+				".png" => "image/png",
+				"" => "application/octet-stream"
+			)
-- 
cgit v1.2.3