aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--.gitignore27
-rw-r--r--Makefile108
-rw-r--r--README47
-rwxr-xr-xconvert.py226
-rwxr-xr-xfix-dates.sh60
-rwxr-xr-xget_links.py52
-rw-r--r--haunt.scm27
-rw-r--r--lighttpd.conf31
8 files changed, 578 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..94d3025
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,27 @@
+# Copyright (C) 2023 Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+# Generated output directories.
+html/**
+markdown/**
+site/**
+# Link list produced by 'make links.txt'.
+links.txt
+# Blog index pages downloaded by the replicant_blog_page_%.html Makefile
+# rule; a glob keeps this list valid whatever the number of pages is.
+replicant_blog_page_*.html
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..4d50541
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,108 @@
+# Copyright (C) 2023 Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+# Tools and locations. They are assigned with ?= so that they can be
+# overridden from the environment or from the command line.
+CURL ?= curl
+BLOG_URL ?= https://blog.replicant.us/page
+CONVERT ?= guix shell python python-beautifulsoup4 python-html2text -- python3 convert.py
+
+# Always empty: it only terminates lists whose entries all end with a
+# backslash.
+SENTINEL =
+
+.PHONY: all build help serve
+
+# Blog index pages to download, in the order in which they are processed.
+PAGE_NUMBERS = 7 6 5 4 3 2 1 0
+PAGES = \
+	$(foreach n,$(PAGE_NUMBERS),replicant_blog_page_$(n).html) \
+	$(SENTINEL)
+
+# Running make without any target only prints the help.
+all: help
+
+# Download one blog index page. The stem ($*) is what follows
+# 'replicant_blog_page_' in the target name and is used as the page number
+# in the URL.
+replicant_blog_page_%.html:
+	$(CURL) $(BLOG_URL)/$*/ -o $@
+
+# Collect the blog post links found in every downloaded index page.
+# The list is built in a temporary file which is only renamed to links.txt
+# once every page has been processed successfully, so a failure never
+# leaves a truncated links.txt that looks up to date.
+links.txt: $(PAGES)
+	@: > $@.tmp
+	@for f in $(PAGES) ; do \
+		echo "Processing $$f" >&2 ; \
+		guix shell -C python python-beautifulsoup4 -- \
+			python3 get_links.py $$f >> $@.tmp || \
+			{ rm -f $@.tmp ; exit 1 ; } ; \
+	done
+	@mv -f $@.tmp $@
+
+# Download every blog post listed in links.txt into html/.
+# The file name of a post is its URL without the blog prefix and the
+# trailing '/', with the remaining '/' replaced by '_'.
+# Posts that were already downloaded are skipped. A failed download does
+# not stop the loop but makes the recipe fail at the end.
+# TODO: erase html directory if that fails for some reason.
+html: # do not depend on links.txt as a human is supposed to review it manually
+	@test -f links.txt || { \
+		echo "links.txt not found: run 'make links.txt' and review it first." >&2 ; \
+		exit 1 ; \
+	}
+	mkdir -p $@ || exit 1 ; \
+	status=0 ; \
+	for url in `cat links.txt` ; do \
+		file=html/`echo "$${url}" | \
+			sed -e 's#https://blog.replicant.us/##' \
+			    -e 's#/$$##' \
+			    -e 's#/#_#g'`.html ; \
+		if [ ! -f "$${file}" ] ; then \
+			$(CURL) "$${url}" -o "$${file}" || status=1 ; \
+		fi ; \
+	done ; \
+	exit $${status}
+
+# Convert every downloaded blog post from html/ to markdown/.
+# The path of each post (its URL without the blog prefix and the trailing
+# '/') is printed as a progress indication. A failed conversion is
+# reported on stderr and makes the recipe fail once all the posts have
+# been processed.
+markdown: html
+	mkdir -p $@ || exit 1 ; \
+	status=0 ; \
+	for url in `cat links.txt` ; do \
+		post=`echo "$${url}" | \
+			sed -e 's#https://blog.replicant.us/##' \
+			    -e 's#/$$##'` ; \
+		name=`echo "$${post}" | sed 's#/#_#g'` ; \
+		echo "$${post}" ; \
+		$(CONVERT) "html/$${name}.html" > "markdown/$${name}.md" || { \
+			echo "Failed to convert $${url}" >&2 ; \
+			status=1 ; \
+		} ; \
+	done ; \
+	exit $${status}
+
+# Run 'haunt build' in the current directory.
+build:
+	haunt build
+
+# Print the list of the targets meant to be run by hand.
+help:
+	@printf "%s\n" "Available commands:"
+	@printf "\t%s\n" \
+		"links.txt       # Create links.txt file with blog post links." \
+		"                # For security reasons this file needs to be" \
+		"                # manually reviewed before usage." \
+		"html            # Create html pages. To be done after reviewing links.txt." \
+		"markdown        # Convert the html pages to markdown." \
+		"build           # Build the website with haunt." \
+		"serve           # Serve the website with haunt on port HAUNT_PORT." \
+		"website.tar.gz  # Create a tarball of the built website."
+
+# Port given to 'haunt serve'. It had no default value, which left '-p'
+# without any argument; override it with 'make serve HAUNT_PORT=<port>'.
+HAUNT_PORT ?= 8080
+
+# Serve the website with haunt, with its -w (watch) option enabled.
+serve:
+	haunt serve -w -p $(HAUNT_PORT)
+
+# Pack the built website in a gzip-compressed tarball. Inside the archive
+# 'site' is renamed to 'web' and 'pages/img/' to 'web/img/'; ownership is
+# forced to 0:0 and the members are sorted by name.
+website.tar.gz: build
+	tar \
+		--exclude-vcs \
+		--format=gnu \
+		--owner=0 --group=0 --numeric-owner \
+		--sort=name \
+		--transform="s#^site#web#" \
+		--transform="s#^pages/img/#web/img/#" \
+		-czf $@ \
+		site pages/img index.html
diff --git a/README b/README
new file mode 100644
index 0000000..1d9e14f
--- /dev/null
+++ b/README
@@ -0,0 +1,47 @@
+Introduction
+============
+This is a set of scripts that can be used to do a semi-automatic
+migration from WordPress to haunt, a static website generator.
+
+The code is a bit fragile and parses untrusted data, so a human is
+required to check for suspicious data at several steps in the
+migration process.
+
+Usage
+=====
+
+First run 'make links.txt' manually to produce the links.txt file.
+Then you need to manually inspect it to see if it contains dangerous
+or problematic characters / text.
+
+Then you need to run 'make markdown' to download all the blog
+posts in html/ and convert them to markdown (and store them in
+markdown/).
+
+At this point the dates inside the files need to be fixed because at
+the time of writing haunt doesn't support the same date format as
+WordPress yet.
+
+Since the dates are again untrusted data, a human needs to review them
+with the './fix-dates.sh preview markdown/*' command.
+
+If no strange data was found they then need to be converted with
+'./fix-dates.sh fix markdown/*'.
+
+At this stage the website is ready to be used with haunt. You can
+then build it with 'make build' or serve it with 'make serve'.
+
+License
+=======
+This project is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or (at
+your option) any later version.
+
+This project is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <https://www.gnu.org/licenses/>. \ No newline at end of file
diff --git a/convert.py b/convert.py
new file mode 100755
index 0000000..776fe5b
--- /dev/null
+++ b/convert.py
@@ -0,0 +1,226 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# Copyright (C) 2020-2022 Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+from bs4 import BeautifulSoup
+from html2text import config, HTML2Text
+
+try:
+ # This has been removed in more recent
+ # versions of python-html2text. See commit
+ # b361467894fb277563b4547ec9d4df49f5e0c6e3
+ # (b361467 Remove support for Python ≤ 3.4)
+ # in https://github.com/Alir3z4/html2text.git
+ from html2text.utils import wrapwrite
+except:
+ pass
+
+import os
+import re
+import sh
+import sys
+
+def usage(progname):
+ print("{} path/to/file.html".format(progname))
+ sys.exit(1)
+
+# A "[1]" in the html becomes "[[1]][6]" in text.
+# As we already uses references at the end a [6] would
+# be enough.
+def fix_wordpress_references_link(string):
+ open_square_bracket = re.escape('[')
+ close_square_bracket = re.escape(']')
+ whitespaces = '\s*'
+ numbers = '\d+'
+
+ # [ [ 1 ] ] [ 6 ]
+ wordpress_link_regex = \
+ \
+ open_square_bracket + whitespaces \
+ + open_square_bracket + whitespaces \
+ + numbers + whitespaces \
+ + close_square_bracket + whitespaces \
+ + close_square_bracket + whitespaces \
+ \
+ + open_square_bracket + whitespaces \
+ + numbers + whitespaces \
+ + close_square_bracket + whitespaces \
+
+ results = re.findall(wordpress_link_regex, string)
+
+ part_to_remove = '^' \
+ + open_square_bracket + whitespaces \
+ + open_square_bracket + whitespaces \
+ + numbers + whitespaces \
+ + close_square_bracket + whitespaces \
+ + close_square_bracket + whitespaces \
+
+ for result in results:
+ replacement = re.sub(part_to_remove, '', result)
+ string = string.replace(result, replacement)
+
+ return string
+
+def fix_alignment(string):
+ new_string = ""
+ for line in string.split(os.linesep):
+ new_line = re.sub('^ ', '', line)
+ new_string += (new_line + os.linesep)
+
+ return new_string
+
+# Emacs breaks lists when doing a fill-paragraph to adjust a paragraph to the
+# maximum width so we make sure that there is at least one blank line before
+# the '*'
+def fix_lists(string):
+ new_string = ''
+ nr_lineseps_before_star = 0
+ for c in string:
+ if c == '*' and nr_lineseps_before_star == 1:
+ new_string += os.linesep
+
+ if c == os.linesep:
+ nr_lineseps_before_star += 1
+ else:
+ nr_lineseps_before_star = 0
+
+ new_string += c
+ return new_string
+
+def convert(html_file_path):
+ with open(html_file_path) as html_file:
+ try:
+ soup = BeautifulSoup(html_file, features="html5lib").article
+ except:
+ try:
+ # For some reason the lxml parser isn't found with
+ # python-beautifulsoup4 4.9.3-3.0 on Parabola. It's
+ # probably better to use an html5 parser anyway as the
+ # Replicant blog (now?) uses the html doctype and the
+ # theme seems to include an html5.js file for the IE 9
+ # browser.
+ soup = BeautifulSoup(html_file, features="lxml").article
+ except:
+ print("Cannot find html5lib or lxml parsers")
+ sys.exit(1)
+
+ # Format the output to be compatible with mail conventions but make sure
+ # that the links are not split between two lines
+ config.INLINE_LINKS = False
+ config.PROTECT_LINKS = True
+ config.WRAP_LIST_ITEMS = True
+ config.BODY_WIDTH = 70
+
+ parser = HTML2Text()
+
+ article = soup.find('div', class_='entry-content')
+ text = parser.handle(article.decode())
+
+ text = fix_wordpress_references_link(text)
+ text = fix_alignment(text)
+ text = fix_lists(text)
+
+ return text
+
+def _get_metadata(html_file_path, func):
+ with open(html_file_path) as html_file:
+ try:
+ soup = BeautifulSoup(html_file, features="html5lib")
+ except:
+ try:
+ # For some reason the lxml parser isn't found with
+ # python-beautifulsoup4 4.9.3-3.0 on Parabola. It's
+ # probably better to use an html5 parser anyway as the
+ # Replicant blog (now?) uses the html doctype and the
+ # theme seems to include an html5.js file for the IE 9
+ # browser.
+ soup = BeautifulSoup(html_file, features="lxml").article
+ except:
+ print("Cannot find html5lib or lxml parsers")
+ sys.exit(1)
+ return func(soup)
+
+def get_metadata(html_file_path):
+ metadata = ""
+
+ def get_date(soup):
+ date_metadata = None
+ entries = soup.article.find_all('a')
+ for entry in entries:
+ date_elements = entry.find_all('time', class_='entry-date')
+ for date_element in date_elements:
+ if date_element.get('datetime', None):
+ new_date = date_element['datetime']
+ assert(date_metadata == None or date_metadata == new_date)
+ date_metadata = new_date
+ return date_metadata
+
+ def get_tags(soup):
+ results = []
+ tags = soup.article.find_all('footer', class_='entry-meta')
+ assert(len(tags) == 1)
+ links = tags[0].find_all('a')
+ for link in links:
+ text = link.string
+ if text != 'permalink':
+ results.append(text)
+
+ return ', '.join(results)
+
+ def get_title(soup):
+ title = soup.title.string
+ title = title.replace(os.linesep, '')
+ title = title.replace('\t', '')
+ title = re.sub('\|.*', '', title)
+ title = title.lstrip().rstrip()
+ return title
+
+ date_metadata = _get_metadata(html_file_path, get_date)
+ assert(date_metadata != None)
+ metadata += "date: {}".format(date_metadata) + os.linesep
+
+ title_metadata = _get_metadata(html_file_path, get_title)
+ # assert(title_metadata != None)
+ metadata += "title: {}".format(title_metadata) + os.linesep
+
+ # assert(title_metadata != None)
+ # metadata += "title: {}".format(title_metadata)
+
+ tags_metadata = _get_metadata(html_file_path, get_tags)
+ if tags_metadata:
+ print("tags: {}".format(tags_metadata))
+
+ metadata += "---" + os.linesep
+
+ return metadata
+
+def main():
+ if len(sys.argv) != 2:
+ usage(sys.argv[0])
+
+ html_file_path = sys.argv[1]
+
+ text = get_metadata(html_file_path)
+ text += convert(html_file_path)
+
+ try:
+ wrapwrite(text)
+ except:
+ sys.stdout.write(text)
+
+if __name__ == '__main__':
+ main()
diff --git a/fix-dates.sh b/fix-dates.sh
new file mode 100755
index 0000000..74a5bd6
--- /dev/null
+++ b/fix-dates.sh
@@ -0,0 +1,60 @@
+#!/bin/sh
+
+progname="fix-dates.sh"
+
+# Print the command line syntax, with $1 used as program name.
+usage()
+{
+	printf "Usage: %s <preview|fix> [path-to-file [path-to-file [...]]]\n" "$1"
+}
+
+get_date()
+{
+ file_path="$1"
+
+ max_lines=$(wc -l markdown/* | \
+ sort -n | tail -n2 | head -n +1 | \
+ awk '{print $1}')
+ date=$(grep -h -B "${max_lines}" '^---$' "${file_path}" | \
+ grep '^date:' | sed 's/^date: \?//')
+ echo "${date}"
+}
+
+# Print '<path>:<date>' for the file $1 so that the date can be reviewed.
+preview_file()
+{
+	file_path="$1"
+	date="$(get_date "${file_path}")"
+	# printf is used instead of echo because some echo implementations
+	# interpret the backslash sequences found in the (untrusted) data,
+	# which would alter what is shown to the reviewer.
+	printf '%s:%s\n' "${file_path}" "${date}"
+}
+
+fix_file()
+{
+ file_path="$1"
+ date="$(get_date "${file_path}")"
+
+ new_date_format=$(date --date="${date}" '+%F %R')
+
+ sed "s/${date}/${new_date_format}/g" -i "${file_path}"
+}
+
+# Command line handling: the first argument is the command and the
+# remaining ones are the files to process.
+if [ $# -lt 2 ] ; then
+	usage "${progname}"
+	exit 64
+fi
+
+command="$1"
+shift 1
+
+# Set to 1 as soon as one file could not be processed.
+status=0
+
+# "$@" is quoted so that paths with spaces or glob characters are passed
+# unchanged.
+case "${command}" in
+	preview)
+		for file in "$@" ; do
+			preview_file "${file}" || status=1
+		done
+		;;
+	fix)
+		for file in "$@" ; do
+			fix_file "${file}" || status=1
+		done
+		;;
+	*)
+		usage "${progname}"
+		exit 64
+		;;
+esac
+
+exit "${status}"
diff --git a/get_links.py b/get_links.py
new file mode 100755
index 0000000..c605b39
--- /dev/null
+++ b/get_links.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# Copyright (C) 2020-2023 Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+from bs4 import BeautifulSoup
+# from html2text import config, HTML2Text
+
+import os
+import re
+import sys
+
+def usage(progname):
+ print("Usage: {} path/to/file.html".format(progname))
+ sys.exit(1)
+
+def get_article_links(html_file_path):
+ with open(html_file_path) as html_file:
+ soup = BeautifulSoup(html_file, 'html.parser')
+ # print(soup.prettify())
+ entries = soup.find_all('article') # class_='entry-title')
+ for entry in entries:
+ titles = entry.find_all('h1', class_='entry-title')
+ for title in titles:
+ links = title.find_all('a')
+ for link in links:
+ print(link['href'])
+ return None
+
+def main():
+ if len(sys.argv) != 2:
+ usage(sys.argv[0])
+
+ html_file_path = sys.argv[1]
+
+ text = get_article_links(html_file_path)
+
+if __name__ == '__main__':
+ main()
diff --git a/haunt.scm b/haunt.scm
new file mode 100644
index 0000000..9577759
--- /dev/null
+++ b/haunt.scm
@@ -0,0 +1,27 @@
+;;; Copyright © 2023 Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
+;;;
+;;; This file is free software; you can redistribute it and/or modify
+;;; it under the terms of the GNU General Public License as published
+;;; by the Free Software Foundation; either version 3 of the License,
+;;; or (at your option) any later version.
+;;;
+;;; Haunt is distributed in the hope that it will be useful, but
+;;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;;; General Public License for more details.
+;;;
+;;; You should have received a copy of the GNU General Public License
+;;; along with this file. If not, see <http://www.gnu.org/licenses/>.
+
+;; Modules providing the blog builder, the site definition and the
+;; CommonMark reader used below.
+(use-modules (haunt builder blog)
+ (haunt site)
+ (haunt reader commonmark))
+
+;; Site definition: the posts are read from the "markdown" directory
+;; with the CommonMark reader and the site is produced by the blog
+;; builder. Posts get "Replicant contributors" as default author
+;; metadata.
+(site #:title "Replicant"
+ #:domain "blog.replicant.us"
+ #:default-metadata
+ '((author . "Replicant contributors"))
+;; #:file-filter untitled-file-filter
+ #:posts-directory "markdown"
+ #:readers (list commonmark-reader)
+ #:builders (list (blog)))
diff --git a/lighttpd.conf b/lighttpd.conf
new file mode 100644
index 0000000..b0917fb
--- /dev/null
+++ b/lighttpd.conf
@@ -0,0 +1,31 @@
+# Copyright (C) 2023 Denis 'GNUtoo' Carikli <GNUtoo@cyberdimension.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+# Listen on localhost, port 8086.
+server.bind = "localhost"
+server.port = 8086
+# NOTE(review): absolute path inside a specific home directory; it needs to
+# be adapted to the location of the 'site' directory on other setups.
+server.document-root = "/home/gnutoo/work/projects/replicant/infrastructure/haunt/site/"
+# Directory listings are enabled; index.html is the index file name.
+dir-listing.activate = "enable"
+index-file.names = ( "index.html" )
+# MIME types by file extension; the last entry ("") assigns
+# application/octet-stream.
+mimetype.assign = (
+ ".html" => "text/html",
+ ".txt" => "text/plain",
+ ".css" => "text/css",
+ ".js" => "application/x-javascript",
+ ".jpg" => "image/jpeg",
+ ".jpeg" => "image/jpeg",
+ ".gif" => "image/gif",
+ ".png" => "image/png",
+ "" => "application/octet-stream"
+ )