From dac22e627f3c716201556537849743387464c73d Mon Sep 17 00:00:00 2001
From: Denis 'GNUtoo' Carikli
Date: Tue, 26 Dec 2023 16:01:56 +0100
Subject: Initial import

Signed-off-by: Denis 'GNUtoo' Carikli
---
 .gitignore    |  27 +++++++
 Makefile      | 118 ++++++++++++++++++++++++++++
 README        |  47 ++++++++++++
 convert.py    | 226 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fix-dates.sh  |  60 ++++++++++++++++
 get_links.py  |  52 ++++++++++++++
 haunt.scm     |  27 +++++++
 lighttpd.conf |  31 ++++++++
 8 files changed, 588 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Makefile
 create mode 100644 README
 create mode 100755 convert.py
 create mode 100755 fix-dates.sh
 create mode 100755 get_links.py
 create mode 100644 haunt.scm
 create mode 100644 lighttpd.conf

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..94d3025
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,27 @@
+# Copyright (C) 2023 Denis 'GNUtoo' Carikli
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+html/**
+markdown/**
+site/**
+links.txt
+replicant_blog_page_0.html
+replicant_blog_page_1.html
+replicant_blog_page_2.html
+replicant_blog_page_3.html
+replicant_blog_page_4.html
+replicant_blog_page_5.html
+replicant_blog_page_6.html
+replicant_blog_page_7.html
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..4d50541
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,118 @@
+# Copyright (C) 2023 Denis 'GNUtoo' Carikli
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+CURL ?= curl
+BLOG_URL ?= https://blog.replicant.us/page
+CONVERT ?= guix shell python python-beautifulsoup4 python-html2text -- python3 convert.py
+# Port passed to 'haunt serve' by 'make serve'; override with HAUNT_PORT=<port>.
+HAUNT_PORT ?= 8080
+
+SENTINEL =
+
+.PHONY: all build help serve
+
+# Do not keep a half-written links.txt or index page when a recipe fails.
+.DELETE_ON_ERROR:
+
+PAGES = \
+	replicant_blog_page_7.html \
+	replicant_blog_page_6.html \
+	replicant_blog_page_5.html \
+	replicant_blog_page_4.html \
+	replicant_blog_page_3.html \
+	replicant_blog_page_2.html \
+	replicant_blog_page_1.html \
+	replicant_blog_page_0.html \
+	$(SENTINEL)
+
+all: help
+
+# Download one index page of the blog; the stem ($*) is the page number.
+replicant_blog_page_%.html:
+	$(CURL) $(BLOG_URL)/$*/ -o $@
+
+# Collect the post URLs found in the index pages. Stop at the first failure
+# so that an incomplete list is never left behind for review.
+links.txt: $(PAGES)
+	@: > $@
+	@for f in $(PAGES) ; do \
+		echo "Processing $$f" >&2 ; \
+		guix shell -C python python-beautifulsoup4 -- \
+			python3 get_links.py $$f >> $@ || exit 1 ; \
+	done
+
+html: # do not depend on links.txt as a human is supposed to review it manually
+	mkdir -p $@ && \
+	for url in `cat links.txt` ; do \
+		if [ ! -f html/`echo "$${url}" | \
+			sed 's#https://blog.replicant.us/##' | \
+			sed 's#/$$##'| \
+			sed 's#/#_#g'`.html ] ; then \
+			$(CURL) "$${url}" -o \
+				html/`echo "$${url}" | \
+				sed 's#https://blog.replicant.us/##' | \
+				sed 's#/$$##'| \
+				sed 's#/#_#g'`.html ; \
+		fi \
+	done
+# TODO: erase html directory if that fails for some reason.
+
+# Convert every post listed in links.txt from html/ into markdown/.
+markdown: html
+	mkdir -p $@ && \
+	for url in `cat links.txt` ; do \
+		echo "$${url}" | \
+		sed 's#https://blog.replicant.us/##' | \
+		sed 's#/$$##' ; \
+		$(CONVERT) \
+			html/`echo "$${url}" | \
+			sed 's#https://blog.replicant.us/##' | \
+			sed 's#/$$##'| \
+			sed 's#/#_#g'`.html \
+			> \
+			markdown/`echo "$${url}" | \
+			sed 's#https://blog.replicant.us/##' | \
+			sed 's#/$$##'| \
+			sed 's#/#_#g'`.md ; \
+	done
+
+build:
+	haunt build
+
+help:
+	@printf "%s\n\t%s\n\t%s\n\t%s\n\t%s\n" \
+		"Available commands:" \
+		"links.txt # Create links.txt file with blog post links." \
+		" # For security reasons this file needs to be" \
+		" # reviewed manually before usage." \
+		"html # create html pages. To be done after reviewing links.txt."
+
+serve:
+	haunt serve -w -p $(HAUNT_PORT)
+
+# Archive of the generated site with fixed ownership and sorted member names.
+website.tar.gz: build
+	tar \
+		--exclude-vcs \
+		--format=gnu \
+		--owner=0 --group=0 --numeric-owner \
+		--sort=name \
+		-czf \
+		website.tar.gz \
+		site \
+		pages/img \
+		index.html \
+		--transform="s#^site#web#" \
+		--transform="s#^pages/img/#web/img/#"
diff --git a/README b/README
new file mode 100644
index 0000000..1d9e14f
--- /dev/null
+++ b/README
@@ -0,0 +1,47 @@
+Introduction
+============
+This is a set of scripts that can be used to do a semi-automatic
+migration from WordPress to haunt, a static website generator.
+
+The code is a bit fragile and parses untrusted data so a human is
+required to review if there are any suspicious data at several steps
+in the migration process.
+
+Usage
+=====
+
+First run 'make links.txt' manually to produce the links.txt file.
+Then you need to manually inspect it to see if it contains dangerous
+or problematic characters / text.
+
+Then you need to run 'make markdown' to download all the blog
+posts in html/ and convert them to markdown (and store them in
+markdown/).
+
+At this point the dates inside the file need to be fixed because at
+the time of writing haunt doesn't support the same date format as
+WordPress yet.
+
+Since the dates are again untrusted data, a human needs to review them
+with the './fix-dates.sh preview markdown/*' command.
+
+If no strange data was found they then need to be converted with
+'./fix-dates.sh fix markdown/*'.
+
+At this stage the website is then ready to be used with haunt. You can
+then build it with 'make build' or 'make serve'.
+
+License
+=======
+This project is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or (at
+your option) any later version.
+
+This project is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <https://www.gnu.org/licenses/>.
\ No newline at end of file
diff --git a/convert.py b/convert.py
new file mode 100755
index 0000000..776fe5b
--- /dev/null
+++ b/convert.py
@@ -0,0 +1,226 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# Copyright (C) 2020-2022 Denis 'GNUtoo' Carikli
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+from bs4 import BeautifulSoup
+from html2text import config, HTML2Text
+
+try:
+    # This has been removed in more recent
+    # versions of python-html2text. See commit
+    # b361467894fb277563b4547ec9d4df49f5e0c6e3
+    # (b361467 Remove support for Python ≤ 3.4)
+    # in https://github.com/Alir3z4/html2text.git
+    from html2text.utils import wrapwrite
+except ImportError:
+    pass
+
+import os
+import re
+import sh
+import sys
+
+def usage(progname):
+    print("{} path/to/file.html".format(progname))
+    sys.exit(1)
+
+# A "[1]" in the html becomes "[[1]][6]" in text.
+# As we already use references at the end a [6] would
+# be enough.
+def fix_wordpress_references_link(string):
+    open_square_bracket = re.escape('[')
+    close_square_bracket = re.escape(']')
+    whitespaces = r'\s*'
+    numbers = r'\d+'
+
+    # [ [ 1 ] ] [ 6 ]
+    wordpress_link_regex = \
+        \
+        open_square_bracket + whitespaces \
+        + open_square_bracket + whitespaces \
+        + numbers + whitespaces \
+        + close_square_bracket + whitespaces \
+        + close_square_bracket + whitespaces \
+        \
+        + open_square_bracket + whitespaces \
+        + numbers + whitespaces \
+        + close_square_bracket + whitespaces \
+
+    results = re.findall(wordpress_link_regex, string)
+
+    part_to_remove = '^' \
+        + open_square_bracket + whitespaces \
+        + open_square_bracket + whitespaces \
+        + numbers + whitespaces \
+        + close_square_bracket + whitespaces \
+        + close_square_bracket + whitespaces \
+
+    for result in results:
+        replacement = re.sub(part_to_remove, '', result)
+        string = string.replace(result, replacement)
+
+    return string
+
+# Remove the leading space matched by '^ ' from every line of the text.
+def fix_alignment(string):
+    new_string = ""
+    for line in string.split(os.linesep):
+        new_line = re.sub('^ ', '', line)
+        new_string += (new_line + os.linesep)
+
+    return new_string
+
+# Emacs breaks lists when doing a fill-paragraph to adjust a paragraph to the
+# maximum width so we make sure that there is at least one blank line before
+# the '*'
+def fix_lists(string):
+    new_string = ''
+    nr_lineseps_before_star = 0
+    for c in string:
+        if c == '*' and nr_lineseps_before_star == 1:
+            new_string += os.linesep
+
+        if c == os.linesep:
+            nr_lineseps_before_star += 1
+        else:
+            nr_lineseps_before_star = 0
+
+        new_string += c
+    return new_string
+
+# Return the 'entry-content' div of the post converted to cleaned-up markdown.
+def convert(html_file_path):
+    with open(html_file_path) as html_file:
+        try:
+            soup = BeautifulSoup(html_file, features="html5lib").article
+        except Exception:
+            try:
+                # For some reason the lxml parser isn't found with
+                # python-beautifulsoup4 4.9.3-3.0 on Parabola. It's
+                # probably better to use an html5 parser anyway as the
+                # Replicant blog (now?) uses the html doctype and the
+                # theme seems to include an html5.js file for the IE 9
+                # browser.
+                soup = BeautifulSoup(html_file, features="lxml").article
+            except Exception:
+                print("Cannot find html5lib or lxml parsers")
+                sys.exit(1)
+
+    # Format the output to be compatible with mail conventions but make sure
+    # that the links are not split between two lines
+    config.INLINE_LINKS = False
+    config.PROTECT_LINKS = True
+    config.WRAP_LIST_ITEMS = True
+    config.BODY_WIDTH = 70
+
+    parser = HTML2Text()
+
+    article = soup.find('div', class_='entry-content')
+    text = parser.handle(article.decode())
+
+    text = fix_wordpress_references_link(text)
+    text = fix_alignment(text)
+    text = fix_lists(text)
+
+    return text
+
+# Parse the whole page and return func(soup); func gets the document root.
+def _get_metadata(html_file_path, func):
+    with open(html_file_path) as html_file:
+        try:
+            soup = BeautifulSoup(html_file, features="html5lib")
+        except Exception:
+            try:
+                # For some reason the lxml parser isn't found with
+                # python-beautifulsoup4 4.9.3-3.0 on Parabola. It's
+                # probably better to use an html5 parser anyway as the
+                # Replicant blog (now?) uses the html doctype and the
+                # theme seems to include an html5.js file for the IE 9
+                # browser.
+                soup = BeautifulSoup(html_file, features="lxml")
+            except Exception:
+                print("Cannot find html5lib or lxml parsers")
+                sys.exit(1)
+    return func(soup)
+
+# Build the "date:", "title:" and "tags:" header, terminated by a "---" line.
+def get_metadata(html_file_path):
+    metadata = ""
+
+    def get_date(soup):
+        date_metadata = None
+        entries = soup.article.find_all('a')
+        for entry in entries:
+            date_elements = entry.find_all('time', class_='entry-date')
+            for date_element in date_elements:
+                if date_element.get('datetime', None):
+                    new_date = date_element['datetime']
+                    assert(date_metadata is None or date_metadata == new_date)
+                    date_metadata = new_date
+        return date_metadata
+
+    def get_tags(soup):
+        results = []
+        tags = soup.article.find_all('footer', class_='entry-meta')
+        assert(len(tags) == 1)
+        links = tags[0].find_all('a')
+        for link in links:
+            text = link.string
+            if text and text != 'permalink':
+                results.append(text)
+
+        return ', '.join(results)
+
+    def get_title(soup):
+        title = soup.title.string
+        title = title.replace(os.linesep, '')
+        title = title.replace('\t', '')
+        title = re.sub(r'\|.*', '', title)
+        title = title.strip()
+        return title
+
+    date_metadata = _get_metadata(html_file_path, get_date)
+    assert(date_metadata is not None)
+    metadata += "date: {}".format(date_metadata) + os.linesep
+
+    title_metadata = _get_metadata(html_file_path, get_title)
+    metadata += "title: {}".format(title_metadata) + os.linesep
+
+    tags_metadata = _get_metadata(html_file_path, get_tags)
+    if tags_metadata:
+        metadata += "tags: {}".format(tags_metadata) + os.linesep
+
+    metadata += "---" + os.linesep
+
+    return metadata
+
+def main():
+    if len(sys.argv) != 2:
+        usage(sys.argv[0])
+
+    html_file_path = sys.argv[1]
+
+    text = get_metadata(html_file_path)
+    text += convert(html_file_path)
+
+    try:
+        wrapwrite(text)
+    except NameError:
+        sys.stdout.write(text)
+
+if __name__ == '__main__':
+    main()
diff --git a/fix-dates.sh b/fix-dates.sh
new file mode 100755
index 0000000..74a5bd6
--- /dev/null
+++ b/fix-dates.sh
@@ -0,0 +1,60 @@
+#!/bin/sh
+# Preview or rewrite the "date:" metadata of the given markdown files.
+progname="fix-dates.sh"
+
+usage()
+{
+	progname="$1"
+	printf "Usage: %s <preview|fix> path-to-file [path-to-file [...]]\n" \
+		"${progname}"
+}
+
+get_date()
+{
+	file_path="$1"
+
+	# Print the value of the "date:" line(s) of the metadata header,
+	# i.e. everything before the first '---' line. Only the file itself
+	# is read, so the result does not depend on markdown/* existing.
+	sed -n -e '/^---$/q' -e 's/^date: \?//p' "${file_path}"
+}
+
+preview_file()
+{
+	file_path="$1"
+	date="$(get_date "${file_path}")"
+	echo "${file_path}":"${date}"
+}
+
+fix_file()
+{
+	file_path="$1"
+	date="$(get_date "${file_path}")"
+	[ -n "${date}" ] || { echo "${file_path}: no date found" >&2 ; return 1 ; }
+
+	new_date_format=$(date --date="${date}" '+%F %R') || return 1
+
+	# Rewrite only the header line; the untrusted date is never used as a regex.
+	sed "1,/^---\$/ s/^date:.*/date: ${new_date_format}/" -i "${file_path}"
+}
+
+if [ $# -lt 2 ] ; then
+	usage "${progname}"
+	exit 64
+fi
+
+command="$1"
+shift 1
+
+if [ "${command}" = "preview" ] ; then
+	for file in "$@" ; do
+		preview_file "${file}"
+	done
+elif [ "${command}" = "fix" ] ; then
+	for file in "$@" ; do
+		fix_file "${file}"
+	done
+else
+	usage "${progname}"
+	exit 64
+fi
diff --git a/get_links.py b/get_links.py
new file mode 100755
index 0000000..c605b39
--- /dev/null
+++ b/get_links.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+#
+# Copyright (C) 2020-2023 Denis 'GNUtoo' Carikli
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+from bs4 import BeautifulSoup
+# from html2text import config, HTML2Text
+
+import os
+import re
+import sys
+
+def usage(progname):
+    print("Usage: {} path/to/file.html".format(progname))
+    sys.exit(1)
+
+# Print the href of every link found inside an <h1 class="entry-title">.
+def get_article_links(html_file_path):
+    with open(html_file_path) as html_file:
+        soup = BeautifulSoup(html_file, 'html.parser')
+        entries = soup.find_all('article')
+        for entry in entries:
+            titles = entry.find_all('h1', class_='entry-title')
+            for title in titles:
+                links = title.find_all('a', href=True)
+                for link in links:
+                    print(link['href'])
+    return None
+
+def main():
+    if len(sys.argv) != 2:
+        usage(sys.argv[0])
+
+    html_file_path = sys.argv[1]
+
+    get_article_links(html_file_path)
+
+if __name__ == '__main__':
+    main()
diff --git a/haunt.scm b/haunt.scm
new file mode 100644
index 0000000..9577759
--- /dev/null
+++ b/haunt.scm
@@ -0,0 +1,27 @@
+;;; Copyright © 2023 Denis 'GNUtoo' Carikli
+;;;
+;;; This file is free software; you can redistribute it and/or modify
+;;; it under the terms of the GNU General Public License as published
+;;; by the Free Software Foundation; either version 3 of the License,
+;;; or (at your option) any later version.
+;;;
+;;; This file is distributed in the hope that it will be useful, but
+;;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;;; General Public License for more details.
+;;;
+;;; You should have received a copy of the GNU General Public License
+;;; along with this file. If not, see <https://www.gnu.org/licenses/>.
+
+(use-modules (haunt builder blog)
+             (haunt site)
+             (haunt reader commonmark))
+
+(site #:title "Replicant"
+      #:domain "blog.replicant.us"
+      #:default-metadata
+      '((author . "Replicant contributors"))
+;;      #:file-filter untitled-file-filter
+      #:posts-directory "markdown"
+      #:readers (list commonmark-reader)
+      #:builders (list (blog)))
diff --git a/lighttpd.conf b/lighttpd.conf
new file mode 100644
index 0000000..b0917fb
--- /dev/null
+++ b/lighttpd.conf
@@ -0,0 +1,31 @@
+# Copyright (C) 2023 Denis 'GNUtoo' Carikli
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+server.bind = "localhost"
+server.port = 8086
+server.document-root = var.CWD + "/site/" # start lighttpd from the repository root
+dir-listing.activate = "enable"
+index-file.names = ( "index.html" )
+mimetype.assign = (
+  ".html" => "text/html",
+  ".txt" => "text/plain",
+  ".css" => "text/css",
+  ".js" => "application/x-javascript",
+  ".jpg" => "image/jpeg",
+  ".jpeg" => "image/jpeg",
+  ".gif" => "image/gif",
+  ".png" => "image/png",
+  "" => "application/octet-stream"
+  )
-- 
cgit v1.2.3