#!/usr/bin/env python3
# encoding: utf-8
#
# Copyright (C) 2020-2024 Denis 'GNUtoo' Carikli
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
from bs4 import BeautifulSoup
from html2text import config, HTML2Text
import os
import re
import sh  # NOTE(review): appears unused in this file — confirm before removing
import sys


def usage(progname):
    """Print command-line usage on stdout."""
    print("Usage:\n\t{} <html_file_path>".format(progname))


# A "[1]" in the html becomes "[[1]][6]" in text.
# As we already uses references at the end a [6] would
# be enough.
def fix_wordpress_references_link(string):
    """Collapse wordpress-style "[[1]][6]" reference links into "[6]".

    html2text renders an in-page "[1]" link as "[[1]][6]"; since the
    references are already listed at the end, only the trailing "[6]"
    part is kept.
    """
    open_square_bracket = re.escape('[')
    close_square_bracket = re.escape(']')
    # Raw strings: '\s' and '\d' are invalid escape sequences in plain
    # string literals (SyntaxWarning since Python 3.12).
    whitespaces = r'\s*'
    numbers = r'\d+'

    # [ [ 1 ] ] [ 6 ]
    wordpress_link_regex = (
        open_square_bracket + whitespaces
        + open_square_bracket + whitespaces
        + numbers + whitespaces
        + close_square_bracket + whitespaces
        + close_square_bracket + whitespaces

        + open_square_bracket + whitespaces
        + numbers + whitespaces
        + close_square_bracket + whitespaces
    )

    results = re.findall(wordpress_link_regex, string)

    # The leading "[[1]]" half of each match is what gets stripped.
    part_to_remove = (
        '^'
        + open_square_bracket + whitespaces
        + open_square_bracket + whitespaces
        + numbers + whitespaces
        + close_square_bracket + whitespaces
        + close_square_bracket + whitespaces
    )

    for result in results:
        replacement = re.sub(part_to_remove, '', result)
        string = string.replace(result, replacement)

    return string


def fix_alignment(string):
    """Strip the leading indentation html2text adds to each line.

    NOTE(review): the original pattern appears as '^ ' (one space) in the
    source; confirm it was not meant to strip a wider indent.
    """
    new_string = ""
    for line in string.split(os.linesep):
        new_line = re.sub('^ ', '', line)
        new_string += (new_line + os.linesep)
    return new_string


# Emacs breaks lists when doing a fill-paragraph to adjust a paragraph to the
# maximum width so we make sure that there is at least one blank line before
# the '*'
def fix_lists(string):
    """Insert a blank line before list bullets that follow a single newline.

    NOTE(review): the per-character comparison with os.linesep assumes a
    one-character line separator; it cannot match '\\r\\n' on Windows —
    confirm this tool only targets POSIX platforms.
    """
    new_string = ''
    nr_lineseps_before_star = 0
    for c in string:
        # Exactly one preceding newline means the '*' is glued to the
        # previous paragraph: add the missing blank line.
        if c == '*' and nr_lineseps_before_star == 1:
            new_string += os.linesep
        if c == os.linesep:
            nr_lineseps_before_star += 1
        else:
            nr_lineseps_before_star = 0
        new_string += c
    return new_string


def fix_right_single_quotation_mark(string):
    """Replace the typographic right single quotation mark with "'"."""
    return string.replace('’', '\'')


def fix_dashes(string):
    """Replace typographic em/en dashes with a plain ASCII hyphen."""
    string = string.replace('—', '-')  # em dash
    string = string.replace('–', '-')  # en dash
    return string


# Some links are broken: they start in one line and continue on the next
# (the line ends with '-'), so the continuation isn't interpreted as being
# part of the link. Re-join such split link lines.
def fix_broken_links(string, protocol):
    """Re-join '<http(s)://...' links that were wrapped across two lines.

    A line containing '<protocol://' and ending with '-' is merged with
    the following line. Two consecutive such lines are not handled (TODO).
    """
    assert string
    assert protocol in ['http', 'https']

    skip_line_break = False
    lines = []
    prev_line = None
    for line in string.split(os.linesep):
        if '<{}://'.format(protocol) in line and line.endswith('-') and \
           not skip_line_break:
            # Hold this line back; it will be joined with the next one.
            skip_line_break = True
        elif '<{}://'.format(protocol) in line and line.endswith('-') and \
                skip_line_break:
            assert False  # TODO: two consecutive wrapped link lines
        elif skip_line_break:
            skip_line_break = False
            lines.append(prev_line + line)
        else:
            lines.append(line)
        prev_line = line

    return os.linesep.join(lines)


def convert(html_file_path):
    """Convert the wordpress article in html_file_path to plain text."""
    with open(html_file_path) as html_file:
        soup = BeautifulSoup(html_file, features="html5lib").article

    # Format the output to be compatible with mail conventions but make sure
    # that the links are not split between two lines
    config.INLINE_LINKS = False
    config.PROTECT_LINKS = True
    config.WRAP_LIST_ITEMS = True
    config.BODY_WIDTH = 70

    parser = HTML2Text()
    article = soup.find('div', class_='entry-content')
    text = parser.handle(article.decode())
    text = fix_wordpress_references_link(text)
    text = fix_alignment(text)
    text = fix_lists(text)
    text = fix_right_single_quotation_mark(text)
    text = fix_dashes(text)
    text = fix_broken_links(text, 'http')
    text = fix_broken_links(text, 'https')
    return text


def _get_metadata(html_file_path, func):
    """Parse html_file_path and return func(soup) for the parsed document."""
    with open(html_file_path) as html_file:
        soup = BeautifulSoup(html_file, features="html5lib")
        return func(soup)


def get_metadata(html_file_path):
    """Build the "key: value" metadata header block for the article."""
    metadata = ""

    def get_date(soup):
        # Article date, taken from <time class="entry-date" datetime=...>;
        # all occurrences are asserted to agree.
        date_metadata = None
        entries = soup.article.find_all('a')
        for entry in entries:
            date_elements = entry.find_all('time', class_='entry-date')
            for date_element in date_elements:
                if date_element.get('datetime', None):
                    new_date = date_element['datetime']
                    assert date_metadata is None or date_metadata == new_date
                    date_metadata = new_date
        return date_metadata

    def get_tags(soup):
        # Tags are the footer entry-meta links, minus the 'permalink' one.
        results = []
        tags = soup.article.find_all('footer', class_='entry-meta')
        assert len(tags) == 1
        links = tags[0].find_all('a')
        for link in links:
            text = link.string
            if text != 'permalink':
                results.append(text)
        return ', '.join(results)

    def get_author(soup):
        author_vcard = soup.article.find_all('span', class_='author vcard')
        assert len(author_vcard) == 1
        link = author_vcard[0].find_all('a')
        assert len(link) == 1
        return link[0].string

    # Returns SPDX license declaration.
    def get_licenses(soup):
        if get_author(soup) == 'GNUtoo':
            return 'CC-BY-3.0 OR CC-BY-4.0'
        # I got the agreement of dllud though XMPP on the 1 March 2024:
        # : Also You wrote several blog posts, do you
        # agree to license them under both CC-BY-SA 3.0 (unported) and
        # CC-BY-SA 4.0 (international) ?
        # : yes I do agree
        elif get_author(soup) == 'dllud':
            return 'CC-BY-3.0 OR CC-BY-4.0'
        else:
            return None

    def get_title(soup):
        # Drop the '| site name' suffix wordpress appends to <title>.
        title = soup.title.string
        title = title.replace(os.linesep, '')
        title = title.replace('\t', '')
        title = re.sub(r'\|.*', '', title)
        title = title.strip()
        return title

    date_metadata = _get_metadata(html_file_path, get_date)
    assert date_metadata is not None
    metadata += "date: {}".format(date_metadata) + os.linesep

    title_metadata = _get_metadata(html_file_path, get_title)
    metadata += "title: {}".format(title_metadata) + os.linesep

    authors_metadata = _get_metadata(html_file_path, get_author)
    metadata += "authors: {}".format(authors_metadata) + os.linesep

    tags_metadata = _get_metadata(html_file_path, get_tags)
    if tags_metadata:
        metadata += "tags: {}".format(tags_metadata) + os.linesep

    licenses_metadata = _get_metadata(html_file_path, get_licenses)
    if licenses_metadata:
        metadata += "licenses: {}".format(licenses_metadata) + os.linesep

    metadata += "---" + os.linesep

    return metadata


def main():
    if len(sys.argv) != 2:
        usage(sys.argv[0])
        sys.exit(os.EX_USAGE)

    html_file_path = sys.argv[1]
    text = get_metadata(html_file_path)
    text += convert(html_file_path)
    sys.stdout.write(text)


if __name__ == '__main__':
    main()