From f0be43de02a1e07308d3d95408349c3c7f973430 Mon Sep 17 00:00:00 2001
From: Raph Levien <raph@google.com>
Date: Thu, 27 Aug 2015 13:50:00 -0700
Subject: Binary format for hyphenation patterns

In the current state, hyphenation in all languages than Sanskrit seems
to work (case-folding edge cases). Thus, we just disable Sanskrit.
Packed tries are implemented, but not the finite state machine
(space/speed tradeoff).

This commit contains a throw-away test app, which runs on the host.
I think I want to replace it with unit tests, but I'm including it in
the CL because it's useful during development.

Bug: 21562869
Bug: 21826930
Bug: 23317038
Bug: 23317904

Change-Id: I7479a565a4a062fa319651c2c14c0fa18c5ceaea
---
 tools/mk_hyb_file.py | 567 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 567 insertions(+)
 create mode 100755 tools/mk_hyb_file.py

(limited to 'tools')

diff --git a/tools/mk_hyb_file.py b/tools/mk_hyb_file.py
new file mode 100755
index 0000000..c078454
--- /dev/null
+++ b/tools/mk_hyb_file.py
@@ -0,0 +1,567 @@
+#!/usr/bin/env python
+
+# Copyright (C) 2015 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the 'License');
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an 'AS IS' BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Convert hyphen files in standard TeX format (a trio of pat, chr, and hyp)
+into binary format. See doc/hyb_file_format.md for more information.
+
+Usage: mk_hyb_file.py [-v] hyph-foo.pat.txt hyph-foo.hyb
+
+Optional -v parameter turns on verbose debugging.
+
+"""
+
+from __future__ import print_function
+
+import io
+import sys
+import struct
+import math
+import getopt
+
+
+VERBOSE = False
+
+
+if sys.version_info[0] >= 3:
+    def unichr(x):
+        return chr(x)
+
+
+# number of bits required to represent numbers up to n inclusive
+def num_bits(n):
+    return 1 + int(math.log(n, 2)) if n > 0 else 0
+
+
+class Node:
+
+    def __init__(self):
+        self.succ = {}
+        self.res = None
+        self.fsm_pat = None
+        self.fail = None
+
+
+# List of free slots, implemented as doubly linked list
+class Freelist:
+
+    def __init__(self):
+        self.first = None
+        self.last = None
+        self.pred = []
+        self.succ = []
+
+    def grow(self):
+        this = len(self.pred)
+        self.pred.append(self.last)
+        self.succ.append(None)
+        if self.last is None:
+            self.first = this
+        else:
+            self.succ[self.last] = this
+        self.last = this
+
+    def next(self, cursor):
+        if cursor == 0:
+            cursor = self.first
+        if cursor is None:
+            self.grow()
+            result = self.last
+        else:
+            result = cursor
+        return result, self.succ[result]
+
+    def is_free(self, ix):
+        while ix >= len(self.pred):
+            self.grow()
+        return self.pred[ix] != -1
+
+    def use(self, ix):
+        if self.pred[ix] is None:
+            self.first = self.succ[ix]
+        else:
+            self.succ[self.pred[ix]] = self.succ[ix]
+        if self.succ[ix] is None:
+            self.last = self.pred[ix]
+        else:
+            self.pred[self.succ[ix]] = self.pred[ix]
+        if self.pred[ix] == -1:
+            assert self.pred[ix] != -1, 'double free!'
+        self.pred[ix] = -1
+
+
+def combine(a, b):
+    if a is None: return b
+    if b is None: return a
+    if len(b) < len(a): a, b = b, a
+    res = b[:len(b) - len(a)]
+    for i in range(len(a)):
+        res.append(max(a[i], b[i + len(b) - len(a)]))
+    return res
+
+
+def trim(pattern):
+    for ix in range(len(pattern)):
+        if pattern[ix] != 0:
+            return pattern[ix:]
+
+
+def pat_to_binary(pattern):
+    return b''.join(struct.pack('B', x) for x in pattern)
+
+
+class Hyph:
+
+    def __init__(self):
+        self.root = Node()
+        self.root.str = '<root>'
+        self.node_list = [self.root]
+
+    # Add a pattern (word fragment with numeric codes, such as ".ad4der")
+    def add_pat(self, pat):
+        lastWasLetter = False
+        haveSeenNumber = False
+        result = []
+        word = ''
+        for c in pat:
+            if c.isdigit():
+                result.append(int(c))
+                lastWasLetter = False
+                haveSeenNumber = True
+            else:
+                word += c
+                if lastWasLetter and haveSeenNumber:
+                    result.append(0)
+                lastWasLetter = True
+        if lastWasLetter:
+            result.append(0)
+
+        self.add_word_res(word, result)
+
+    # Add an exception (word with hyphens, such as "ta-ble")
+    def add_exception(self, hyph_word):
+        res = []
+        word = ['.']
+        need_10 = False
+        for c in hyph_word:
+            if c == '-':
+                res.append(11)
+                need_10 = False
+            else:
+                if need_10:
+                    res.append(10)
+                word.append(c)
+                need_10 = True
+        word.append('.')
+        res.append(0)
+        res.append(0)
+        if VERBOSE:
+            print(word, res)
+        self.add_word_res(''.join(word), res)
+
+    def add_word_res(self, word, result):
+        if VERBOSE:
+            print(word, result)
+
+        t = self.root
+        s = ''
+        for c in word:
+            s += c
+            if c not in t.succ:
+                new_node = Node()
+                new_node.str = s
+                self.node_list.append(new_node)
+                t.succ[c] = new_node
+            t = t.succ[c]
+        t.res = result
+
+    def pack(self, node_list, ch_map, use_node=False):
+        size = 0
+        self.node_map = {}
+        nodes = Freelist()
+        edges = Freelist()
+        edge_start = 1 if use_node else 0
+        for node in node_list:
+            succ = sorted([ch_map[c] + edge_start for c in node.succ.keys()])
+            if len(succ):
+                cursor = 0
+                while True:
+                    edge_ix, cursor = edges.next(cursor)
+                    ix = edge_ix - succ[0]
+                    if (ix >= 0 and nodes.is_free(ix) and
+                            all(edges.is_free(ix + s) for s in succ) and
+                            ((not use_node) or edges.is_free(ix))):
+                        break
+            elif use_node:
+                ix, _ = edges.next(0)
+                nodes.is_free(ix)  # actually don't need nodes at all when use_node,
+                # but keep it happy
+            else:
+                ix, _ = nodes.next(0)
+            node.ix = ix
+            self.node_map[ix] = node
+            nodes.use(ix)
+            size = max(size, ix)
+            if use_node:
+                edges.use(ix)
+            for s in succ:
+                edges.use(ix + s)
+        size += max(ch_map.values()) + 1
+        return size
+
+    # return list of nodes in bfs order
+    def bfs(self, ch_map):
+        result = [self.root]
+        ix = 0
+        while ix < len(result):
+            node = result[ix]
+            node.bfs_ix = ix
+            mapped = {}
+            for c, next in node.succ.items():
+                assert ch_map[c] not in mapped, 'duplicate edge ' + node.str + ' ' + hex(ord(c))
+                mapped[ch_map[c]] = next
+            for i in sorted(mapped.keys()):
+                result.append(mapped[i])
+            ix += 1
+        self.bfs_order = result
+        return result
+
+    # suffix compression - convert the trie into an acyclic digraph, merging nodes when
+    # the subtries are identical
+    def dedup(self):
+        uniques = []
+        dupmap = {}
+        dedup_ix = [0] * len(self.bfs_order)
+        for ix in reversed(range(len(self.bfs_order))):
+            # construct string representation of node
+            node = self.bfs_order[ix]
+            if node.res is None:
+                s = ''
+            else:
+                s = ''.join(str(c) for c in node.res)
+            for c in sorted(node.succ.keys()):
+                succ = node.succ[c]
+                s += ' ' + c + str(dedup_ix[succ.bfs_ix])
+            if s in dupmap:
+                dedup_ix[ix] = dupmap[s]
+            else:
+                uniques.append(node)
+                dedup_ix[ix] = ix
+            dupmap[s] = dedup_ix[ix]
+        uniques.reverse()
+        print(len(uniques), 'unique nodes,', len(self.bfs_order), 'total')
+        return dedup_ix, uniques
+
+
+# load the ".pat" file, which contains patterns such as a1b2c3
+def load(fn):
+    hyph = Hyph()
+    with io.open(fn, encoding='UTF-8') as f:
+        for l in f:
+            pat = l.strip()
+            hyph.add_pat(pat)
+    return hyph
+
+
+# load the ".chr" file, which contains the alphabet and case pairs, eg "aA", "bB" etc.
+def load_chr(fn):
+    ch_map = {'.': 0}
+    with io.open(fn, encoding='UTF-8') as f:
+        for i, l in enumerate(f):
+            l = l.strip()
+            if len(l) > 2:
+                # lowercase maps to multi-character uppercase sequence, ignore uppercase for now
+                l = l[:1]
+            else:
+                assert len(l) == 2, 'expected 2 chars in chr'
+            for c in l:
+                ch_map[c] = i + 1
+    return ch_map
+
+
+# load exceptions with explicit hyphens
+def load_hyp(hyph, fn):
+    with io.open(fn, encoding='UTF-8') as f:
+        for l in f:
+            hyph.add_exception(l.strip())
+
+
+def generate_header(alphabet, trie, pattern):
+    alphabet_off = 6 * 4
+    trie_off = alphabet_off + len(alphabet)
+    pattern_off = trie_off + len(trie)
+    file_size = pattern_off + len(pattern)
+    data = [0x62ad7968, 0, alphabet_off, trie_off, pattern_off, file_size]
+    return struct.pack('<6I', *data)
+
+
+def generate_alphabet(ch_map):
+    ch_map = ch_map.copy()
+    del ch_map['.']
+    min_ch = ord(min(ch_map))
+    max_ch = ord(max(ch_map))
+    if max_ch - min_ch < 1024 and max(ch_map.values()) < 256:
+        # generate format 0
+        data = [0] * (max_ch - min_ch + 1)
+        for c, val in ch_map.items():
+            data[ord(c) - min_ch] = val
+        result = [struct.pack('<3I', 0, min_ch, max_ch + 1)]
+        for b in data:
+            result.append(struct.pack('<B', b))
+    else:
+        # generate format 1
+        assert max(ch_map.values()) < 2048, 'max number of unique characters exceeded'
+        result = [struct.pack('<2I', 1, len(ch_map))]
+        for c, val in sorted(ch_map.items()):
+            data = (ord(c) << 11) | val
+            result.append(struct.pack('<I', data))
+    binary = b''.join(result)
+    if len(binary) % 4 != 0:
+        binary += b'\x00' * (4 - len(binary) % 4)
+    return binary
+
+
+# assumes hyph structure has been packed, ie node.ix values have been set
+def generate_trie(hyph, ch_map, n_trie, dedup_ix, dedup_nodes, patmap):
+    ch_array = [0] * n_trie
+    link_array = [0] * n_trie
+    pat_array = [0] * n_trie
+    link_shift = num_bits(max(ch_map.values()))
+    char_mask = (1 << link_shift) - 1
+    pattern_shift = link_shift + num_bits(n_trie - 1)
+    link_mask = (1 << pattern_shift) - (1 << link_shift)
+    result = [struct.pack('<6I', 0, char_mask, link_shift, link_mask, pattern_shift, n_trie)]
+
+    for node in dedup_nodes:
+        ix = node.ix
+        if node.res is not None:
+            pat_array[ix] = patmap[pat_to_binary(node.res)]
+        for c, next in node.succ.items():
+            c_num = ch_map[c]
+            link_ix = ix + c_num
+            ch_array[link_ix] = c_num
+            if dedup_ix is None:
+                dedup_next = next
+            else:
+                dedup_next = hyph.bfs_order[dedup_ix[next.bfs_ix]]
+            link_array[link_ix] = dedup_next.ix
+
+    for i in range(n_trie):
+        #print((pat_array[i], link_array[i], ch_array[i]))
+        packed = (pat_array[i] << pattern_shift) | (link_array[i] << link_shift) | ch_array[i]
+        result.append(struct.pack('<I', packed))
+    return b''.join(result)
+
+
+def generate_pattern(pats):
+    pat_array = [0]
+    patmap = {b'': 0}
+
+    raw_pat_array = []
+    raw_pat_size = 0
+    raw_patmap = {}
+
+    for pat in pats:
+        if pat is None:
+            continue
+        pat_str = pat_to_binary(pat)
+        if pat_str not in patmap:
+            shift = 0
+            while shift < len(pat) and pat[len(pat) - shift - 1] == 0:
+                shift += 1
+            rawpat = pat_str[:len(pat) - shift]
+            if rawpat not in raw_patmap:
+                raw_patmap[rawpat] = raw_pat_size
+                raw_pat_array.append(rawpat)
+                raw_pat_size += len(rawpat)
+            data = (len(rawpat) << 26) | (shift << 20) | raw_patmap[rawpat]
+            patmap[pat_str] = len(pat_array)
+            pat_array.append(data)
+    data = [0, len(pat_array), 16 + 4 * len(pat_array), raw_pat_size]
+    result = [struct.pack('<4I', *data)]
+    for x in pat_array:
+        result.append(struct.pack('<I', x))
+    result.extend(raw_pat_array)
+    return patmap, b''.join(result)
+
+
+def generate_hyb_file(hyph, ch_map, hyb_fn):
+    bfs = hyph.bfs(ch_map)
+    dedup_ix, dedup_nodes = hyph.dedup()
+    n_trie = hyph.pack(dedup_nodes, ch_map)
+    alphabet = generate_alphabet(ch_map)
+    patmap, pattern = generate_pattern([n.res for n in hyph.node_list])
+    trie = generate_trie(hyph, ch_map, n_trie, dedup_ix, dedup_nodes, patmap)
+    header = generate_header(alphabet, trie, pattern)
+
+    with open(hyb_fn, 'wb') as f:
+        f.write(header)
+        f.write(alphabet)
+        f.write(trie)
+        f.write(pattern)
+
+
+# Verify that the file contains the same lines as the lines argument, in arbitrary order
+def verify_file_sorted(lines, fn):
+    file_lines = [l.strip() for l in io.open(fn)]
+    line_set = set(lines)
+    file_set = set(file_lines)
+    if line_set == file_set:
+        return True
+    for line in line_set - file_set:
+        print(repr(line) + ' in reconstruction, not in file')
+    for line in file_set - line_set:
+        print(repr(line) + ' in file, not in reconstruction')
+    return False
+
+
+def map_to_chr(alphabet_map):
+    result = []
+    ch_map = {}
+    for val in alphabet_map.values():
+        chs = [ch for ch in alphabet_map if alphabet_map[ch] == val]
+        # non-cased characters (like Ethopic) are in both, matching chr file
+        lowercase = [ch for ch in chs if not ch.isupper()]
+        uppercase = [ch for ch in chs if not ch.islower()]
+        # print(val, `lowercase`, `uppercase`)
+        assert len(lowercase) == 1, 'expected 1 lowercase character'
+        assert 0 <= len(uppercase) <= 1, 'expected 0 or 1 uppercase character'
+        ch_map[val] = lowercase[0]
+        result.append(''.join(lowercase + uppercase))
+    ch_map[0] = '.'
+    return (ch_map, result)
+
+
+def get_pattern(pattern_data, ix):
+    pattern_offset = struct.unpack('<I', pattern_data[8:12])[0]
+    entry = struct.unpack('<I', pattern_data[16 + ix * 4: 16 + ix * 4 + 4])[0]
+    pat_len = entry >> 26
+    pat_shift = (entry >> 20) & 0x1f
+    offset = pattern_offset + (entry & 0xfffff)
+    return pattern_data[offset: offset + pat_len] + b'\0' * pat_shift
+
+
+def traverse_trie(ix, s, trie_data, ch_map, pattern_data, patterns, exceptions):
+    (char_mask, link_shift, link_mask, pattern_shift) = struct.unpack('<4I', trie_data[4:20])
+    node_entry = struct.unpack('<I', trie_data[24 + ix * 4: 24 + ix * 4 + 4])[0]
+    pattern = node_entry >> pattern_shift
+    if pattern:
+        result = []
+        is_exception = False
+        pat = get_pattern(pattern_data, pattern)
+        for i in range(len(s) + 1):
+            pat_off = i - 1 + len(pat) - len(s)
+            if pat_off < 0:
+                code = 0
+            else:
+                code = struct.unpack('B', pat[pat_off : pat_off + 1])[0]
+            if 1 <= code <= 9:
+                result.append('%d' % code)
+            elif code == 10:
+                is_exception = True
+            elif code == 11:
+                result.append('-')
+                is_exception = True
+            else:
+                assert code == 0, 'unexpected code'
+            if i < len(s):
+                result.append(s[i])
+        pat_str = ''.join(result)
+        #print(`pat_str`, `pat`)
+        if is_exception:
+            assert pat_str[0] == '.', "expected leading '.'"
+            assert pat_str[-1] == '.', "expected trailing '.'"
+            exceptions.append(pat_str[1:-1])  # strip leading and trailing '.'
+        else:
+            patterns.append(pat_str)
+    for ch in ch_map:
+        edge_entry = struct.unpack('<I', trie_data[24 + (ix + ch) * 4: 24 + (ix + ch) * 4 + 4])[0]
+        link = (edge_entry & link_mask) >> link_shift
+        if link != 0 and ch == (edge_entry & char_mask):
+            sch = s + ch_map[ch]
+            traverse_trie(link, sch, trie_data, ch_map, pattern_data, patterns, exceptions)
+
+
+# Verify the generated binary file by reconstructing the textual representations
+# from the binary hyb file, then checking that they're identical (mod the order of
+# lines within the file, which is irrelevant). This function makes assumptions that
+# are stronger than absolutely necessary (in particular, that the patterns are in
+# lowercase as defined by python islower).
+def verify_hyb_file(hyb_fn, pat_fn, chr_fn, hyp_fn):
+    with open(hyb_fn, 'rb') as f:
+        hyb_data = f.read()
+    header = hyb_data[0: 6 * 4]
+    (magic, version, alphabet_off, trie_off, pattern_off, file_size) = struct.unpack('<6I', header)
+    alphabet_data = hyb_data[alphabet_off:trie_off]
+    trie_data = hyb_data[trie_off:pattern_off]
+    pattern_data = hyb_data[pattern_off:file_size]
+
+    # reconstruct alphabet table
+    alphabet_version = struct.unpack('<I', alphabet_data[:4])[0]
+    alphabet_map = {}
+    if alphabet_version == 0:
+        (min_ch, max_ch) = struct.unpack('<2I', alphabet_data[4:12])
+        for ch in range(min_ch, max_ch):
+            offset = 12 + ch - min_ch
+            b = struct.unpack('B', alphabet_data[offset : offset + 1])[0]
+            if b != 0:
+                alphabet_map[unichr(ch)] = b
+    else:
+        assert alphabet_version == 1
+        n_entries = struct.unpack('<I', alphabet_data[4:8])[0]
+        for i in range(n_entries):
+            entry = struct.unpack('<I', alphabet_data[8 + 4 * i: 8 + 4 * i + 4])[0]
+            alphabet_map[unichr(entry >> 11)] = entry & 0x7ff
+
+    ch_map, reconstructed_chr = map_to_chr(alphabet_map)
+
+    # EXCEPTION for Armenian (hy), we don't really deal with the uppercase form of U+0587
+    if u'\u0587' in reconstructed_chr:
+        reconstructed_chr.remove(u'\u0587')
+        reconstructed_chr.append(u'\u0587\u0535\u0552')
+
+    assert verify_file_sorted(reconstructed_chr, chr_fn), 'alphabet table not verified'
+
+    # reconstruct trie
+    patterns = []
+    exceptions = []
+    traverse_trie(0, '', trie_data, ch_map, pattern_data, patterns, exceptions)
+    assert verify_file_sorted(patterns, pat_fn), 'pattern table not verified'
+    assert verify_file_sorted(exceptions, hyp_fn), 'exception table not verified'
+
+
+def main():
+    global VERBOSE
+    try:
+        opts, args = getopt.getopt(sys.argv[1:], 'v')
+    except getopt.GetoptError as err:
+        print(str(err))
+        sys.exit(1)
+    for o, _ in opts:
+        if o == '-v':
+            VERBOSE = True
+    pat_fn, out_fn = args
+    hyph = load(pat_fn)
+    if pat_fn.endswith('.pat.txt'):
+        chr_fn = pat_fn[:-8] + '.chr.txt'
+        ch_map = load_chr(chr_fn)
+        hyp_fn = pat_fn[:-8] + '.hyp.txt'
+        load_hyp(hyph, hyp_fn)
+        generate_hyb_file(hyph, ch_map, out_fn)
+        verify_hyb_file(out_fn, pat_fn, chr_fn, hyp_fn)
+
+if __name__ == '__main__':
+    main()
-- 
cgit v1.2.3