diff options
Diffstat (limited to 'tools')
-rwxr-xr-x | tools/mk_hyb_file.py | 567 |
1 files changed, 567 insertions, 0 deletions
diff --git a/tools/mk_hyb_file.py b/tools/mk_hyb_file.py new file mode 100755 index 0000000..c078454 --- /dev/null +++ b/tools/mk_hyb_file.py @@ -0,0 +1,567 @@ +#!/usr/bin/env python + +# Copyright (C) 2015 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Convert hyphen files in standard TeX format (a trio of pat, chr, and hyp) +into binary format. See doc/hyb_file_format.md for more information. + +Usage: mk_hyb_file.py [-v] hyph-foo.pat.txt hyph-foo.hyb + +Optional -v parameter turns on verbose debugging. + +""" + +from __future__ import print_function + +import io +import sys +import struct +import math +import getopt + + +VERBOSE = False + + +if sys.version_info[0] >= 3: + def unichr(x): + return chr(x) + + +# number of bits required to represent numbers up to n inclusive +def num_bits(n): + return 1 + int(math.log(n, 2)) if n > 0 else 0 + + +class Node: + + def __init__(self): + self.succ = {} + self.res = None + self.fsm_pat = None + self.fail = None + + +# List of free slots, implemented as doubly linked list +class Freelist: + + def __init__(self): + self.first = None + self.last = None + self.pred = [] + self.succ = [] + + def grow(self): + this = len(self.pred) + self.pred.append(self.last) + self.succ.append(None) + if self.last is None: + self.first = this + else: + self.succ[self.last] = this + self.last = this + + def next(self, cursor): + if cursor == 0: + cursor = self.first + if cursor is None: + self.grow() + result = self.last + else: + result = cursor + return result, self.succ[result] + + def is_free(self, ix): + while ix >= len(self.pred): + self.grow() + return self.pred[ix] != -1 + + def use(self, ix): + if self.pred[ix] is None: + self.first = self.succ[ix] + else: + self.succ[self.pred[ix]] = self.succ[ix] + if self.succ[ix] is None: + self.last = self.pred[ix] + else: + self.pred[self.succ[ix]] = self.pred[ix] + if self.pred[ix] == -1: + assert self.pred[ix] != -1, 'double free!' + self.pred[ix] = -1 + + +def combine(a, b): + if a is None: return b + if b is None: return a + if len(b) < len(a): a, b = b, a + res = b[:len(b) - len(a)] + for i in range(len(a)): + res.append(max(a[i], b[i + len(b) - len(a)])) + return res + + +def trim(pattern): + for ix in range(len(pattern)): + if pattern[ix] != 0: + return pattern[ix:] + + +def pat_to_binary(pattern): + return b''.join(struct.pack('B', x) for x in pattern) + + +class Hyph: + + def __init__(self): + self.root = Node() + self.root.str = '<root>' + self.node_list = [self.root] + + # Add a pattern (word fragment with numeric codes, such as ".ad4der") + def add_pat(self, pat): + lastWasLetter = False + haveSeenNumber = False + result = [] + word = '' + for c in pat: + if c.isdigit(): + result.append(int(c)) + lastWasLetter = False + haveSeenNumber = True + else: + word += c + if lastWasLetter and haveSeenNumber: + result.append(0) + lastWasLetter = True + if lastWasLetter: + result.append(0) + + self.add_word_res(word, result) + + # Add an exception (word with hyphens, such as "ta-ble") + def add_exception(self, hyph_word): + res = [] + word = ['.'] + need_10 = False + for c in hyph_word: + if c == '-': + res.append(11) + need_10 = False + else: + if need_10: + res.append(10) + word.append(c) + need_10 = True + word.append('.') + res.append(0) + res.append(0) + if VERBOSE: + print(word, res) + self.add_word_res(''.join(word), res) + + def add_word_res(self, word, result): + if VERBOSE: + print(word, result) + + t = self.root + s = '' + for c in word: + s += c + if c not in t.succ: + new_node = Node() + new_node.str = s + self.node_list.append(new_node) + t.succ[c] = new_node + t = t.succ[c] + t.res = result + + def pack(self, node_list, ch_map, use_node=False): + size = 0 + self.node_map = {} + nodes = Freelist() + edges = Freelist() + edge_start = 1 if use_node else 0 + for node in node_list: + succ = sorted([ch_map[c] + edge_start for c in node.succ.keys()]) + if len(succ): + cursor = 0 + while True: + edge_ix, cursor = edges.next(cursor) + ix = edge_ix - succ[0] + if (ix >= 0 and nodes.is_free(ix) and + all(edges.is_free(ix + s) for s in succ) and + ((not use_node) or edges.is_free(ix))): + break + elif use_node: + ix, _ = edges.next(0) + nodes.is_free(ix) # actually don't need nodes at all when use_node, + # but keep it happy + else: + ix, _ = nodes.next(0) + node.ix = ix + self.node_map[ix] = node + nodes.use(ix) + size = max(size, ix) + if use_node: + edges.use(ix) + for s in succ: + edges.use(ix + s) + size += max(ch_map.values()) + 1 + return size + + # return list of nodes in bfs order + def bfs(self, ch_map): + result = [self.root] + ix = 0 + while ix < len(result): + node = result[ix] + node.bfs_ix = ix + mapped = {} + for c, next in node.succ.items(): + assert ch_map[c] not in mapped, 'duplicate edge ' + node.str + ' ' + hex(ord(c)) + mapped[ch_map[c]] = next + for i in sorted(mapped.keys()): + result.append(mapped[i]) + ix += 1 + self.bfs_order = result + return result + + # suffix compression - convert the trie into an acyclic digraph, merging nodes when + # the subtries are identical + def dedup(self): + uniques = [] + dupmap = {} + dedup_ix = [0] * len(self.bfs_order) + for ix in reversed(range(len(self.bfs_order))): + # construct string representation of node + node = self.bfs_order[ix] + if node.res is None: + s = '' + else: + s = ''.join(str(c) for c in node.res) + for c in sorted(node.succ.keys()): + succ = node.succ[c] + s += ' ' + c + str(dedup_ix[succ.bfs_ix]) + if s in dupmap: + dedup_ix[ix] = dupmap[s] + else: + uniques.append(node) + dedup_ix[ix] = ix + dupmap[s] = dedup_ix[ix] + uniques.reverse() + print(len(uniques), 'unique nodes,', len(self.bfs_order), 'total') + return dedup_ix, uniques + + +# load the ".pat" file, which contains patterns such as a1b2c3 +def load(fn): + hyph = Hyph() + with io.open(fn, encoding='UTF-8') as f: + for l in f: + pat = l.strip() + hyph.add_pat(pat) + return hyph + + +# load the ".chr" file, which contains the alphabet and case pairs, eg "aA", "bB" etc. +def load_chr(fn): + ch_map = {'.': 0} + with io.open(fn, encoding='UTF-8') as f: + for i, l in enumerate(f): + l = l.strip() + if len(l) > 2: + # lowercase maps to multi-character uppercase sequence, ignore uppercase for now + l = l[:1] + else: + assert len(l) == 2, 'expected 2 chars in chr' + for c in l: + ch_map[c] = i + 1 + return ch_map + + +# load exceptions with explicit hyphens +def load_hyp(hyph, fn): + with io.open(fn, encoding='UTF-8') as f: + for l in f: + hyph.add_exception(l.strip()) + + +def generate_header(alphabet, trie, pattern): + alphabet_off = 6 * 4 + trie_off = alphabet_off + len(alphabet) + pattern_off = trie_off + len(trie) + file_size = pattern_off + len(pattern) + data = [0x62ad7968, 0, alphabet_off, trie_off, pattern_off, file_size] + return struct.pack('<6I', *data) + + +def generate_alphabet(ch_map): + ch_map = ch_map.copy() + del ch_map['.'] + min_ch = ord(min(ch_map)) + max_ch = ord(max(ch_map)) + if max_ch - min_ch < 1024 and max(ch_map.values()) < 256: + # generate format 0 + data = [0] * (max_ch - min_ch + 1) + for c, val in ch_map.items(): + data[ord(c) - min_ch] = val + result = [struct.pack('<3I', 0, min_ch, max_ch + 1)] + for b in data: + result.append(struct.pack('<B', b)) + else: + # generate format 1 + assert max(ch_map.values()) < 2048, 'max number of unique characters exceeded' + result = [struct.pack('<2I', 1, len(ch_map))] + for c, val in sorted(ch_map.items()): + data = (ord(c) << 11) | val + result.append(struct.pack('<I', data)) + binary = b''.join(result) + if len(binary) % 4 != 0: + binary += b'\x00' * (4 - len(binary) % 4) + return binary + + +# assumes hyph structure has been packed, ie node.ix values have been set +def generate_trie(hyph, ch_map, n_trie, dedup_ix, dedup_nodes, patmap): + ch_array = [0] * n_trie + link_array = [0] * n_trie + pat_array = [0] * n_trie + link_shift = num_bits(max(ch_map.values())) + char_mask = (1 << link_shift) - 1 + pattern_shift = link_shift + num_bits(n_trie - 1) + link_mask = (1 << pattern_shift) - (1 << link_shift) + result = [struct.pack('<6I', 0, char_mask, link_shift, link_mask, pattern_shift, n_trie)] + + for node in dedup_nodes: + ix = node.ix + if node.res is not None: + pat_array[ix] = patmap[pat_to_binary(node.res)] + for c, next in node.succ.items(): + c_num = ch_map[c] + link_ix = ix + c_num + ch_array[link_ix] = c_num + if dedup_ix is None: + dedup_next = next + else: + dedup_next = hyph.bfs_order[dedup_ix[next.bfs_ix]] + link_array[link_ix] = dedup_next.ix + + for i in range(n_trie): + #print((pat_array[i], link_array[i], ch_array[i])) + packed = (pat_array[i] << pattern_shift) | (link_array[i] << link_shift) | ch_array[i] + result.append(struct.pack('<I', packed)) + return b''.join(result) + + +def generate_pattern(pats): + pat_array = [0] + patmap = {b'': 0} + + raw_pat_array = [] + raw_pat_size = 0 + raw_patmap = {} + + for pat in pats: + if pat is None: + continue + pat_str = pat_to_binary(pat) + if pat_str not in patmap: + shift = 0 + while shift < len(pat) and pat[len(pat) - shift - 1] == 0: + shift += 1 + rawpat = pat_str[:len(pat) - shift] + if rawpat not in raw_patmap: + raw_patmap[rawpat] = raw_pat_size + raw_pat_array.append(rawpat) + raw_pat_size += len(rawpat) + data = (len(rawpat) << 26) | (shift << 20) | raw_patmap[rawpat] + patmap[pat_str] = len(pat_array) + pat_array.append(data) + data = [0, len(pat_array), 16 + 4 * len(pat_array), raw_pat_size] + result = [struct.pack('<4I', *data)] + for x in pat_array: + result.append(struct.pack('<I', x)) + result.extend(raw_pat_array) + return patmap, b''.join(result) + + +def generate_hyb_file(hyph, ch_map, hyb_fn): + bfs = hyph.bfs(ch_map) + dedup_ix, dedup_nodes = hyph.dedup() + n_trie = hyph.pack(dedup_nodes, ch_map) + alphabet = generate_alphabet(ch_map) + patmap, pattern = generate_pattern([n.res for n in hyph.node_list]) + trie = generate_trie(hyph, ch_map, n_trie, dedup_ix, dedup_nodes, patmap) + header = generate_header(alphabet, trie, pattern) + + with open(hyb_fn, 'wb') as f: + f.write(header) + f.write(alphabet) + f.write(trie) + f.write(pattern) + + +# Verify that the file contains the same lines as the lines argument, in arbitrary order +def verify_file_sorted(lines, fn): + file_lines = [l.strip() for l in io.open(fn)] + line_set = set(lines) + file_set = set(file_lines) + if line_set == file_set: + return True + for line in line_set - file_set: + print(repr(line) + ' in reconstruction, not in file') + for line in file_set - line_set: + print(repr(line) + ' in file, not in reconstruction') + return False + + +def map_to_chr(alphabet_map): + result = [] + ch_map = {} + for val in alphabet_map.values(): + chs = [ch for ch in alphabet_map if alphabet_map[ch] == val] + # non-cased characters (like Ethopic) are in both, matching chr file + lowercase = [ch for ch in chs if not ch.isupper()] + uppercase = [ch for ch in chs if not ch.islower()] + # print(val, `lowercase`, `uppercase`) + assert len(lowercase) == 1, 'expected 1 lowercase character' + assert 0 <= len(uppercase) <= 1, 'expected 0 or 1 uppercase character' + ch_map[val] = lowercase[0] + result.append(''.join(lowercase + uppercase)) + ch_map[0] = '.' + return (ch_map, result) + + +def get_pattern(pattern_data, ix): + pattern_offset = struct.unpack('<I', pattern_data[8:12])[0] + entry = struct.unpack('<I', pattern_data[16 + ix * 4: 16 + ix * 4 + 4])[0] + pat_len = entry >> 26 + pat_shift = (entry >> 20) & 0x1f + offset = pattern_offset + (entry & 0xfffff) + return pattern_data[offset: offset + pat_len] + b'\0' * pat_shift + + +def traverse_trie(ix, s, trie_data, ch_map, pattern_data, patterns, exceptions): + (char_mask, link_shift, link_mask, pattern_shift) = struct.unpack('<4I', trie_data[4:20]) + node_entry = struct.unpack('<I', trie_data[24 + ix * 4: 24 + ix * 4 + 4])[0] + pattern = node_entry >> pattern_shift + if pattern: + result = [] + is_exception = False + pat = get_pattern(pattern_data, pattern) + for i in range(len(s) + 1): + pat_off = i - 1 + len(pat) - len(s) + if pat_off < 0: + code = 0 + else: + code = struct.unpack('B', pat[pat_off : pat_off + 1])[0] + if 1 <= code <= 9: + result.append('%d' % code) + elif code == 10: + is_exception = True + elif code == 11: + result.append('-') + is_exception = True + else: + assert code == 0, 'unexpected code' + if i < len(s): + result.append(s[i]) + pat_str = ''.join(result) + #print(`pat_str`, `pat`) + if is_exception: + assert pat_str[0] == '.', "expected leading '.'" + assert pat_str[-1] == '.', "expected trailing '.'" + exceptions.append(pat_str[1:-1]) # strip leading and trailing '.' + else: + patterns.append(pat_str) + for ch in ch_map: + edge_entry = struct.unpack('<I', trie_data[24 + (ix + ch) * 4: 24 + (ix + ch) * 4 + 4])[0] + link = (edge_entry & link_mask) >> link_shift + if link != 0 and ch == (edge_entry & char_mask): + sch = s + ch_map[ch] + traverse_trie(link, sch, trie_data, ch_map, pattern_data, patterns, exceptions) + + +# Verify the generated binary file by reconstructing the textual representations +# from the binary hyb file, then checking that they're identical (mod the order of +# lines within the file, which is irrelevant). This function makes assumptions that +# are stronger than absolutely necessary (in particular, that the patterns are in +# lowercase as defined by python islower). +def verify_hyb_file(hyb_fn, pat_fn, chr_fn, hyp_fn): + with open(hyb_fn, 'rb') as f: + hyb_data = f.read() + header = hyb_data[0: 6 * 4] + (magic, version, alphabet_off, trie_off, pattern_off, file_size) = struct.unpack('<6I', header) + alphabet_data = hyb_data[alphabet_off:trie_off] + trie_data = hyb_data[trie_off:pattern_off] + pattern_data = hyb_data[pattern_off:file_size] + + # reconstruct alphabet table + alphabet_version = struct.unpack('<I', alphabet_data[:4])[0] + alphabet_map = {} + if alphabet_version == 0: + (min_ch, max_ch) = struct.unpack('<2I', alphabet_data[4:12]) + for ch in range(min_ch, max_ch): + offset = 12 + ch - min_ch + b = struct.unpack('B', alphabet_data[offset : offset + 1])[0] + if b != 0: + alphabet_map[unichr(ch)] = b + else: + assert alphabet_version == 1 + n_entries = struct.unpack('<I', alphabet_data[4:8])[0] + for i in range(n_entries): + entry = struct.unpack('<I', alphabet_data[8 + 4 * i: 8 + 4 * i + 4])[0] + alphabet_map[unichr(entry >> 11)] = entry & 0x7ff + + ch_map, reconstructed_chr = map_to_chr(alphabet_map) + + # EXCEPTION for Armenian (hy), we don't really deal with the uppercase form of U+0587 + if u'\u0587' in reconstructed_chr: + reconstructed_chr.remove(u'\u0587') + reconstructed_chr.append(u'\u0587\u0535\u0552') + + assert verify_file_sorted(reconstructed_chr, chr_fn), 'alphabet table not verified' + + # reconstruct trie + patterns = [] + exceptions = [] + traverse_trie(0, '', trie_data, ch_map, pattern_data, patterns, exceptions) + assert verify_file_sorted(patterns, pat_fn), 'pattern table not verified' + assert verify_file_sorted(exceptions, hyp_fn), 'exception table not verified' + + +def main(): + global VERBOSE + try: + opts, args = getopt.getopt(sys.argv[1:], 'v') + except getopt.GetoptError as err: + print(str(err)) + sys.exit(1) + for o, _ in opts: + if o == '-v': + VERBOSE = True + pat_fn, out_fn = args + hyph = load(pat_fn) + if pat_fn.endswith('.pat.txt'): + chr_fn = pat_fn[:-8] + '.chr.txt' + ch_map = load_chr(chr_fn) + hyp_fn = pat_fn[:-8] + '.hyp.txt' + load_hyp(hyph, hyp_fn) + generate_hyb_file(hyph, ch_map, out_fn) + verify_hyb_file(out_fn, pat_fn, chr_fn, hyp_fn) + +if __name__ == '__main__': + main() |