From 6e2cccdc518f8d3424c84ae6fbe0e87ae3c3f66a Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Thu, 27 Aug 2015 13:50:00 -0700 Subject: Binary format for hyphenation patterns In the current state, hyphenation in all languages other than Sanskrit seems to work (case-folding edge cases). Thus, we just disable Sanskrit. Packed tries are implemented, but not the finite state machine (space/speed tradeoff). This commit contains a throw-away test app, which runs on the host. I think I want to replace it with unit tests, but I'm including it in the CL because it's useful during development. Bug: 21562869 Bug: 21826930 Bug: 23317038 Bug: 23317904 Bug: 24570591 Change-Id: I7479a565a4a062fa319651c2c14c0fa18c5ceaea (cherry picked from commit f0be43de02a1e07308d3d95408349c3c7f973430) --- app/Android.mk | 36 +++ app/HyphTool.cpp | 62 +++++ doc/hyb_file_format.md | 135 +++++++++++ include/minikin/Hyphenator.h | 39 ++- libs/minikin/Android.mk | 25 ++ libs/minikin/Hyphenator.cpp | 272 +++++++++++++-------- tools/mk_hyb_file.py | 567 +++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 1027 insertions(+), 109 deletions(-) create mode 100644 app/Android.mk create mode 100644 app/HyphTool.cpp create mode 100644 doc/hyb_file_format.md create mode 100755 tools/mk_hyb_file.py diff --git a/app/Android.mk b/app/Android.mk new file mode 100644 index 0000000..2038683 --- /dev/null +++ b/app/Android.mk @@ -0,0 +1,36 @@ +# Copyright (C) 2015 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# see how_to_run.txt for instructions on running these tests + +LOCAL_PATH := $(call my-dir) + +include $(CLEAR_VARS) + +LOCAL_MODULE := hyphtool +LOCAL_MODULE_TAGS := optional + +LOCAL_STATIC_LIBRARIES := libminikin_host + +# Shared libraries which are dependencies of minikin; these are not automatically +# pulled in by the build system (and thus sadly must be repeated). + +LOCAL_SHARED_LIBRARIES := \ + liblog \ + libicuuc-host + +LOCAL_SRC_FILES += \ + HyphTool.cpp + +include $(BUILD_HOST_EXECUTABLE) diff --git a/app/HyphTool.cpp b/app/HyphTool.cpp new file mode 100644 index 0000000..730abad --- /dev/null +++ b/app/HyphTool.cpp @@ -0,0 +1,62 @@ +#include +#include +#include + +#include "utils/Log.h" + +#include +#include + +using android::Hyphenator; + +Hyphenator* loadHybFile(const char* fn) { + struct stat statbuf; + int status = stat(fn, &statbuf); + if (status < 0) { + fprintf(stderr, "error opening %s\n", fn); + return nullptr; + } + size_t size = statbuf.st_size; + FILE* f = fopen(fn, "rb"); + if (f == NULL) { + fprintf(stderr, "error opening %s\n", fn); + return nullptr; + } + uint8_t* buf = new uint8_t[size]; + size_t read_size = fread(buf, 1, size, f); + if (read_size < size) { + fprintf(stderr, "error reading %s\n", fn); + delete[] buf; + return nullptr; + } + return Hyphenator::loadBinary(buf); +} + +int main(int argc, char** argv) { + Hyphenator* hyph = loadHybFile("/tmp/en.hyb"); // should also be configurable + std::vector result; + std::vector word; + if (argc < 2) { + fprintf(stderr, "usage: hyphtool word\n"); + return 1; + } + char* asciiword = argv[1]; + size_t len = strlen(asciiword); + for (size_t i = 0; i < len; i++) { + uint32_t c = asciiword[i]; + if (c == '-') { + c = 0x00AD; + } + // ASCII (or possibly ISO Latin 1), but kinda painful to do utf conversion :( + word.push_back(c); + } + hyph->hyphenate(&result, word.data(), 
word.size()); + for (size_t i = 0; i < len; i++) { + if (result[i] != 0) { + printf("-"); + } + printf("%c", word[i]); + } + printf("\n"); + return 0; +} diff --git a/doc/hyb_file_format.md b/doc/hyb_file_format.md new file mode 100644 index 0000000..3065a6f --- /dev/null +++ b/doc/hyb_file_format.md @@ -0,0 +1,135 @@ +# Hyb (hyphenation pattern binary) file format + +The hyb file format is how hyphenation patterns are stored in the system image. + +Goals include: + +* Concise (system image space is at a premium) +* Usable when mmap'ed, so it doesn't take significant physical RAM +* Fast to compute +* Simple + +It is _not_ intended as an interchange format, so there is no attempt to make the format +extensible or facilitate backward and forward compatibility. + +Further, at some point we will probably pack patterns for multiple languages into a single +physical file, to reduce number of open mmap'ed files. This document doesn't cover that. + +## Theoretical basis + +At heart, the file contains packed tries with suffix compression, actually quite similar +to the implementation in TeX. + +The file contains three sections. The first section represents the "alphabet," including +case folding. It is effectively a map from Unicode code point to a small integer. + +The second section contains the trie in packed form. It is an array of 3-tuples, packed +into a 32 bit integer. Each (suffix-compressed) trie node has a unique index within this +array, and the pattern field in the tuple is the pattern for that node. Further, each edge +in the trie has an entry in the array, and the character and link fields in the tuple +represent the label and destination of the edge. The packing strategy is as in +[Word Hy-phen-a-tion by Com-put-er](http://www.tug.org/docs/liang/liang-thesis.pdf) by +Franklin Mark Liang. + +The trie representation is similar but not identical to the "double-array trie". 
+The fundamental operation of lookup of the edge from `s` to `t` with label `c` is +to compare `c == character[s + c]`, and if so, `t = link[s + c]`. + +The third section contains the pattern strings. This section is in two parts: first, +an array with a 3-tuple for each pattern (length, number of trailing 0's, and offset +into the string pool); and second, the string pool. Each pattern is encoded as a byte +(packing 2 per byte would be possible but the space savings would not be significant). + +As much as possible of the file is represented as 32 bit integers, as that is especially +efficient to access. All are little-endian (this could be revised if the code ever needs +to be ported to big-endian systems). + +## Header + +``` +uint32_t magic == 0x62ad7968 +uint32_t version = 0 +uint32_t alphabet_offset (in bytes) +uint32_t trie_offset (in bytes) +uint32_t pattern_offset (in bytes) +uint32_t file_size (in bytes) +``` + +Offsets are from the front of the file, and in bytes. + +## Alphabet + +The alphabet table comes in two versions. The first is well suited to dense Unicode +ranges and is limited to 256. The second is more general, but lookups will be slower. + +### Alphabet, direct version + +``` +uint32_t version = 0 +uint32_t min_codepoint +uint32_t max_codepoint (exclusive) +uint8_t[] data +``` + +The size of the data array is max_codepoint - min_codepoint. 0 represents an unmapped +character. Note that, in the current implementation, automatic hyphenation is disabled +for any word containing an unmapped character. + +In general, pad bytes follow this table, aligning the next table to a 4-byte boundary. + +### Alphabet, general version + +``` +uint32_t version = 1 +uint32_t n_entries +uint32_t[n_entries] data +``` + +Each element in the data table is `(codepoint << 11) | value`. Note that this is +restricted to 11 bits (2048 possible values). The largest known current value is 483 +(for Sanskrit). 
+ +The entries are sorted by codepoint, to facilitate binary search. Another reasonable +implementation for consumers of the data would be to build a hash table at load time. + +## Trie + +``` +uint32_t version = 0 +uint32_t char_mask +uint32_t link_shift +uint32_t link_mask +uint32_t pattern_shift +uint32_t n_entries +uint32_t[n_entries] data +``` + +Each element in the data table is `(pattern << pattern_shift) | (link << link_shift) | char`. + +All known pattern tables fit in 32 bits total. If this is exceeded, there is a fairly +straightforward tweak, where each node occupies a slot by itself (as opposed to sharing +it with edge slots), which would require very minimal changes to the implementation (TODO +present in more detail). + +## Pattern + +``` +uint32_t version = 0 +uint32_t n_entries +uint32_t pattern_offset (in bytes) +uint32_t pattern_size (in bytes) +uint32_t[n_entries] data +uint8_t[] pattern_buf +``` + +Each element in data table is `(len << 26) | (shift << 20) | offset`, where an offset of 0 +points to the first byte of pattern_buf. + +Generally pattern_offset is `16 + 4 * n_entries`. + +For example, 'a4m5ato' would be represented as `[4, 5, 0, 0, 0]`, then len = 2, shift = 3, and +offset points to [4, 5] in the pattern buffer. + +Future extension: additional data representing nonstandard hyphenation. See +[Automatic non-standard hyphenation in OpenOffice.org](https://www.tug.org/TUGboat/tb27-1/tb86nemeth.pdf) +for more information about that issue. diff --git a/include/minikin/Hyphenator.h b/include/minikin/Hyphenator.h index 581c657..9605205 100644 --- a/include/minikin/Hyphenator.h +++ b/include/minikin/Hyphenator.h @@ -26,11 +26,8 @@ namespace android { -class Trie { -public: - std::vector result; - std::unordered_map succ; -}; +// hyb file header; implementation details are in the .cpp file +struct Header; class Hyphenator { public: @@ -44,19 +41,43 @@ public: // Example: word is "hyphen", result is [0 0 1 0 0 0], corresponding to "hy-phen". 
void hyphenate(std::vector* result, const uint16_t* word, size_t len); + // pattern data is in binary format, as described in doc/hyb_file_format.md. Note: + // the caller is responsible for ensuring that the lifetime of the pattern data is + // at least as long as the Hyphenator object. + + // Note: nullptr is valid input, in which case the hyphenator only processes soft hyphens + static Hyphenator* loadBinary(const uint8_t* patternData); + private: - void addPattern(const uint16_t* pattern, size_t size); + // apply soft hyphens only, ignoring patterns + void hyphenateSoft(uint8_t* result, const uint16_t* word, size_t len); - void hyphenateSoft(std::vector* result, const uint16_t* word, size_t len); + // try looking up word in alphabet table, return false if any code units fail to map + // Note that this method writes len+2 entries into alpha_codes (including start and stop) + bool alphabetLookup(uint16_t* alpha_codes, const uint16_t* word, size_t len); + + // calculate hyphenation from patterns, assuming alphabet lookup has already been done + void hyphenateFromCodes(uint8_t* result, const uint16_t* codes, size_t len); // TODO: these should become parameters, as they might vary by locale, screen size, and // possibly explicit user control. static const int MIN_PREFIX = 2; static const int MIN_SUFFIX = 3; - Trie root; + // See also LONGEST_HYPHENATED_WORD in LineBreaker.cpp. Here the constant is used so + // that temporary buffers can be stack-allocated without waste, which is a slightly + // different use case. It measures UTF-16 code units. 
+ static const size_t MAX_HYPHENATED_SIZE = 64; + + const uint8_t* patternData; + + // accessors for binary data + const Header* getHeader() const { + return reinterpret_cast(patternData); + } + }; } // namespace android -#endif // MINIKIN_HYPHENATOR_H \ No newline at end of file +#endif // MINIKIN_HYPHENATOR_H diff --git a/libs/minikin/Android.mk b/libs/minikin/Android.mk index 873f279..7558f83 100644 --- a/libs/minikin/Android.mk +++ b/libs/minikin/Android.mk @@ -48,3 +48,28 @@ LOCAL_SHARED_LIBRARIES := \ libutils include $(BUILD_SHARED_LIBRARY) + +include $(CLEAR_VARS) + +LOCAL_MODULE := libminikin +LOCAL_MODULE_TAGS := optional +LOCAL_EXPORT_C_INCLUDE_DIRS := frameworks/minikin/include +LOCAL_SRC_FILES := $(minikin_src_files) +LOCAL_C_INCLUDES := $(minikin_c_includes) +LOCAL_SHARED_LIBRARIES := $(minikin_shared_libraries) + +include $(BUILD_STATIC_LIBRARY) + +include $(CLEAR_VARS) + +# Reduced library (currently just hyphenation) for host + +LOCAL_MODULE := libminikin_host +LOCAL_MODULE_TAGS := optional +LOCAL_EXPORT_C_INCLUDE_DIRS := frameworks/minikin/include +LOCAL_C_INCLUDES := $(minikin_c_includes) +LOCAL_SHARED_LIBRARIES := liblog libicuuc-host + +LOCAL_SRC_FILES := Hyphenator.cpp + +include $(BUILD_HOST_STATIC_LIBRARY) diff --git a/libs/minikin/Hyphenator.cpp b/libs/minikin/Hyphenator.cpp index 3eb151b..c5eb60b 100644 --- a/libs/minikin/Hyphenator.cpp +++ b/libs/minikin/Hyphenator.cpp @@ -34,130 +34,202 @@ namespace android { static const uint16_t CHAR_SOFT_HYPHEN = 0x00AD; -void Hyphenator::addPattern(const uint16_t* pattern, size_t size) { - vector word; - vector result; - - // start by parsing the Liang-format pattern into a word and a result vector, the - // vector right-aligned but without leading zeros. 
Examples: - // a1bc2d -> abcd [1, 0, 2, 0] - // abc1 -> abc [1] - // 1a2b3c4d5 -> abcd [1, 2, 3, 4, 5] - bool lastWasLetter = false; - bool haveSeenNumber = false; - for (size_t i = 0; i < size; i++) { - uint16_t c = pattern[i]; - if (isdigit(c)) { - result.push_back(c - '0'); - lastWasLetter = false; - haveSeenNumber = true; - } else { - word.push_back(c); - if (lastWasLetter && haveSeenNumber) { - result.push_back(0); - } - lastWasLetter = true; - } +// The following are structs that correspond to tables inside the hyb file format + +struct AlphabetTable0 { + uint32_t version; + uint32_t min_codepoint; + uint32_t max_codepoint; + uint8_t data[1]; // actually flexible array, size is known at runtime +}; + +struct AlphabetTable1 { + uint32_t version; + uint32_t n_entries; + uint32_t data[1]; // actually flexible array, size is known at runtime + + static uint32_t codepoint(uint32_t entry) { return entry >> 11; } + static uint32_t value(uint32_t entry) { return entry & 0x7ff; } +}; + +struct Trie { + uint32_t version; + uint32_t char_mask; + uint32_t link_shift; + uint32_t link_mask; + uint32_t pattern_shift; + uint32_t n_entries; + uint32_t data[1]; // actually flexible array, size is known at runtime +}; + +struct Pattern { + uint32_t version; + uint32_t n_entries; + uint32_t pattern_offset; + uint32_t pattern_size; + uint32_t data[1]; // actually flexible array, size is known at runtime + + // accessors + static uint32_t len(uint32_t entry) { return entry >> 26; } + static uint32_t shift(uint32_t entry) { return (entry >> 20) & 0x3f; } + const uint8_t* buf(uint32_t entry) const { + return reinterpret_cast(this) + pattern_offset + (entry & 0xfffff); + } +}; + +struct Header { + uint32_t magic; + uint32_t version; + uint32_t alphabet_offset; + uint32_t trie_offset; + uint32_t pattern_offset; + uint32_t file_size; + + // accessors + const uint8_t* bytes() const { return reinterpret_cast(this); } + uint32_t alphabetVersion() const { + return *reinterpret_cast(bytes() 
+ alphabet_offset); } - if (lastWasLetter) { - result.push_back(0); + const AlphabetTable0* alphabetTable0() const { + return reinterpret_cast(bytes() + alphabet_offset); } - Trie* t = &root; - for (size_t i = 0; i < word.size(); i++) { - t = &t->succ[word[i]]; + const AlphabetTable1* alphabetTable1() const { + return reinterpret_cast(bytes() + alphabet_offset); + } + const Trie* trieTable() const { + return reinterpret_cast(bytes() + trie_offset); + } + const Pattern* patternTable() const { + return reinterpret_cast(bytes() + pattern_offset); + } +}; + +Hyphenator* Hyphenator::loadBinary(const uint8_t* patternData) { + Hyphenator* result = new Hyphenator; + result->patternData = patternData; + return result; +} + +void Hyphenator::hyphenate(vector* result, const uint16_t* word, size_t len) { + result->clear(); + result->resize(len); + const size_t paddedLen = len + 2; // start and stop code each count for 1 + if (patternData != nullptr && + (int)len >= MIN_PREFIX + MIN_SUFFIX && paddedLen <= MAX_HYPHENATED_SIZE) { + uint16_t alpha_codes[MAX_HYPHENATED_SIZE]; + if (alphabetLookup(alpha_codes, word, len)) { + hyphenateFromCodes(result->data(), alpha_codes, paddedLen); + return; + } + // TODO: try NFC normalization + // TODO: handle non-BMP Unicode (requires remapping of offsets) } - t->result = result; + hyphenateSoft(result->data(), word, len); } // If any soft hyphen is present in the word, use soft hyphens to decide hyphenation, // as recommended in UAX #14 (Use of Soft Hyphen) -void Hyphenator::hyphenateSoft(vector* result, const uint16_t* word, size_t len) { - (*result)[0] = 0; +void Hyphenator::hyphenateSoft(uint8_t* result, const uint16_t* word, size_t len) { + result[0] = 0; for (size_t i = 1; i < len; i++) { - (*result)[i] = word[i - 1] == CHAR_SOFT_HYPHEN; - } + result[i] = word[i - 1] == CHAR_SOFT_HYPHEN; + } } -void Hyphenator::hyphenate(vector* result, const uint16_t* word, size_t len) { - result->clear(); - result->resize(len); - if (len < MIN_PREFIX + 
MIN_SUFFIX) return; - size_t maxOffset = len - MIN_SUFFIX + 1; - for (size_t i = 0; i < len + 1; i++) { - const Trie* node = &root; - for (size_t j = i; j < len + 2; j++) { - uint16_t c; - if (j == 0 || j == len + 1) { - c = '.'; // word boundary character in pattern data files - } else { - c = word[j - 1]; - if (c == CHAR_SOFT_HYPHEN) { - hyphenateSoft(result, word, len); - return; - } - // TODO: This uses ICU's simple character to character lowercasing, which ignores - // the locale, and ignores cases when lowercasing a character results in more than - // one character. It should be fixed to consider the locale (in order for it to work - // correctly for Turkish and Azerbaijani), as well as support one-to-many, and - // many-to-many case conversions (including non-BMP cases). - if (c < 0x00C0) { // U+00C0 is the lowest uppercase non-ASCII character - // Convert uppercase ASCII to lowercase ASCII, but keep other characters as-is - if (0x0041 <= c && c <= 0x005A) { - c += 0x0020; - } - } else { - c = u_tolower(c); - } +bool Hyphenator::alphabetLookup(uint16_t* alpha_codes, const uint16_t* word, size_t len) { + const Header* header = getHeader(); + // TODO: check header magic + uint32_t alphabetVersion = header->alphabetVersion(); + if (alphabetVersion == 0) { + const AlphabetTable0* alphabet = header->alphabetTable0(); + uint32_t min_codepoint = alphabet->min_codepoint; + uint32_t max_codepoint = alphabet->max_codepoint; + alpha_codes[0] = 0; // word start + for (size_t i = 0; i < len; i++) { + uint16_t c = word[i]; + if (c < min_codepoint || c >= max_codepoint) { + return false; + } + uint8_t code = alphabet->data[c - min_codepoint]; + if (code == 0) { + return false; + } + alpha_codes[i + 1] = code; + } + alpha_codes[len + 1] = 0; // word termination + return true; + } else if (alphabetVersion == 1) { + const AlphabetTable1* alphabet = header->alphabetTable1(); + size_t n_entries = alphabet->n_entries; + const uint32_t* begin = alphabet->data; + const uint32_t* 
end = begin + n_entries; + alpha_codes[0] = 0; + for (size_t i = 0; i < len; i++) { + uint16_t c = word[i]; + auto p = std::lower_bound(begin, end, c << 11); + if (p == end) { + return false; } - auto search = node->succ.find(c); - if (search != node->succ.end()) { - node = &search->second; + uint32_t entry = *p; + if (AlphabetTable1::codepoint(entry) != c) { + return false; + } + alpha_codes[i + 1] = AlphabetTable1::value(entry); + } + alpha_codes[len + 1] = 0; + return true; + } + return false; +} + +/** + * Internal implementation, after conversion to codes. All case folding and normalization + * has been done by now, and all characters have been found in the alphabet. + * Note: len here is the padded length including 0 codes at start and end. + **/ +void Hyphenator::hyphenateFromCodes(uint8_t* result, const uint16_t* codes, size_t len) { + const Header* header = getHeader(); + const Trie* trie = header->trieTable(); + const Pattern* pattern = header->patternTable(); + uint32_t char_mask = trie->char_mask; + uint32_t link_shift = trie->link_shift; + uint32_t link_mask = trie->link_mask; + uint32_t pattern_shift = trie->pattern_shift; + size_t maxOffset = len - MIN_SUFFIX - 1; + for (size_t i = 0; i < len - 1; i++) { + uint32_t node = 0; // index into Trie table + for (size_t j = i; j < len; j++) { + uint16_t c = codes[j]; + uint32_t entry = trie->data[node + c]; + if ((entry & char_mask) == c) { + node = (entry & link_mask) >> link_shift; } else { break; } - if (!node->result.empty()) { - int resultLen = node->result.size(); - int offset = j + 1 - resultLen; + uint32_t pat_ix = trie->data[node] >> pattern_shift; + // pat_ix contains a 3-tuple of length, shift (number of trailing zeros), and an offset + // into the buf pool. This is the pattern for the substring (i..j) we just matched, + // which we combine (via point-wise max) into the result vector. 
+ if (pat_ix != 0) { + uint32_t pat_entry = pattern->data[pat_ix]; + int pat_len = Pattern::len(pat_entry); + int pat_shift = Pattern::shift(pat_entry); + const uint8_t* pat_buf = pattern->buf(pat_entry); + int offset = j + 1 - (pat_len + pat_shift); + // offset is the index within result that lines up with the start of pat_buf int start = std::max(MIN_PREFIX - offset, 0); - int end = std::min(resultLen, (int)maxOffset - offset); - // TODO performance: this inner loop can profitably be optimized + int end = std::min(pat_len, (int)maxOffset - offset); for (int k = start; k < end; k++) { - (*result)[offset + k] = std::max((*result)[offset + k], node->result[k]); - } -#if 0 - // debug printing of matched patterns - std::string dbg; - for (size_t k = i; k <= j + 1; k++) { - int off = k - j - 2 + resultLen; - if (off >= 0 && node->result[off] != 0) { - dbg.push_back((char)('0' + node->result[off])); - } - if (k < j + 1) { - uint16_t c = (k == 0 || k == len + 1) ? '.' : word[k - 1]; - dbg.push_back((char)c); - } + result[offset + k] = std::max(result[offset + k], pat_buf[k]); } - ALOGD("%d:%d %s", i, j, dbg.c_str()); -#endif } } } // Since the above calculation does not modify values outside // [MIN_PREFIX, len - MIN_SUFFIX], they are left as 0. 
for (size_t i = MIN_PREFIX; i < maxOffset; i++) { - (*result)[i] &= 1; + result[i] &= 1; } } -Hyphenator* Hyphenator::load(const uint16_t *patternData, size_t size) { - Hyphenator* result = new Hyphenator; - for (size_t i = 0; i < size; i++) { - size_t end = i; - while (patternData[end] != '\n') end++; - result->addPattern(patternData + i, end - i); - i = end; - } - return result; -} - } // namespace android diff --git a/tools/mk_hyb_file.py b/tools/mk_hyb_file.py new file mode 100755 index 0000000..c078454 --- /dev/null +++ b/tools/mk_hyb_file.py @@ -0,0 +1,567 @@ +#!/usr/bin/env python + +# Copyright (C) 2015 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Convert hyphen files in standard TeX format (a trio of pat, chr, and hyp) +into binary format. See doc/hyb_file_format.md for more information. + +Usage: mk_hyb_file.py [-v] hyph-foo.pat.txt hyph-foo.hyb + +Optional -v parameter turns on verbose debugging. 
+ +""" + +from __future__ import print_function + +import io +import sys +import struct +import math +import getopt + + +VERBOSE = False + + +if sys.version_info[0] >= 3: + def unichr(x): + return chr(x) + + +# number of bits required to represent numbers up to n inclusive +def num_bits(n): + return 1 + int(math.log(n, 2)) if n > 0 else 0 + + +class Node: + + def __init__(self): + self.succ = {} + self.res = None + self.fsm_pat = None + self.fail = None + + +# List of free slots, implemented as doubly linked list +class Freelist: + + def __init__(self): + self.first = None + self.last = None + self.pred = [] + self.succ = [] + + def grow(self): + this = len(self.pred) + self.pred.append(self.last) + self.succ.append(None) + if self.last is None: + self.first = this + else: + self.succ[self.last] = this + self.last = this + + def next(self, cursor): + if cursor == 0: + cursor = self.first + if cursor is None: + self.grow() + result = self.last + else: + result = cursor + return result, self.succ[result] + + def is_free(self, ix): + while ix >= len(self.pred): + self.grow() + return self.pred[ix] != -1 + + def use(self, ix): + if self.pred[ix] is None: + self.first = self.succ[ix] + else: + self.succ[self.pred[ix]] = self.succ[ix] + if self.succ[ix] is None: + self.last = self.pred[ix] + else: + self.pred[self.succ[ix]] = self.pred[ix] + if self.pred[ix] == -1: + assert self.pred[ix] != -1, 'double free!' 
+ self.pred[ix] = -1 + + +def combine(a, b): + if a is None: return b + if b is None: return a + if len(b) < len(a): a, b = b, a + res = b[:len(b) - len(a)] + for i in range(len(a)): + res.append(max(a[i], b[i + len(b) - len(a)])) + return res + + +def trim(pattern): + for ix in range(len(pattern)): + if pattern[ix] != 0: + return pattern[ix:] + + +def pat_to_binary(pattern): + return b''.join(struct.pack('B', x) for x in pattern) + + +class Hyph: + + def __init__(self): + self.root = Node() + self.root.str = '' + self.node_list = [self.root] + + # Add a pattern (word fragment with numeric codes, such as ".ad4der") + def add_pat(self, pat): + lastWasLetter = False + haveSeenNumber = False + result = [] + word = '' + for c in pat: + if c.isdigit(): + result.append(int(c)) + lastWasLetter = False + haveSeenNumber = True + else: + word += c + if lastWasLetter and haveSeenNumber: + result.append(0) + lastWasLetter = True + if lastWasLetter: + result.append(0) + + self.add_word_res(word, result) + + # Add an exception (word with hyphens, such as "ta-ble") + def add_exception(self, hyph_word): + res = [] + word = ['.'] + need_10 = False + for c in hyph_word: + if c == '-': + res.append(11) + need_10 = False + else: + if need_10: + res.append(10) + word.append(c) + need_10 = True + word.append('.') + res.append(0) + res.append(0) + if VERBOSE: + print(word, res) + self.add_word_res(''.join(word), res) + + def add_word_res(self, word, result): + if VERBOSE: + print(word, result) + + t = self.root + s = '' + for c in word: + s += c + if c not in t.succ: + new_node = Node() + new_node.str = s + self.node_list.append(new_node) + t.succ[c] = new_node + t = t.succ[c] + t.res = result + + def pack(self, node_list, ch_map, use_node=False): + size = 0 + self.node_map = {} + nodes = Freelist() + edges = Freelist() + edge_start = 1 if use_node else 0 + for node in node_list: + succ = sorted([ch_map[c] + edge_start for c in node.succ.keys()]) + if len(succ): + cursor = 0 + while 
True: + edge_ix, cursor = edges.next(cursor) + ix = edge_ix - succ[0] + if (ix >= 0 and nodes.is_free(ix) and + all(edges.is_free(ix + s) for s in succ) and + ((not use_node) or edges.is_free(ix))): + break + elif use_node: + ix, _ = edges.next(0) + nodes.is_free(ix) # actually don't need nodes at all when use_node, + # but keep it happy + else: + ix, _ = nodes.next(0) + node.ix = ix + self.node_map[ix] = node + nodes.use(ix) + size = max(size, ix) + if use_node: + edges.use(ix) + for s in succ: + edges.use(ix + s) + size += max(ch_map.values()) + 1 + return size + + # return list of nodes in bfs order + def bfs(self, ch_map): + result = [self.root] + ix = 0 + while ix < len(result): + node = result[ix] + node.bfs_ix = ix + mapped = {} + for c, next in node.succ.items(): + assert ch_map[c] not in mapped, 'duplicate edge ' + node.str + ' ' + hex(ord(c)) + mapped[ch_map[c]] = next + for i in sorted(mapped.keys()): + result.append(mapped[i]) + ix += 1 + self.bfs_order = result + return result + + # suffix compression - convert the trie into an acyclic digraph, merging nodes when + # the subtries are identical + def dedup(self): + uniques = [] + dupmap = {} + dedup_ix = [0] * len(self.bfs_order) + for ix in reversed(range(len(self.bfs_order))): + # construct string representation of node + node = self.bfs_order[ix] + if node.res is None: + s = '' + else: + s = ''.join(str(c) for c in node.res) + for c in sorted(node.succ.keys()): + succ = node.succ[c] + s += ' ' + c + str(dedup_ix[succ.bfs_ix]) + if s in dupmap: + dedup_ix[ix] = dupmap[s] + else: + uniques.append(node) + dedup_ix[ix] = ix + dupmap[s] = dedup_ix[ix] + uniques.reverse() + print(len(uniques), 'unique nodes,', len(self.bfs_order), 'total') + return dedup_ix, uniques + + +# load the ".pat" file, which contains patterns such as a1b2c3 +def load(fn): + hyph = Hyph() + with io.open(fn, encoding='UTF-8') as f: + for l in f: + pat = l.strip() + hyph.add_pat(pat) + return hyph + + +# load the ".chr" file, which 
contains the alphabet and case pairs, eg "aA", "bB" etc. +def load_chr(fn): + ch_map = {'.': 0} + with io.open(fn, encoding='UTF-8') as f: + for i, l in enumerate(f): + l = l.strip() + if len(l) > 2: + # lowercase maps to multi-character uppercase sequence, ignore uppercase for now + l = l[:1] + else: + assert len(l) == 2, 'expected 2 chars in chr' + for c in l: + ch_map[c] = i + 1 + return ch_map + + +# load exceptions with explicit hyphens +def load_hyp(hyph, fn): + with io.open(fn, encoding='UTF-8') as f: + for l in f: + hyph.add_exception(l.strip()) + + +def generate_header(alphabet, trie, pattern): + alphabet_off = 6 * 4 + trie_off = alphabet_off + len(alphabet) + pattern_off = trie_off + len(trie) + file_size = pattern_off + len(pattern) + data = [0x62ad7968, 0, alphabet_off, trie_off, pattern_off, file_size] + return struct.pack('<6I', *data) + + +def generate_alphabet(ch_map): + ch_map = ch_map.copy() + del ch_map['.'] + min_ch = ord(min(ch_map)) + max_ch = ord(max(ch_map)) + if max_ch - min_ch < 1024 and max(ch_map.values()) < 256: + # generate format 0 + data = [0] * (max_ch - min_ch + 1) + for c, val in ch_map.items(): + data[ord(c) - min_ch] = val + result = [struct.pack('<3I', 0, min_ch, max_ch + 1)] + for b in data: + result.append(struct.pack('> 26 + pat_shift = (entry >> 20) & 0x1f + offset = pattern_offset + (entry & 0xfffff) + return pattern_data[offset: offset + pat_len] + b'\0' * pat_shift + + +def traverse_trie(ix, s, trie_data, ch_map, pattern_data, patterns, exceptions): + (char_mask, link_shift, link_mask, pattern_shift) = struct.unpack('<4I', trie_data[4:20]) + node_entry = struct.unpack('> pattern_shift + if pattern: + result = [] + is_exception = False + pat = get_pattern(pattern_data, pattern) + for i in range(len(s) + 1): + pat_off = i - 1 + len(pat) - len(s) + if pat_off < 0: + code = 0 + else: + code = struct.unpack('B', pat[pat_off : pat_off + 1])[0] + if 1 <= code <= 9: + result.append('%d' % code) + elif code == 10: + is_exception 
= True + elif code == 11: + result.append('-') + is_exception = True + else: + assert code == 0, 'unexpected code' + if i < len(s): + result.append(s[i]) + pat_str = ''.join(result) + #print(`pat_str`, `pat`) + if is_exception: + assert pat_str[0] == '.', "expected leading '.'" + assert pat_str[-1] == '.', "expected trailing '.'" + exceptions.append(pat_str[1:-1]) # strip leading and trailing '.' + else: + patterns.append(pat_str) + for ch in ch_map: + edge_entry = struct.unpack('> link_shift + if link != 0 and ch == (edge_entry & char_mask): + sch = s + ch_map[ch] + traverse_trie(link, sch, trie_data, ch_map, pattern_data, patterns, exceptions) + + +# Verify the generated binary file by reconstructing the textual representations +# from the binary hyb file, then checking that they're identical (mod the order of +# lines within the file, which is irrelevant). This function makes assumptions that +# are stronger than absolutely necessary (in particular, that the patterns are in +# lowercase as defined by python islower). 
+def verify_hyb_file(hyb_fn, pat_fn, chr_fn, hyp_fn): + with open(hyb_fn, 'rb') as f: + hyb_data = f.read() + header = hyb_data[0: 6 * 4] + (magic, version, alphabet_off, trie_off, pattern_off, file_size) = struct.unpack('<6I', header) + alphabet_data = hyb_data[alphabet_off:trie_off] + trie_data = hyb_data[trie_off:pattern_off] + pattern_data = hyb_data[pattern_off:file_size] + + # reconstruct alphabet table + alphabet_version = struct.unpack('> 11)] = entry & 0x7ff + + ch_map, reconstructed_chr = map_to_chr(alphabet_map) + + # EXCEPTION for Armenian (hy), we don't really deal with the uppercase form of U+0587 + if u'\u0587' in reconstructed_chr: + reconstructed_chr.remove(u'\u0587') + reconstructed_chr.append(u'\u0587\u0535\u0552') + + assert verify_file_sorted(reconstructed_chr, chr_fn), 'alphabet table not verified' + + # reconstruct trie + patterns = [] + exceptions = [] + traverse_trie(0, '', trie_data, ch_map, pattern_data, patterns, exceptions) + assert verify_file_sorted(patterns, pat_fn), 'pattern table not verified' + assert verify_file_sorted(exceptions, hyp_fn), 'exception table not verified' + + +def main(): + global VERBOSE + try: + opts, args = getopt.getopt(sys.argv[1:], 'v') + except getopt.GetoptError as err: + print(str(err)) + sys.exit(1) + for o, _ in opts: + if o == '-v': + VERBOSE = True + pat_fn, out_fn = args + hyph = load(pat_fn) + if pat_fn.endswith('.pat.txt'): + chr_fn = pat_fn[:-8] + '.chr.txt' + ch_map = load_chr(chr_fn) + hyp_fn = pat_fn[:-8] + '.hyp.txt' + load_hyp(hyph, hyp_fn) + generate_hyb_file(hyph, ch_map, out_fn) + verify_hyb_file(out_fn, pat_fn, chr_fn, hyp_fn) + +if __name__ == '__main__': + main() -- cgit v1.2.3