From 6e2cccdc518f8d3424c84ae6fbe0e87ae3c3f66a Mon Sep 17 00:00:00 2001
From: Raph Levien
Date: Thu, 27 Aug 2015 13:50:00 -0700
Subject: Binary format for hyphenation patterns

In the current state, hyphenation in all languages other than Sanskrit
seems to work (Sanskrit runs into case-folding edge cases). Thus, we just
disable Sanskrit.

Packed tries are implemented, but not the finite state machine
(space/speed tradeoff).

This commit contains a throw-away test app, which runs on the host. I
think I want to replace it with unit tests, but I'm including it in the
CL because it's useful during development.

Bug: 21562869
Bug: 21826930
Bug: 23317038
Bug: 23317904
Bug: 24570591

Change-Id: I7479a565a4a062fa319651c2c14c0fa18c5ceaea
(cherry picked from commit f0be43de02a1e07308d3d95408349c3c7f973430)
---
 app/Android.mk               |  36 +++
 app/HyphTool.cpp             |  62 +++++
 doc/hyb_file_format.md       | 135 +++++++++++
 include/minikin/Hyphenator.h |  39 ++-
 libs/minikin/Android.mk      |  25 ++
 libs/minikin/Hyphenator.cpp  | 272 +++++++++++++--------
 tools/mk_hyb_file.py         | 567 +++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 1027 insertions(+), 109 deletions(-)
 create mode 100644 app/Android.mk
 create mode 100644 app/HyphTool.cpp
 create mode 100644 doc/hyb_file_format.md
 create mode 100755 tools/mk_hyb_file.py

diff --git a/app/Android.mk b/app/Android.mk
new file mode 100644
index 0000000..2038683
--- /dev/null
+++ b/app/Android.mk
@@ -0,0 +1,36 @@
+# Copyright (C) 2015 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# see how_to_run.txt for instructions on running these tests
+
+LOCAL_PATH := $(call my-dir)
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := hyphtool
+LOCAL_MODULE_TAGS := optional
+
+LOCAL_STATIC_LIBRARIES := libminikin_host
+
+# Shared libraries which are dependencies of minikin; these are not automatically
+# pulled in by the build system (and thus sadly must be repeated).
+
+LOCAL_SHARED_LIBRARIES := \
+    liblog \
+    libicuuc-host
+
+LOCAL_SRC_FILES += \
+    HyphTool.cpp
+
+include $(BUILD_HOST_EXECUTABLE)

diff --git a/app/HyphTool.cpp b/app/HyphTool.cpp
new file mode 100644
index 0000000..730abad
--- /dev/null
+++ b/app/HyphTool.cpp
@@ -0,0 +1,62 @@
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+
+#include "utils/Log.h"
+
+#include <vector>
+#include <minikin/Hyphenator.h>
+
+using android::Hyphenator;
+
+Hyphenator* loadHybFile(const char* fn) {
+    struct stat statbuf;
+    int status = stat(fn, &statbuf);
+    if (status < 0) {
+        fprintf(stderr, "error opening %s\n", fn);
+        return nullptr;
+    }
+    size_t size = statbuf.st_size;
+    FILE* f = fopen(fn, "rb");
+    if (f == NULL) {
+        fprintf(stderr, "error opening %s\n", fn);
+        return nullptr;
+    }
+    uint8_t* buf = new uint8_t[size];
+    size_t read_size = fread(buf, 1, size, f);
+    if (read_size < size) {
+        fprintf(stderr, "error reading %s\n", fn);
+        delete[] buf;
+        return nullptr;
+    }
+    return Hyphenator::loadBinary(buf);
+}
+
+int main(int argc, char** argv) {
+    Hyphenator* hyph = loadHybFile("/tmp/en.hyb");  // should also be configurable
+    std::vector<uint8_t> result;
+    std::vector<uint16_t> word;
+    if (argc < 2) {
+        fprintf(stderr, "usage: hyphtool word\n");
+        return 1;
+    }
+    char* asciiword = argv[1];
+    size_t len = strlen(asciiword);
+    for (size_t i = 0; i < len; i++) {
+        uint32_t c = asciiword[i];
+        if (c == '-') {
+            c = 0x00AD;
+        }
+        // ASCII (or possibly ISO Latin 1), but kinda painful to do utf conversion :(
+        word.push_back(c);
+    }
+    hyph->hyphenate(&result, word.data(), word.size());
+    for (size_t i = 0; i < len; i++) {
+        if (result[i] != 0) {
+            printf("-");
+        }
+        printf("%c", word[i]);
+    }
+    printf("\n");
+    return 0;
+}

diff --git a/doc/hyb_file_format.md b/doc/hyb_file_format.md
new file mode 100644
index 0000000..3065a6f
--- /dev/null
+++ b/doc/hyb_file_format.md
@@ -0,0 +1,135 @@
+# Hyb (hyphenation pattern binary) file format
+
+The hyb file format is how hyphenation patterns are stored in the system image.
+
+Goals include:
+
+* Concise (system image space is at a premium)
+* Usable when mmap'ed, so it doesn't take significant physical RAM
+* Fast to compute
+* Simple
+
+It is _not_ intended as an interchange format, so there is no attempt to make the format
+extensible or facilitate backward and forward compatibility.
+
+Further, at some point we will probably pack patterns for multiple languages into a single
+physical file, to reduce the number of open mmap'ed files. This document doesn't cover that.
+
+## Theoretical basis
+
+At heart, the file contains packed tries with suffix compression, actually quite similar
+to the implementation in TeX.
+
+The file contains three sections. The first section represents the "alphabet," including
+case folding. It is effectively a map from Unicode code point to a small integer.
+
+The second section contains the trie in packed form. It is an array of 3-tuples, packed
+into a 32 bit integer. Each (suffix-compressed) trie node has a unique index within this
+array, and the pattern field in the tuple is the pattern for that node. Further, each edge
+in the trie has an entry in the array, and the character and link fields in the tuple
+represent the label and destination of the edge. The packing strategy is as in
+[Word Hy-phen-a-tion by Com-put-er](http://www.tug.org/docs/liang/liang-thesis.pdf) by
+Franklin Mark Liang.
+
+The trie representation is similar but not identical to the "double-array trie".
+The fundamental operation of lookup of the edge from `s` to `t` with label `c` is
+to compare `c == character[s + c]`, and if so, `t = link[s + c]`.
+
+The third section contains the pattern strings. This section is in two parts: first,
+an array with a 3-tuple for each pattern (length, number of trailing 0's, and offset
+into the string pool); and second, the string pool. Each pattern value is encoded as
+a byte (packing 2 per byte would be possible but the space savings would not be significant).
+
+As much of the file as possible is represented as 32 bit integers, as that is especially
+efficient to access. All are little-endian (this could be revised if the code ever needs
+to be ported to big-endian systems).
+
+## Header
+
+```
+uint32_t magic == 0x62ad7968
+uint32_t version = 0
+uint32_t alphabet_offset (in bytes)
+uint32_t trie_offset (in bytes)
+uint32_t pattern_offset (in bytes)
+uint32_t file_size (in bytes)
+```
+
+Offsets are from the front of the file, and in bytes.
+
+## Alphabet
+
+The alphabet table comes in two versions. The first is well suited to dense Unicode
+ranges and is limited to 256 values (each code must fit in a byte). The second is more
+general, but lookups will be slower.
+
+### Alphabet, direct version
+
+```
+uint32_t version = 0
+uint32_t min_codepoint
+uint32_t max_codepoint (exclusive)
+uint8_t[] data
+```
+
+The size of the data array is max_codepoint - min_codepoint. 0 represents an unmapped
+character. Note that, in the current implementation, automatic hyphenation is disabled
+for any word containing an unmapped character.
+
+In general, pad bytes follow this table, aligning the next table to a 4-byte boundary.
+
+### Alphabet, general version
+
+```
+uint32_t version = 1
+uint32_t n_entries
+uint32_t[n_entries] data
+```
+
+Each element in the data table is `(codepoint << 11) | value`. Note that the value is
+restricted to 11 bits (2048 possible values). The largest known current value is 483
+(for Sanskrit).
+
+The entries are sorted by codepoint, to facilitate binary search. Another reasonable
+implementation for consumers of the data would be to build a hash table at load time.
+
+## Trie
+
+```
+uint32_t version = 0
+uint32_t char_mask
+uint32_t link_shift
+uint32_t link_mask
+uint32_t pattern_shift
+uint32_t n_entries
+uint32_t[n_entries] data
+```
+
+Each element in the data table is `(pattern << pattern_shift) | (link << link_shift) | char`.
+
+All known pattern tables fit in 32 bits total. If this is exceeded, there is a fairly
+straightforward tweak, where each node occupies a slot by itself (as opposed to sharing
+it with edge slots), which would require very minimal changes to the implementation
+(TODO: present in more detail).
+
+## Pattern
+
+```
+uint32_t version = 0
+uint32_t n_entries
+uint32_t pattern_offset (in bytes)
+uint32_t pattern_size (in bytes)
+uint32_t[n_entries] data
+uint8_t[] pattern_buf
+```
+
+Each element in the data table is `(len << 26) | (shift << 20) | offset`, where an offset of 0
+points to the first byte of pattern_buf.
+
+Generally pattern_offset is `16 + 4 * n_entries`.
+
+For example, 'a4m5ato' would be represented as `[4, 5, 0, 0, 0]`, so len = 2, shift = 3, and
+offset points to [4, 5] in the pattern buffer.
+
+Future extension: additional data representing nonstandard hyphenation. See
+[Automatic non-standard hyphenation in OpenOffice.org](https://www.tug.org/TUGboat/tb27-1/tb86nemeth.pdf)
+for more information about that issue.
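To make the trie and pattern tables above concrete, here is a minimal C++ sketch of the lookup loop they imply. It mirrors the consumer added in libs/minikin/Hyphenator.cpp later in this patch, including its MIN_PREFIX/MIN_SUFFIX clamping; the struct layouts follow this document, but the function and constant names (`hyphenationFromCodes`, `kMinPrefix`, `kMinSuffix`) and the standalone packaging are illustrative assumptions, not part of the format or of the Minikin API.

```cpp
// Illustrative sketch only -- field layouts follow the tables above; clamping
// mirrors MIN_PREFIX/MIN_SUFFIX in the real consumer (libs/minikin/Hyphenator.cpp).
#include <algorithm>
#include <cstdint>
#include <vector>

namespace {

struct Header {
    uint32_t magic, version, alphabet_offset, trie_offset, pattern_offset, file_size;
};

struct Trie {
    uint32_t version, char_mask, link_shift, link_mask, pattern_shift, n_entries;
    uint32_t data[1];  // flexible array: n_entries packed (pattern, link, char) words
};

struct Pattern {
    uint32_t version, n_entries, pattern_offset, pattern_size;
    uint32_t data[1];  // flexible array: n_entries packed (len, shift, offset) words
};

constexpr int kMinPrefix = 2;  // same defaults as Hyphenator::MIN_PREFIX / MIN_SUFFIX
constexpr int kMinSuffix = 3;

}  // namespace

// codes: alphabet codes for the word, padded with the 0 start/stop code at both ends;
// len is the padded length. On return, result[k] == 1 where a hyphen may be inserted
// before code unit k of the (unpadded) word.
void hyphenationFromCodes(const uint8_t* hyb, const uint16_t* codes, size_t len,
                          std::vector<uint8_t>* result) {
    result->assign(len, 0);
    if (len < static_cast<size_t>(kMinPrefix + kMinSuffix)) return;
    const Header* header = reinterpret_cast<const Header*>(hyb);
    const Trie* trie = reinterpret_cast<const Trie*>(hyb + header->trie_offset);
    const Pattern* pattern = reinterpret_cast<const Pattern*>(hyb + header->pattern_offset);
    const int maxOffset = static_cast<int>(len) - kMinSuffix - 1;
    for (size_t i = 0; i < len - 1; i++) {
        uint32_t node = 0;  // the root node of the packed trie lives at index 0
        for (size_t j = i; j < len; j++) {
            uint32_t c = codes[j];
            uint32_t entry = trie->data[node + c];
            if ((entry & trie->char_mask) != c) break;  // no edge labeled c from this node
            node = (entry & trie->link_mask) >> trie->link_shift;  // follow the edge
            uint32_t pat_ix = trie->data[node] >> trie->pattern_shift;
            if (pat_ix == 0) continue;  // no pattern attached to this node
            // Decode the (len, shift, offset) tuple and combine via point-wise max.
            uint32_t pat_entry = pattern->data[pat_ix];
            int pat_len = pat_entry >> 26;
            int pat_shift = (pat_entry >> 20) & 0x3f;  // trailing zeros, not stored in the pool
            const uint8_t* buf = reinterpret_cast<const uint8_t*>(pattern)
                    + pattern->pattern_offset + (pat_entry & 0xfffff);
            int offset = static_cast<int>(j) + 1 - (pat_len + pat_shift);
            int start = std::max(kMinPrefix - offset, 0);
            int end = std::min(pat_len, maxOffset - offset);
            for (int k = start; k < end; k++) {
                (*result)[offset + k] = std::max((*result)[offset + k], buf[k]);
            }
        }
    }
    // Odd values mark break points; reduce to 0/1 the way the consumer does.
    for (int k = kMinPrefix; k < maxOffset; k++) {
        (*result)[k] &= 1;
    }
}
```

The edge test in the inner loop is exactly the `c == character[s + c]` / `t = link[s + c]` operation described above; because nodes and edges share one packed array, a mismatch on the stored label is what distinguishes a real edge from some other node's data occupying that slot.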
diff --git a/include/minikin/Hyphenator.h b/include/minikin/Hyphenator.h
index 581c657..9605205 100644
--- a/include/minikin/Hyphenator.h
+++ b/include/minikin/Hyphenator.h
@@ -26,11 +26,8 @@ namespace android {
 
-class Trie {
-public:
-    std::vector<uint8_t> result;
-    std::unordered_map<uint16_t, Trie> succ;
-};
+// hyb file header; implementation details are in the .cpp file
+struct Header;
 
 class Hyphenator {
 public:
@@ -44,19 +41,43 @@ public:
     // Example: word is "hyphen", result is [0 0 1 0 0 0], corresponding to "hy-phen".
     void hyphenate(std::vector<uint8_t>* result, const uint16_t* word, size_t len);
 
+    // pattern data is in binary format, as described in doc/hyb_file_format.md. Note:
+    // the caller is responsible for ensuring that the lifetime of the pattern data is
+    // at least as long as the Hyphenator object.
+
+    // Note: nullptr is valid input, in which case the hyphenator only processes soft hyphens
+    static Hyphenator* loadBinary(const uint8_t* patternData);
+
 private:
-    void addPattern(const uint16_t* pattern, size_t size);
+    // apply soft hyphens only, ignoring patterns
+    void hyphenateSoft(uint8_t* result, const uint16_t* word, size_t len);
 
-    void hyphenateSoft(std::vector<uint8_t>* result, const uint16_t* word, size_t len);
+    // try looking up word in alphabet table, return false if any code units fail to map
+    // Note that this method writes len+2 entries into alpha_codes (including start and stop)
+    bool alphabetLookup(uint16_t* alpha_codes, const uint16_t* word, size_t len);
+
+    // calculate hyphenation from patterns, assuming alphabet lookup has already been done
+    void hyphenateFromCodes(uint8_t* result, const uint16_t* codes, size_t len);
 
     // TODO: these should become parameters, as they might vary by locale, screen size, and
     // possibly explicit user control.
     static const int MIN_PREFIX = 2;
     static const int MIN_SUFFIX = 3;
 
-    Trie root;
+    // See also LONGEST_HYPHENATED_WORD in LineBreaker.cpp. Here the constant is used so
+    // that temporary buffers can be stack-allocated without waste, which is a slightly
+    // different use case. It measures UTF-16 code units.
+ static const size_t MAX_HYPHENATED_SIZE = 64; + + const uint8_t* patternData; + + // accessors for binary data + const Header* getHeader() const { + return reinterpret_cast(patternData); + } + }; } // namespace android -#endif // MINIKIN_HYPHENATOR_H \ No newline at end of file +#endif // MINIKIN_HYPHENATOR_H diff --git a/libs/minikin/Android.mk b/libs/minikin/Android.mk index 873f279..7558f83 100644 --- a/libs/minikin/Android.mk +++ b/libs/minikin/Android.mk @@ -48,3 +48,28 @@ LOCAL_SHARED_LIBRARIES := \ libutils include $(BUILD_SHARED_LIBRARY) + +include $(CLEAR_VARS) + +LOCAL_MODULE := libminikin +LOCAL_MODULE_TAGS := optional +LOCAL_EXPORT_C_INCLUDE_DIRS := frameworks/minikin/include +LOCAL_SRC_FILES := $(minikin_src_files) +LOCAL_C_INCLUDES := $(minikin_c_includes) +LOCAL_SHARED_LIBRARIES := $(minikin_shared_libraries) + +include $(BUILD_STATIC_LIBRARY) + +include $(CLEAR_VARS) + +# Reduced library (currently just hyphenation) for host + +LOCAL_MODULE := libminikin_host +LOCAL_MODULE_TAGS := optional +LOCAL_EXPORT_C_INCLUDE_DIRS := frameworks/minikin/include +LOCAL_C_INCLUDES := $(minikin_c_includes) +LOCAL_SHARED_LIBRARIES := liblog libicuuc-host + +LOCAL_SRC_FILES := Hyphenator.cpp + +include $(BUILD_HOST_STATIC_LIBRARY) diff --git a/libs/minikin/Hyphenator.cpp b/libs/minikin/Hyphenator.cpp index 3eb151b..c5eb60b 100644 --- a/libs/minikin/Hyphenator.cpp +++ b/libs/minikin/Hyphenator.cpp @@ -34,130 +34,202 @@ namespace android { static const uint16_t CHAR_SOFT_HYPHEN = 0x00AD; -void Hyphenator::addPattern(const uint16_t* pattern, size_t size) { - vector word; - vector result; - - // start by parsing the Liang-format pattern into a word and a result vector, the - // vector right-aligned but without leading zeros. Examples: - // a1bc2d -> abcd [1, 0, 2, 0] - // abc1 -> abc [1] - // 1a2b3c4d5 -> abcd [1, 2, 3, 4, 5] - bool lastWasLetter = false; - bool haveSeenNumber = false; - for (size_t i = 0; i < size; i++) { - uint16_t c = pattern[i]; - if (isdigit(c)) { - result.push_back(c - '0'); - lastWasLetter = false; - haveSeenNumber = true; - } else { - word.push_back(c); - if (lastWasLetter && haveSeenNumber) { - result.push_back(0); - } - lastWasLetter = true; - } +// The following are structs that correspond to tables inside the hyb file format + +struct AlphabetTable0 { + uint32_t version; + uint32_t min_codepoint; + uint32_t max_codepoint; + uint8_t data[1]; // actually flexible array, size is known at runtime +}; + +struct AlphabetTable1 { + uint32_t version; + uint32_t n_entries; + uint32_t data[1]; // actually flexible array, size is known at runtime + + static uint32_t codepoint(uint32_t entry) { return entry >> 11; } + static uint32_t value(uint32_t entry) { return entry & 0x7ff; } +}; + +struct Trie { + uint32_t version; + uint32_t char_mask; + uint32_t link_shift; + uint32_t link_mask; + uint32_t pattern_shift; + uint32_t n_entries; + uint32_t data[1]; // actually flexible array, size is known at runtime +}; + +struct Pattern { + uint32_t version; + uint32_t n_entries; + uint32_t pattern_offset; + uint32_t pattern_size; + uint32_t data[1]; // actually flexible array, size is known at runtime + + // accessors + static uint32_t len(uint32_t entry) { return entry >> 26; } + static uint32_t shift(uint32_t entry) { return (entry >> 20) & 0x3f; } + const uint8_t* buf(uint32_t entry) const { + return reinterpret_cast(this) + pattern_offset + (entry & 0xfffff); + } +}; + +struct Header { + uint32_t magic; + uint32_t version; + uint32_t alphabet_offset; + uint32_t trie_offset; + 
uint32_t pattern_offset; + uint32_t file_size; + + // accessors + const uint8_t* bytes() const { return reinterpret_cast(this); } + uint32_t alphabetVersion() const { + return *reinterpret_cast(bytes() + alphabet_offset); } - if (lastWasLetter) { - result.push_back(0); + const AlphabetTable0* alphabetTable0() const { + return reinterpret_cast(bytes() + alphabet_offset); } - Trie* t = &root; - for (size_t i = 0; i < word.size(); i++) { - t = &t->succ[word[i]]; + const AlphabetTable1* alphabetTable1() const { + return reinterpret_cast(bytes() + alphabet_offset); + } + const Trie* trieTable() const { + return reinterpret_cast(bytes() + trie_offset); + } + const Pattern* patternTable() const { + return reinterpret_cast(bytes() + pattern_offset); + } +}; + +Hyphenator* Hyphenator::loadBinary(const uint8_t* patternData) { + Hyphenator* result = new Hyphenator; + result->patternData = patternData; + return result; +} + +void Hyphenator::hyphenate(vector* result, const uint16_t* word, size_t len) { + result->clear(); + result->resize(len); + const size_t paddedLen = len + 2; // start and stop code each count for 1 + if (patternData != nullptr && + (int)len >= MIN_PREFIX + MIN_SUFFIX && paddedLen <= MAX_HYPHENATED_SIZE) { + uint16_t alpha_codes[MAX_HYPHENATED_SIZE]; + if (alphabetLookup(alpha_codes, word, len)) { + hyphenateFromCodes(result->data(), alpha_codes, paddedLen); + return; + } + // TODO: try NFC normalization + // TODO: handle non-BMP Unicode (requires remapping of offsets) } - t->result = result; + hyphenateSoft(result->data(), word, len); } // If any soft hyphen is present in the word, use soft hyphens to decide hyphenation, // as recommended in UAX #14 (Use of Soft Hyphen) -void Hyphenator::hyphenateSoft(vector* result, const uint16_t* word, size_t len) { - (*result)[0] = 0; +void Hyphenator::hyphenateSoft(uint8_t* result, const uint16_t* word, size_t len) { + result[0] = 0; for (size_t i = 1; i < len; i++) { - (*result)[i] = word[i - 1] == CHAR_SOFT_HYPHEN; - } + result[i] = word[i - 1] == CHAR_SOFT_HYPHEN; + } } -void Hyphenator::hyphenate(vector* result, const uint16_t* word, size_t len) { - result->clear(); - result->resize(len); - if (len < MIN_PREFIX + MIN_SUFFIX) return; - size_t maxOffset = len - MIN_SUFFIX + 1; - for (size_t i = 0; i < len + 1; i++) { - const Trie* node = &root; - for (size_t j = i; j < len + 2; j++) { - uint16_t c; - if (j == 0 || j == len + 1) { - c = '.'; // word boundary character in pattern data files - } else { - c = word[j - 1]; - if (c == CHAR_SOFT_HYPHEN) { - hyphenateSoft(result, word, len); - return; - } - // TODO: This uses ICU's simple character to character lowercasing, which ignores - // the locale, and ignores cases when lowercasing a character results in more than - // one character. It should be fixed to consider the locale (in order for it to work - // correctly for Turkish and Azerbaijani), as well as support one-to-many, and - // many-to-many case conversions (including non-BMP cases). 
- if (c < 0x00C0) { // U+00C0 is the lowest uppercase non-ASCII character - // Convert uppercase ASCII to lowercase ASCII, but keep other characters as-is - if (0x0041 <= c && c <= 0x005A) { - c += 0x0020; - } - } else { - c = u_tolower(c); - } +bool Hyphenator::alphabetLookup(uint16_t* alpha_codes, const uint16_t* word, size_t len) { + const Header* header = getHeader(); + // TODO: check header magic + uint32_t alphabetVersion = header->alphabetVersion(); + if (alphabetVersion == 0) { + const AlphabetTable0* alphabet = header->alphabetTable0(); + uint32_t min_codepoint = alphabet->min_codepoint; + uint32_t max_codepoint = alphabet->max_codepoint; + alpha_codes[0] = 0; // word start + for (size_t i = 0; i < len; i++) { + uint16_t c = word[i]; + if (c < min_codepoint || c >= max_codepoint) { + return false; + } + uint8_t code = alphabet->data[c - min_codepoint]; + if (code == 0) { + return false; + } + alpha_codes[i + 1] = code; + } + alpha_codes[len + 1] = 0; // word termination + return true; + } else if (alphabetVersion == 1) { + const AlphabetTable1* alphabet = header->alphabetTable1(); + size_t n_entries = alphabet->n_entries; + const uint32_t* begin = alphabet->data; + const uint32_t* end = begin + n_entries; + alpha_codes[0] = 0; + for (size_t i = 0; i < len; i++) { + uint16_t c = word[i]; + auto p = std::lower_bound(begin, end, c << 11); + if (p == end) { + return false; } - auto search = node->succ.find(c); - if (search != node->succ.end()) { - node = &search->second; + uint32_t entry = *p; + if (AlphabetTable1::codepoint(entry) != c) { + return false; + } + alpha_codes[i + 1] = AlphabetTable1::value(entry); + } + alpha_codes[len + 1] = 0; + return true; + } + return false; +} + +/** + * Internal implementation, after conversion to codes. All case folding and normalization + * has been done by now, and all characters have been found in the alphabet. + * Note: len here is the padded length including 0 codes at start and end. + **/ +void Hyphenator::hyphenateFromCodes(uint8_t* result, const uint16_t* codes, size_t len) { + const Header* header = getHeader(); + const Trie* trie = header->trieTable(); + const Pattern* pattern = header->patternTable(); + uint32_t char_mask = trie->char_mask; + uint32_t link_shift = trie->link_shift; + uint32_t link_mask = trie->link_mask; + uint32_t pattern_shift = trie->pattern_shift; + size_t maxOffset = len - MIN_SUFFIX - 1; + for (size_t i = 0; i < len - 1; i++) { + uint32_t node = 0; // index into Trie table + for (size_t j = i; j < len; j++) { + uint16_t c = codes[j]; + uint32_t entry = trie->data[node + c]; + if ((entry & char_mask) == c) { + node = (entry & link_mask) >> link_shift; } else { break; } - if (!node->result.empty()) { - int resultLen = node->result.size(); - int offset = j + 1 - resultLen; + uint32_t pat_ix = trie->data[node] >> pattern_shift; + // pat_ix contains a 3-tuple of length, shift (number of trailing zeros), and an offset + // into the buf pool. This is the pattern for the substring (i..j) we just matched, + // which we combine (via point-wise max) into the result vector. 
+ if (pat_ix != 0) { + uint32_t pat_entry = pattern->data[pat_ix]; + int pat_len = Pattern::len(pat_entry); + int pat_shift = Pattern::shift(pat_entry); + const uint8_t* pat_buf = pattern->buf(pat_entry); + int offset = j + 1 - (pat_len + pat_shift); + // offset is the index within result that lines up with the start of pat_buf int start = std::max(MIN_PREFIX - offset, 0); - int end = std::min(resultLen, (int)maxOffset - offset); - // TODO performance: this inner loop can profitably be optimized + int end = std::min(pat_len, (int)maxOffset - offset); for (int k = start; k < end; k++) { - (*result)[offset + k] = std::max((*result)[offset + k], node->result[k]); - } -#if 0 - // debug printing of matched patterns - std::string dbg; - for (size_t k = i; k <= j + 1; k++) { - int off = k - j - 2 + resultLen; - if (off >= 0 && node->result[off] != 0) { - dbg.push_back((char)('0' + node->result[off])); - } - if (k < j + 1) { - uint16_t c = (k == 0 || k == len + 1) ? '.' : word[k - 1]; - dbg.push_back((char)c); - } + result[offset + k] = std::max(result[offset + k], pat_buf[k]); } - ALOGD("%d:%d %s", i, j, dbg.c_str()); -#endif } } } // Since the above calculation does not modify values outside // [MIN_PREFIX, len - MIN_SUFFIX], they are left as 0. for (size_t i = MIN_PREFIX; i < maxOffset; i++) { - (*result)[i] &= 1; + result[i] &= 1; } } -Hyphenator* Hyphenator::load(const uint16_t *patternData, size_t size) { - Hyphenator* result = new Hyphenator; - for (size_t i = 0; i < size; i++) { - size_t end = i; - while (patternData[end] != '\n') end++; - result->addPattern(patternData + i, end - i); - i = end; - } - return result; -} - } // namespace android diff --git a/tools/mk_hyb_file.py b/tools/mk_hyb_file.py new file mode 100755 index 0000000..c078454 --- /dev/null +++ b/tools/mk_hyb_file.py @@ -0,0 +1,567 @@ +#!/usr/bin/env python + +# Copyright (C) 2015 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Convert hyphen files in standard TeX format (a trio of pat, chr, and hyp) +into binary format. See doc/hyb_file_format.md for more information. + +Usage: mk_hyb_file.py [-v] hyph-foo.pat.txt hyph-foo.hyb + +Optional -v parameter turns on verbose debugging. 
+ +""" + +from __future__ import print_function + +import io +import sys +import struct +import math +import getopt + + +VERBOSE = False + + +if sys.version_info[0] >= 3: + def unichr(x): + return chr(x) + + +# number of bits required to represent numbers up to n inclusive +def num_bits(n): + return 1 + int(math.log(n, 2)) if n > 0 else 0 + + +class Node: + + def __init__(self): + self.succ = {} + self.res = None + self.fsm_pat = None + self.fail = None + + +# List of free slots, implemented as doubly linked list +class Freelist: + + def __init__(self): + self.first = None + self.last = None + self.pred = [] + self.succ = [] + + def grow(self): + this = len(self.pred) + self.pred.append(self.last) + self.succ.append(None) + if self.last is None: + self.first = this + else: + self.succ[self.last] = this + self.last = this + + def next(self, cursor): + if cursor == 0: + cursor = self.first + if cursor is None: + self.grow() + result = self.last + else: + result = cursor + return result, self.succ[result] + + def is_free(self, ix): + while ix >= len(self.pred): + self.grow() + return self.pred[ix] != -1 + + def use(self, ix): + if self.pred[ix] is None: + self.first = self.succ[ix] + else: + self.succ[self.pred[ix]] = self.succ[ix] + if self.succ[ix] is None: + self.last = self.pred[ix] + else: + self.pred[self.succ[ix]] = self.pred[ix] + if self.pred[ix] == -1: + assert self.pred[ix] != -1, 'double free!' + self.pred[ix] = -1 + + +def combine(a, b): + if a is None: return b + if b is None: return a + if len(b) < len(a): a, b = b, a + res = b[:len(b) - len(a)] + for i in range(len(a)): + res.append(max(a[i], b[i + len(b) - len(a)])) + return res + + +def trim(pattern): + for ix in range(len(pattern)): + if pattern[ix] != 0: + return pattern[ix:] + + +def pat_to_binary(pattern): + return b''.join(struct.pack('B', x) for x in pattern) + + +class Hyph: + + def __init__(self): + self.root = Node() + self.root.str = '' + self.node_list = [self.root] + + # Add a pattern (word fragment with numeric codes, such as ".ad4der") + def add_pat(self, pat): + lastWasLetter = False + haveSeenNumber = False + result = [] + word = '' + for c in pat: + if c.isdigit(): + result.append(int(c)) + lastWasLetter = False + haveSeenNumber = True + else: + word += c + if lastWasLetter and haveSeenNumber: + result.append(0) + lastWasLetter = True + if lastWasLetter: + result.append(0) + + self.add_word_res(word, result) + + # Add an exception (word with hyphens, such as "ta-ble") + def add_exception(self, hyph_word): + res = [] + word = ['.'] + need_10 = False + for c in hyph_word: + if c == '-': + res.append(11) + need_10 = False + else: + if need_10: + res.append(10) + word.append(c) + need_10 = True + word.append('.') + res.append(0) + res.append(0) + if VERBOSE: + print(word, res) + self.add_word_res(''.join(word), res) + + def add_word_res(self, word, result): + if VERBOSE: + print(word, result) + + t = self.root + s = '' + for c in word: + s += c + if c not in t.succ: + new_node = Node() + new_node.str = s + self.node_list.append(new_node) + t.succ[c] = new_node + t = t.succ[c] + t.res = result + + def pack(self, node_list, ch_map, use_node=False): + size = 0 + self.node_map = {} + nodes = Freelist() + edges = Freelist() + edge_start = 1 if use_node else 0 + for node in node_list: + succ = sorted([ch_map[c] + edge_start for c in node.succ.keys()]) + if len(succ): + cursor = 0 + while True: + edge_ix, cursor = edges.next(cursor) + ix = edge_ix - succ[0] + if (ix >= 0 and nodes.is_free(ix) and + all(edges.is_free(ix 
+ s) for s in succ) and + ((not use_node) or edges.is_free(ix))): + break + elif use_node: + ix, _ = edges.next(0) + nodes.is_free(ix) # actually don't need nodes at all when use_node, + # but keep it happy + else: + ix, _ = nodes.next(0) + node.ix = ix + self.node_map[ix] = node + nodes.use(ix) + size = max(size, ix) + if use_node: + edges.use(ix) + for s in succ: + edges.use(ix + s) + size += max(ch_map.values()) + 1 + return size + + # return list of nodes in bfs order + def bfs(self, ch_map): + result = [self.root] + ix = 0 + while ix < len(result): + node = result[ix] + node.bfs_ix = ix + mapped = {} + for c, next in node.succ.items(): + assert ch_map[c] not in mapped, 'duplicate edge ' + node.str + ' ' + hex(ord(c)) + mapped[ch_map[c]] = next + for i in sorted(mapped.keys()): + result.append(mapped[i]) + ix += 1 + self.bfs_order = result + return result + + # suffix compression - convert the trie into an acyclic digraph, merging nodes when + # the subtries are identical + def dedup(self): + uniques = [] + dupmap = {} + dedup_ix = [0] * len(self.bfs_order) + for ix in reversed(range(len(self.bfs_order))): + # construct string representation of node + node = self.bfs_order[ix] + if node.res is None: + s = '' + else: + s = ''.join(str(c) for c in node.res) + for c in sorted(node.succ.keys()): + succ = node.succ[c] + s += ' ' + c + str(dedup_ix[succ.bfs_ix]) + if s in dupmap: + dedup_ix[ix] = dupmap[s] + else: + uniques.append(node) + dedup_ix[ix] = ix + dupmap[s] = dedup_ix[ix] + uniques.reverse() + print(len(uniques), 'unique nodes,', len(self.bfs_order), 'total') + return dedup_ix, uniques + + +# load the ".pat" file, which contains patterns such as a1b2c3 +def load(fn): + hyph = Hyph() + with io.open(fn, encoding='UTF-8') as f: + for l in f: + pat = l.strip() + hyph.add_pat(pat) + return hyph + + +# load the ".chr" file, which contains the alphabet and case pairs, eg "aA", "bB" etc. 
+def load_chr(fn): + ch_map = {'.': 0} + with io.open(fn, encoding='UTF-8') as f: + for i, l in enumerate(f): + l = l.strip() + if len(l) > 2: + # lowercase maps to multi-character uppercase sequence, ignore uppercase for now + l = l[:1] + else: + assert len(l) == 2, 'expected 2 chars in chr' + for c in l: + ch_map[c] = i + 1 + return ch_map + + +# load exceptions with explicit hyphens +def load_hyp(hyph, fn): + with io.open(fn, encoding='UTF-8') as f: + for l in f: + hyph.add_exception(l.strip()) + + +def generate_header(alphabet, trie, pattern): + alphabet_off = 6 * 4 + trie_off = alphabet_off + len(alphabet) + pattern_off = trie_off + len(trie) + file_size = pattern_off + len(pattern) + data = [0x62ad7968, 0, alphabet_off, trie_off, pattern_off, file_size] + return struct.pack('<6I', *data) + + +def generate_alphabet(ch_map): + ch_map = ch_map.copy() + del ch_map['.'] + min_ch = ord(min(ch_map)) + max_ch = ord(max(ch_map)) + if max_ch - min_ch < 1024 and max(ch_map.values()) < 256: + # generate format 0 + data = [0] * (max_ch - min_ch + 1) + for c, val in ch_map.items(): + data[ord(c) - min_ch] = val + result = [struct.pack('<3I', 0, min_ch, max_ch + 1)] + for b in data: + result.append(struct.pack('> 26 + pat_shift = (entry >> 20) & 0x1f + offset = pattern_offset + (entry & 0xfffff) + return pattern_data[offset: offset + pat_len] + b'\0' * pat_shift + + +def traverse_trie(ix, s, trie_data, ch_map, pattern_data, patterns, exceptions): + (char_mask, link_shift, link_mask, pattern_shift) = struct.unpack('<4I', trie_data[4:20]) + node_entry = struct.unpack('> pattern_shift + if pattern: + result = [] + is_exception = False + pat = get_pattern(pattern_data, pattern) + for i in range(len(s) + 1): + pat_off = i - 1 + len(pat) - len(s) + if pat_off < 0: + code = 0 + else: + code = struct.unpack('B', pat[pat_off : pat_off + 1])[0] + if 1 <= code <= 9: + result.append('%d' % code) + elif code == 10: + is_exception = True + elif code == 11: + result.append('-') + is_exception = True + else: + assert code == 0, 'unexpected code' + if i < len(s): + result.append(s[i]) + pat_str = ''.join(result) + #print(`pat_str`, `pat`) + if is_exception: + assert pat_str[0] == '.', "expected leading '.'" + assert pat_str[-1] == '.', "expected trailing '.'" + exceptions.append(pat_str[1:-1]) # strip leading and trailing '.' + else: + patterns.append(pat_str) + for ch in ch_map: + edge_entry = struct.unpack('> link_shift + if link != 0 and ch == (edge_entry & char_mask): + sch = s + ch_map[ch] + traverse_trie(link, sch, trie_data, ch_map, pattern_data, patterns, exceptions) + + +# Verify the generated binary file by reconstructing the textual representations +# from the binary hyb file, then checking that they're identical (mod the order of +# lines within the file, which is irrelevant). This function makes assumptions that +# are stronger than absolutely necessary (in particular, that the patterns are in +# lowercase as defined by python islower). 
+def verify_hyb_file(hyb_fn, pat_fn, chr_fn, hyp_fn): + with open(hyb_fn, 'rb') as f: + hyb_data = f.read() + header = hyb_data[0: 6 * 4] + (magic, version, alphabet_off, trie_off, pattern_off, file_size) = struct.unpack('<6I', header) + alphabet_data = hyb_data[alphabet_off:trie_off] + trie_data = hyb_data[trie_off:pattern_off] + pattern_data = hyb_data[pattern_off:file_size] + + # reconstruct alphabet table + alphabet_version = struct.unpack('> 11)] = entry & 0x7ff + + ch_map, reconstructed_chr = map_to_chr(alphabet_map) + + # EXCEPTION for Armenian (hy), we don't really deal with the uppercase form of U+0587 + if u'\u0587' in reconstructed_chr: + reconstructed_chr.remove(u'\u0587') + reconstructed_chr.append(u'\u0587\u0535\u0552') + + assert verify_file_sorted(reconstructed_chr, chr_fn), 'alphabet table not verified' + + # reconstruct trie + patterns = [] + exceptions = [] + traverse_trie(0, '', trie_data, ch_map, pattern_data, patterns, exceptions) + assert verify_file_sorted(patterns, pat_fn), 'pattern table not verified' + assert verify_file_sorted(exceptions, hyp_fn), 'exception table not verified' + + +def main(): + global VERBOSE + try: + opts, args = getopt.getopt(sys.argv[1:], 'v') + except getopt.GetoptError as err: + print(str(err)) + sys.exit(1) + for o, _ in opts: + if o == '-v': + VERBOSE = True + pat_fn, out_fn = args + hyph = load(pat_fn) + if pat_fn.endswith('.pat.txt'): + chr_fn = pat_fn[:-8] + '.chr.txt' + ch_map = load_chr(chr_fn) + hyp_fn = pat_fn[:-8] + '.hyp.txt' + load_hyp(hyph, hyp_fn) + generate_hyb_file(hyph, ch_map, out_fn) + verify_hyb_file(out_fn, pat_fn, chr_fn, hyp_fn) + +if __name__ == '__main__': + main() -- cgit v1.2.3 From e5e7aa0e8031f6f1c0ed370f70e49778f3570527 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Wed, 30 Sep 2015 23:26:54 -0700 Subject: Explicitly set utf-8 encoding for hyb file verification Not all platforms default to UTF-8 encoding, so we set it explicitly. This patch should fix build breakages resulting from failed verification of binary hyb files for hyphenation patterns. Bug: 24570591 Change-Id: I65ac4536d3436586c2633e2b57554fc6ff16d3a8 (cherry picked from commit 138b93f094584212dd6978a1822d078f93574022) --- tools/mk_hyb_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/mk_hyb_file.py b/tools/mk_hyb_file.py index c078454..978c082 100755 --- a/tools/mk_hyb_file.py +++ b/tools/mk_hyb_file.py @@ -416,7 +416,7 @@ def generate_hyb_file(hyph, ch_map, hyb_fn): # Verify that the file contains the same lines as the lines argument, in arbitrary order def verify_file_sorted(lines, fn): - file_lines = [l.strip() for l in io.open(fn)] + file_lines = [l.strip() for l in io.open(fn, encoding='UTF-8')] line_set = set(lines) file_set = set(file_lines) if line_set == file_set: -- cgit v1.2.3 From e8264e065f0edd58a9fa04bbdd777f2af2794789 Mon Sep 17 00:00:00 2001 From: Roozbeh Pournader Date: Wed, 14 Oct 2015 19:34:29 -0700 Subject: Complete half-done cherry-picking of Android.mk. DO NOT MERGE The previous commit, 6e2cccdc518f8d3424c84ae6fbe0e87ae3c3f66a, was incompletely cherry-picked. This adds the missing parts. 
Bug: 24570591 Change-Id: I1097c60587fb8a88cfe6b8ffed5b1689d9bdd429 --- libs/minikin/Android.mk | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/libs/minikin/Android.mk b/libs/minikin/Android.mk index 7558f83..c3e70db 100644 --- a/libs/minikin/Android.mk +++ b/libs/minikin/Android.mk @@ -16,7 +16,7 @@ LOCAL_PATH := $(call my-dir) include $(CLEAR_VARS) -LOCAL_SRC_FILES := \ +minikin_src_files := \ AnalyzeStyle.cpp \ CmapCoverage.cpp \ FontCollection.cpp \ @@ -31,14 +31,12 @@ LOCAL_SRC_FILES := \ MinikinFontFreeType.cpp \ SparseBitSet.cpp -LOCAL_MODULE := libminikin - -LOCAL_C_INCLUDES += \ +minikin_c_includes += \ external/harfbuzz_ng/src \ external/freetype/include \ frameworks/minikin/include -LOCAL_SHARED_LIBRARIES := \ +minikin_shared_libraries := \ libharfbuzz_ng \ libft2 \ liblog \ @@ -47,6 +45,12 @@ LOCAL_SHARED_LIBRARIES := \ libicuuc \ libutils +LOCAL_MODULE := libminikin +LOCAL_EXPORT_C_INCLUDE_DIRS := frameworks/minikin/include +LOCAL_SRC_FILES := $(minikin_src_files) +LOCAL_C_INCLUDES := $(minikin_c_includes) +LOCAL_SHARED_LIBRARIES := $(minikin_shared_libraries) + include $(BUILD_SHARED_LIBRARY) include $(CLEAR_VARS) -- cgit v1.2.3 From c5af5f6f47d346e78c5920f4a6851b38929a71e5 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Thu, 29 Oct 2015 11:39:58 -0700 Subject: Accept variation selector in emoji sequences - DO NOT MERGE This patch basically ignores variation selectors for the purpose of itemization into font runs. This allows GSUB to be applied when input sequences contain variation selectors. Bug: 25368653 Change-Id: I9c1d325ae0cd322c21b7e850d0ec4d73551b2372 --- libs/minikin/FontCollection.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/libs/minikin/FontCollection.cpp b/libs/minikin/FontCollection.cpp index b4bfe31..2bcbc03 100644 --- a/libs/minikin/FontCollection.cpp +++ b/libs/minikin/FontCollection.cpp @@ -167,6 +167,10 @@ static bool isStickyWhitelisted(uint32_t c) { return false; } +static bool isVariationSelector(uint32_t c) { + return (0xFE00 <= c && c <= 0xFE0F) || (0xE0100 <= c && c <= 0xE01EF); +} + void FontCollection::itemize(const uint16_t *string, size_t string_size, FontStyle style, vector* result) const { FontLanguage lang = style.getLanguage(); @@ -184,9 +188,11 @@ void FontCollection::itemize(const uint16_t *string, size_t string_size, FontSty nShorts = 2; } } - // Continue using existing font as long as it has coverage and is whitelisted + // Continue using existing font as long as it has coverage and is whitelisted; + // also variation sequences continue existing run. if (lastFamily == NULL - || !(isStickyWhitelisted(ch) && lastFamily->getCoverage()->get(ch))) { + || !((isStickyWhitelisted(ch) && lastFamily->getCoverage()->get(ch)) + || isVariationSelector(ch))) { FontFamily* family = getFamilyForChar(ch, lang, variant); if (i == 0 || family != lastFamily) { size_t start = i; -- cgit v1.2.3 From 86542797761632890092ef89b7fb58c2c2cdfc11 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Mon, 2 Nov 2015 17:17:24 -0800 Subject: Suppress linebreaks in emoji ZWJ sequences - DO NOT MERGE Due to the way emoji ZWJ sequences are defined, the ICU line breaking algorithm determines that there are valid line breaks inside the sequence. This patch suppresses these line breaks. 
Bug: 25433289 Change-Id: I225ebebc0f4186e4b8f48fee399c4a62b3f0218a --- libs/minikin/LineBreaker.cpp | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/libs/minikin/LineBreaker.cpp b/libs/minikin/LineBreaker.cpp index a832ca2..77374fe 100644 --- a/libs/minikin/LineBreaker.cpp +++ b/libs/minikin/LineBreaker.cpp @@ -17,6 +17,7 @@ #define VERBOSE_DEBUG 0 #include +#include #define LOG_TAG "Minikin" #include @@ -30,6 +31,7 @@ namespace android { const int CHAR_TAB = 0x0009; const uint16_t CHAR_SOFT_HYPHEN = 0x00AD; +const uint16_t CHAR_ZWJ = 0x200D; // Large scores in a hierarchy; we prefer desperate breaks to an overfull line. All these // constants are larger than any reasonable actual width score. @@ -123,6 +125,32 @@ static bool isLineBreakingHyphen(uint16_t c) { c == 0x2E40); // DOUBLE HYPHEN } +/** + * Determine whether a line break at position i within the buffer buf is valid. This + * represents customization beyond the ICU behavior, because plain ICU provides some + * line break opportunities that we don't want. + **/ +static bool isBreakValid(uint16_t codeUnit, const uint16_t* buf, size_t bufEnd, size_t i) { + if (codeUnit == CHAR_SOFT_HYPHEN) { + return false; + } + if (codeUnit == CHAR_ZWJ) { + // Possible emoji ZWJ sequence + uint32_t next_codepoint; + U16_NEXT(buf, i, bufEnd, next_codepoint); + if (next_codepoint == 0x2764 || // HEAVY BLACK HEART + next_codepoint == 0x1F466 || // BOY + next_codepoint == 0x1F467 || // GIRL + next_codepoint == 0x1F468 || // MAN + next_codepoint == 0x1F469 || // WOMAN + next_codepoint == 0x1F48B || // KISS MARK + next_codepoint == 0x1F5E8) { // LEFT SPEECH BUBBLE + return false; + } + } + return true; +} + // Ordinarily, this method measures the text in the range given. However, when paint // is nullptr, it assumes the widths have already been calculated and stored in the // width buffer. @@ -175,8 +203,9 @@ float LineBreaker::addStyleRun(MinikinPaint* paint, const FontCollection* typefa } if (i + 1 == current) { // Override ICU's treatment of soft hyphen as a break opportunity, because we want it - // to be a hyphen break, with penalty and drawing behavior. - if (c != CHAR_SOFT_HYPHEN) { + // to be a hyphen break, with penalty and drawing behavior. Also, suppress line + // breaks within emoji ZWJ sequences. + if (isBreakValid(c, mTextBuf.data(), end, i + 1)) { // TODO: Add a new type of HyphenEdit for breaks whose hyphen already exists, so // we can pass the whole word down to Hyphenator like the soft hyphen case. bool wordEndsInHyphen = isLineBreakingHyphen(c); -- cgit v1.2.3