diff options
author | Raph Levien <raph@google.com> | 2015-08-27 13:50:00 -0700 |
---|---|---|
committer | Raph Levien <raph@google.com> | 2015-09-30 21:37:31 -0700 |
commit | f0be43de02a1e07308d3d95408349c3c7f973430 (patch) | |
tree | 6570cc1b2b5c8af359a29681abf124a48b49db52 /include | |
parent | 2a79f59e73294e43f32cc0138e23fcde34eec28a (diff) | |
download | android_frameworks_minikin-f0be43de02a1e07308d3d95408349c3c7f973430.tar.gz android_frameworks_minikin-f0be43de02a1e07308d3d95408349c3c7f973430.tar.bz2 android_frameworks_minikin-f0be43de02a1e07308d3d95408349c3c7f973430.zip |
Binary format for hyphenation patterns
In the current state, hyphenation in all languages other than Sanskrit seems
to work (case-folding edge cases). Thus, we just disable Sanskrit.
Packed tries are implemented, but not the finite state machine
(space/speed tradeoff).
This commit contains a throw-away test app, which runs on the host.
I think I want to replace it with unit tests, but I'm including it in
the CL because it's useful during development.
Bug: 21562869
Bug: 21826930
Bug: 23317038
Bug: 23317904
Change-Id: I7479a565a4a062fa319651c2c14c0fa18c5ceaea
Diffstat (limited to 'include')
-rw-r--r-- | include/minikin/Hyphenator.h | 39 |
1 files changed, 30 insertions, 9 deletions
diff --git a/include/minikin/Hyphenator.h b/include/minikin/Hyphenator.h index 581c657..9605205 100644 --- a/include/minikin/Hyphenator.h +++ b/include/minikin/Hyphenator.h @@ -26,11 +26,8 @@ namespace android { -class Trie { -public: - std::vector<uint8_t> result; - std::unordered_map<uint16_t, Trie> succ; -}; +// hyb file header; implementation details are in the .cpp file +struct Header; class Hyphenator { public: @@ -44,19 +41,43 @@ public: // Example: word is "hyphen", result is [0 0 1 0 0 0], corresponding to "hy-phen". void hyphenate(std::vector<uint8_t>* result, const uint16_t* word, size_t len); + // pattern data is in binary format, as described in doc/hyb_file_format.md. Note: + // the caller is responsible for ensuring that the lifetime of the pattern data is + // at least as long as the Hyphenator object. + + // Note: nullptr is valid input, in which case the hyphenator only processes soft hyphens + static Hyphenator* loadBinary(const uint8_t* patternData); + private: - void addPattern(const uint16_t* pattern, size_t size); + // apply soft hyphens only, ignoring patterns + void hyphenateSoft(uint8_t* result, const uint16_t* word, size_t len); - void hyphenateSoft(std::vector<uint8_t>* result, const uint16_t* word, size_t len); + // try looking up word in alphabet table, return false if any code units fail to map + // Note that this method writes len+2 entries into alpha_codes (including start and stop) + bool alphabetLookup(uint16_t* alpha_codes, const uint16_t* word, size_t len); + + // calculate hyphenation from patterns, assuming alphabet lookup has already been done + void hyphenateFromCodes(uint8_t* result, const uint16_t* codes, size_t len); // TODO: these should become parameters, as they might vary by locale, screen size, and // possibly explicit user control. static const int MIN_PREFIX = 2; static const int MIN_SUFFIX = 3; - Trie root; + // See also LONGEST_HYPHENATED_WORD in LineBreaker.cpp. 
Here the constant is used so + // that temporary buffers can be stack-allocated without waste, which is a slightly + // different use case. It measures UTF-16 code units. + static const size_t MAX_HYPHENATED_SIZE = 64; + + const uint8_t* patternData; + + // accessors for binary data + const Header* getHeader() const { + return reinterpret_cast<const Header*>(patternData); + } + }; } // namespace android -#endif // MINIKIN_HYPHENATOR_H
\ No newline at end of file +#endif // MINIKIN_HYPHENATOR_H |