summaryrefslogtreecommitdiffstats
path: root/include/minikin
diff options
context:
space:
mode:
authorRaph Levien <raph@google.com>2015-08-27 13:50:00 -0700
committerRaph Levien <raph@google.com>2015-09-30 21:37:31 -0700
commitf0be43de02a1e07308d3d95408349c3c7f973430 (patch)
tree6570cc1b2b5c8af359a29681abf124a48b49db52 /include/minikin
parent2a79f59e73294e43f32cc0138e23fcde34eec28a (diff)
downloadandroid_frameworks_minikin-f0be43de02a1e07308d3d95408349c3c7f973430.tar.gz
android_frameworks_minikin-f0be43de02a1e07308d3d95408349c3c7f973430.tar.bz2
android_frameworks_minikin-f0be43de02a1e07308d3d95408349c3c7f973430.zip
Binary format for hyphenation patterns
In the current state, hyphenation in all languages other than Sanskrit seems to work (case-folding edge cases). Thus, we just disable Sanskrit. Packed tries are implemented, but not the finite state machine (space/speed tradeoff). This commit contains a throw-away test app, which runs on the host. I think I want to replace it with unit tests, but I'm including it in the CL because it's useful during development. Bug: 21562869 Bug: 21826930 Bug: 23317038 Bug: 23317904 Change-Id: I7479a565a4a062fa319651c2c14c0fa18c5ceaea
Diffstat (limited to 'include/minikin')
-rw-r--r--include/minikin/Hyphenator.h39
1 files changed, 30 insertions, 9 deletions
diff --git a/include/minikin/Hyphenator.h b/include/minikin/Hyphenator.h
index 581c657..9605205 100644
--- a/include/minikin/Hyphenator.h
+++ b/include/minikin/Hyphenator.h
@@ -26,11 +26,8 @@
namespace android {
-class Trie {
-public:
- std::vector<uint8_t> result;
- std::unordered_map<uint16_t, Trie> succ;
-};
+// hyb file header; implementation details are in the .cpp file
+struct Header;
class Hyphenator {
public:
@@ -44,19 +41,43 @@ public:
// Example: word is "hyphen", result is [0 0 1 0 0 0], corresponding to "hy-phen".
void hyphenate(std::vector<uint8_t>* result, const uint16_t* word, size_t len);
+ // pattern data is in binary format, as described in doc/hyb_file_format.md. Note:
+ // the caller is responsible for ensuring that the lifetime of the pattern data is
+ // at least as long as the Hyphenator object.
+
+ // Note: nullptr is valid input, in which case the hyphenator only processes soft hyphens
+ static Hyphenator* loadBinary(const uint8_t* patternData);
+
private:
- void addPattern(const uint16_t* pattern, size_t size);
+ // apply soft hyphens only, ignoring patterns
+ void hyphenateSoft(uint8_t* result, const uint16_t* word, size_t len);
- void hyphenateSoft(std::vector<uint8_t>* result, const uint16_t* word, size_t len);
+ // try looking up word in alphabet table, return false if any code units fail to map
+ // Note that this method writes len+2 entries into alpha_codes (including start and stop)
+ bool alphabetLookup(uint16_t* alpha_codes, const uint16_t* word, size_t len);
+
+ // calculate hyphenation from patterns, assuming alphabet lookup has already been done
+ void hyphenateFromCodes(uint8_t* result, const uint16_t* codes, size_t len);
// TODO: these should become parameters, as they might vary by locale, screen size, and
// possibly explicit user control.
static const int MIN_PREFIX = 2;
static const int MIN_SUFFIX = 3;
- Trie root;
+ // See also LONGEST_HYPHENATED_WORD in LineBreaker.cpp. Here the constant is used so
+ // that temporary buffers can be stack-allocated without waste, which is a slightly
+ // different use case. It measures UTF-16 code units.
+ static const size_t MAX_HYPHENATED_SIZE = 64;
+
+ const uint8_t* patternData;
+
+ // accessors for binary data
+ const Header* getHeader() const {
+ return reinterpret_cast<const Header*>(patternData);
+ }
+
};
} // namespace android
-#endif // MINIKIN_HYPHENATOR_H \ No newline at end of file
+#endif // MINIKIN_HYPHENATOR_H