Merge tag 'android-6.0.1_r3' of https://android.googlesource.com/platform/frameworks/minikin into HEAD

Android 6.0.1 release 3
author: Steve Kondik <steve@cyngn.com> 2015-12-07 16:53:23 -0800
committer: Steve Kondik <steve@cyngn.com> 2015-12-07 16:53:23 -0800
commit: 4055ec7d38073b5cf93864ba25e6f12cb8720917 (patch)
tree: 6c0cc09fad685d13b9e7a797842bc6a143626560 /libs/minikin
parent: 9b9883f253c09a6ff1bdc56648cdec524a583cf3 (diff)
parent: 86542797761632890092ef89b7fb58c2c2cdfc11 (diff)
download: android_frameworks_minikin-4055ec7d38073b5cf93864ba25e6f12cb8720917.tar.gz
android_frameworks_minikin-4055ec7d38073b5cf93864ba25e6f12cb8720917.tar.bz2
android_frameworks_minikin-4055ec7d38073b5cf93864ba25e6f12cb8720917.zip
4 files changed, 245 insertions, 109 deletions
diff --git a/libs/minikin/Android.mk b/libs/minikin/Android.mk
index 873f279..c3e70db 100644
--- a/libs/minikin/Android.mk
+++ b/libs/minikin/Android.mk
@@ -16,7 +16,7 @@ LOCAL_PATH := $(call my-dir)
 
 include $(CLEAR_VARS)
 
-LOCAL_SRC_FILES := \
+minikin_src_files := \
     AnalyzeStyle.cpp \
     CmapCoverage.cpp \
     FontCollection.cpp \
@@ -31,14 +31,12 @@ LOCAL_SRC_FILES := \
     MinikinFontFreeType.cpp \
     SparseBitSet.cpp
 
-LOCAL_MODULE := libminikin
-
-LOCAL_C_INCLUDES += \
+minikin_c_includes += \
     external/harfbuzz_ng/src \
     external/freetype/include \
     frameworks/minikin/include
 
-LOCAL_SHARED_LIBRARIES := \
+minikin_shared_libraries := \
     libharfbuzz_ng \
     libft2 \
     liblog \
@@ -47,4 +45,35 @@ LOCAL_SHARED_LIBRARIES := \
     libicuuc \
     libutils
 
+LOCAL_MODULE := libminikin
+LOCAL_EXPORT_C_INCLUDE_DIRS := frameworks/minikin/include
+LOCAL_SRC_FILES := $(minikin_src_files)
+LOCAL_C_INCLUDES := $(minikin_c_includes)
+LOCAL_SHARED_LIBRARIES := $(minikin_shared_libraries)
+
 include $(BUILD_SHARED_LIBRARY)
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libminikin
+LOCAL_MODULE_TAGS := optional
+LOCAL_EXPORT_C_INCLUDE_DIRS := frameworks/minikin/include
+LOCAL_SRC_FILES := $(minikin_src_files)
+LOCAL_C_INCLUDES := $(minikin_c_includes)
+LOCAL_SHARED_LIBRARIES := $(minikin_shared_libraries)
+
+include $(BUILD_STATIC_LIBRARY)
+
+include $(CLEAR_VARS)
+
+# Reduced library (currently just hyphenation) for host
+
+LOCAL_MODULE := libminikin_host
+LOCAL_MODULE_TAGS := optional
+LOCAL_EXPORT_C_INCLUDE_DIRS := frameworks/minikin/include
+LOCAL_C_INCLUDES := $(minikin_c_includes)
+LOCAL_SHARED_LIBRARIES := liblog libicuuc-host
+
+LOCAL_SRC_FILES := Hyphenator.cpp
+
+include $(BUILD_HOST_STATIC_LIBRARY)
diff --git a/libs/minikin/FontCollection.cpp b/libs/minikin/FontCollection.cpp
index b4bfe31..2bcbc03 100644
--- a/libs/minikin/FontCollection.cpp
+++ b/libs/minikin/FontCollection.cpp
@@ -167,6 +167,10 @@ static bool isStickyWhitelisted(uint32_t c) {
     return false;
 }
 
+static bool isVariationSelector(uint32_t c) {
+    return (0xFE00 <= c && c <= 0xFE0F) || (0xE0100 <= c && c <= 0xE01EF);
+}
+
 void FontCollection::itemize(const uint16_t *string, size_t string_size, FontStyle style,
         vector<Run>* result) const {
     FontLanguage lang = style.getLanguage();
@@ -184,9 +188,11 @@ void FontCollection::itemize(const uint16_t *string, size_t string_size, FontSty
                 nShorts = 2;
             }
         }
-        // Continue using existing font as long as it has coverage and is whitelisted
+        // Continue using existing font as long as it has coverage and is whitelisted;
+        // also variation sequences continue existing run.
         if (lastFamily == NULL
-                || !(isStickyWhitelisted(ch) && lastFamily->getCoverage()->get(ch))) {
+                || !((isStickyWhitelisted(ch) && lastFamily->getCoverage()->get(ch))
+                      || isVariationSelector(ch))) {
             FontFamily* family = getFamilyForChar(ch, lang, variant);
             if (i == 0 || family != lastFamily) {
                 size_t start = i;
diff --git a/libs/minikin/Hyphenator.cpp b/libs/minikin/Hyphenator.cpp
index 3eb151b..c5eb60b 100644
--- a/libs/minikin/Hyphenator.cpp
+++ b/libs/minikin/Hyphenator.cpp
@@ -34,130 +34,202 @@ namespace android {
 
 static const uint16_t CHAR_SOFT_HYPHEN = 0x00AD;
 
-void Hyphenator::addPattern(const uint16_t* pattern, size_t size) {
-    vector<uint16_t> word;
-    vector<uint8_t> result;
-
-    // start by parsing the Liang-format pattern into a word and a result vector, the
-    // vector right-aligned but without leading zeros. Examples:
-    // a1bc2d -> abcd [1, 0, 2, 0]
-    // abc1 -> abc [1]
-    // 1a2b3c4d5 -> abcd [1, 2, 3, 4, 5]
-    bool lastWasLetter = false;
-    bool haveSeenNumber = false;
-    for (size_t i = 0; i < size; i++) {
-        uint16_t c = pattern[i];
-        if (isdigit(c)) {
-            result.push_back(c - '0');
-            lastWasLetter = false;
-            haveSeenNumber = true;
-        } else {
-            word.push_back(c);
-            if (lastWasLetter && haveSeenNumber) {
-                result.push_back(0);
-            }
-            lastWasLetter = true;
-        }
+// The following are structs that correspond to tables inside the hyb file format
+
+struct AlphabetTable0 {
+    uint32_t version;
+    uint32_t min_codepoint;
+    uint32_t max_codepoint;
+    uint8_t data[1];  // actually flexible array, size is known at runtime
+};
+
+struct AlphabetTable1 {
+    uint32_t version;
+    uint32_t n_entries;
+    uint32_t data[1]; // actually flexible array, size is known at runtime
+
+    static uint32_t codepoint(uint32_t entry) { return entry >> 11; }
+    static uint32_t value(uint32_t entry) { return entry & 0x7ff; }
+};
+
+struct Trie {
+    uint32_t version;
+    uint32_t char_mask;
+    uint32_t link_shift;
+    uint32_t link_mask;
+    uint32_t pattern_shift;
+    uint32_t n_entries;
+    uint32_t data[1];  // actually flexible array, size is known at runtime
+};
+
+struct Pattern {
+    uint32_t version;
+    uint32_t n_entries;
+    uint32_t pattern_offset;
+    uint32_t pattern_size;
+    uint32_t data[1];  // actually flexible array, size is known at runtime
+
+    // accessors
+    static uint32_t len(uint32_t entry) { return entry >> 26; }
+    static uint32_t shift(uint32_t entry) { return (entry >> 20) & 0x3f; }
+    const uint8_t* buf(uint32_t entry) const {
+        return reinterpret_cast<const uint8_t*>(this) + pattern_offset + (entry & 0xfffff);
+    }
+};
+
+struct Header {
+    uint32_t magic;
+    uint32_t version;
+    uint32_t alphabet_offset;
+    uint32_t trie_offset;
+    uint32_t pattern_offset;
+    uint32_t file_size;
+
+    // accessors
+    const uint8_t* bytes() const { return reinterpret_cast<const uint8_t*>(this); }
+    uint32_t alphabetVersion() const {
+        return *reinterpret_cast<const uint32_t*>(bytes() + alphabet_offset);
     }
-    if (lastWasLetter) {
-        result.push_back(0);
+    const AlphabetTable0* alphabetTable0() const {
+        return reinterpret_cast<const AlphabetTable0*>(bytes() + alphabet_offset);
     }
-    Trie* t = &root;
-    for (size_t i = 0; i < word.size(); i++) {
-        t = &t->succ[word[i]];
+    const AlphabetTable1* alphabetTable1() const {
+        return reinterpret_cast<const AlphabetTable1*>(bytes() + alphabet_offset);
+    }
+    const Trie* trieTable() const {
+        return reinterpret_cast<const Trie*>(bytes() + trie_offset);
+    }
+    const Pattern* patternTable() const {
+        return reinterpret_cast<const Pattern*>(bytes() + pattern_offset);
+    }
+};
+
+Hyphenator* Hyphenator::loadBinary(const uint8_t* patternData) {
+    Hyphenator* result = new Hyphenator;
+    result->patternData = patternData;
+    return result;
+}
+
+void Hyphenator::hyphenate(vector<uint8_t>* result, const uint16_t* word, size_t len) {
+    result->clear();
+    result->resize(len);
+    const size_t paddedLen = len + 2;  // start and stop code each count for 1
+    if (patternData != nullptr &&
+            (int)len >= MIN_PREFIX + MIN_SUFFIX && paddedLen <= MAX_HYPHENATED_SIZE) {
+        uint16_t alpha_codes[MAX_HYPHENATED_SIZE];
+        if (alphabetLookup(alpha_codes, word, len)) {
+            hyphenateFromCodes(result->data(), alpha_codes, paddedLen);
+            return;
+        }
+        // TODO: try NFC normalization
+        // TODO: handle non-BMP Unicode (requires remapping of offsets)
     }
-    t->result = result;
+    hyphenateSoft(result->data(), word, len);
 }
 
 // If any soft hyphen is present in the word, use soft hyphens to decide hyphenation,
 // as recommended in UAX #14 (Use of Soft Hyphen)
-void Hyphenator::hyphenateSoft(vector<uint8_t>* result, const uint16_t* word, size_t len) {
-    (*result)[0] = 0;
+void Hyphenator::hyphenateSoft(uint8_t* result, const uint16_t* word, size_t len) {
+    result[0] = 0;
     for (size_t i = 1; i < len; i++) {
-        (*result)[i] = word[i - 1] == CHAR_SOFT_HYPHEN;
-    }
+        result[i] = word[i - 1] == CHAR_SOFT_HYPHEN;
+     }
 }
 
-void Hyphenator::hyphenate(vector<uint8_t>* result, const uint16_t* word, size_t len) {
-    result->clear();
-    result->resize(len);
-    if (len < MIN_PREFIX + MIN_SUFFIX) return;
-    size_t maxOffset = len - MIN_SUFFIX + 1;
-    for (size_t i = 0; i < len + 1; i++) {
-        const Trie* node = &root;
-        for (size_t j = i; j < len + 2; j++) {
-            uint16_t c;
-            if (j == 0 || j == len + 1) {
-                c = '.';  // word boundary character in pattern data files
-            } else {
-                c = word[j - 1];
-                if (c == CHAR_SOFT_HYPHEN) {
-                    hyphenateSoft(result, word, len);
-                    return;
-                }
-                // TODO: This uses ICU's simple character to character lowercasing, which ignores
-                // the locale, and ignores cases when lowercasing a character results in more than
-                // one character. It should be fixed to consider the locale (in order for it to work
-                // correctly for Turkish and Azerbaijani), as well as support one-to-many, and
-                // many-to-many case conversions (including non-BMP cases).
-                if (c < 0x00C0) { // U+00C0 is the lowest uppercase non-ASCII character
-                    // Convert uppercase ASCII to lowercase ASCII, but keep other characters as-is
-                    if (0x0041 <= c && c <= 0x005A) {
-                        c += 0x0020;
-                    }
-                } else {
-                    c = u_tolower(c);
-                }
+bool Hyphenator::alphabetLookup(uint16_t* alpha_codes, const uint16_t* word, size_t len) {
+    const Header* header = getHeader();
+    // TODO: check header magic
+    uint32_t alphabetVersion = header->alphabetVersion();
+    if (alphabetVersion == 0) {
+        const AlphabetTable0* alphabet = header->alphabetTable0();
+        uint32_t min_codepoint = alphabet->min_codepoint;
+        uint32_t max_codepoint = alphabet->max_codepoint;
+        alpha_codes[0] = 0;  // word start
+        for (size_t i = 0; i < len; i++) {
+            uint16_t c = word[i];
+            if (c < min_codepoint || c >= max_codepoint) {
+                return false;
+            }
+            uint8_t code = alphabet->data[c - min_codepoint];
+            if (code == 0) {
+                return false;
+            }
+            alpha_codes[i + 1] = code;
+        }
+        alpha_codes[len + 1] = 0;  // word termination
+        return true;
+    } else if (alphabetVersion == 1) {
+        const AlphabetTable1* alphabet = header->alphabetTable1();
+        size_t n_entries = alphabet->n_entries;
+        const uint32_t* begin = alphabet->data;
+        const uint32_t* end = begin + n_entries;
+        alpha_codes[0] = 0;
+        for (size_t i = 0; i < len; i++) {
+            uint16_t c = word[i];
+            auto p = std::lower_bound(begin, end, c << 11);
+            if (p == end) {
+                return false;
             }
-            auto search = node->succ.find(c);
-            if (search != node->succ.end()) {
-                node = &search->second;
+            uint32_t entry = *p;
+            if (AlphabetTable1::codepoint(entry) != c) {
+                return false;
+            }
+            alpha_codes[i + 1] = AlphabetTable1::value(entry);
+        }
+        alpha_codes[len + 1] = 0;
+        return true;
+    }
+    return false;
+}
+
+/**
+ * Internal implementation, after conversion to codes. All case folding and normalization
+ * has been done by now, and all characters have been found in the alphabet.
+ * Note: len here is the padded length including 0 codes at start and end.
+ **/
+void Hyphenator::hyphenateFromCodes(uint8_t* result, const uint16_t* codes, size_t len) {
+    const Header* header = getHeader();
+    const Trie* trie = header->trieTable();
+    const Pattern* pattern = header->patternTable();
+    uint32_t char_mask = trie->char_mask;
+    uint32_t link_shift = trie->link_shift;
+    uint32_t link_mask = trie->link_mask;
+    uint32_t pattern_shift = trie->pattern_shift;
+    size_t maxOffset = len - MIN_SUFFIX - 1;
+    for (size_t i = 0; i < len - 1; i++) {
+        uint32_t node = 0;  // index into Trie table
+        for (size_t j = i; j < len; j++) {
+            uint16_t c = codes[j];
+            uint32_t entry = trie->data[node + c];
+            if ((entry & char_mask) == c) {
+                node = (entry & link_mask) >> link_shift;
             } else {
                 break;
             }
-            if (!node->result.empty()) {
-                int resultLen = node->result.size();
-                int offset = j + 1 - resultLen;
+            uint32_t pat_ix = trie->data[node] >> pattern_shift;
+            // pat_ix contains a 3-tuple of length, shift (number of trailing zeros), and an offset
+            // into the buf pool. This is the pattern for the substring (i..j) we just matched,
+            // which we combine (via point-wise max) into the result vector.
+            if (pat_ix != 0) {
+                uint32_t pat_entry = pattern->data[pat_ix];
+                int pat_len = Pattern::len(pat_entry);
+                int pat_shift = Pattern::shift(pat_entry);
+                const uint8_t* pat_buf = pattern->buf(pat_entry);
+                int offset = j + 1 - (pat_len + pat_shift);
+                // offset is the index within result that lines up with the start of pat_buf
                 int start = std::max(MIN_PREFIX - offset, 0);
-                int end = std::min(resultLen, (int)maxOffset - offset);
-                // TODO performance: this inner loop can profitably be optimized
+                int end = std::min(pat_len, (int)maxOffset - offset);
                 for (int k = start; k < end; k++) {
-                    (*result)[offset + k] = std::max((*result)[offset + k], node->result[k]);
-                }
-#if 0
-                // debug printing of matched patterns
-                std::string dbg;
-                for (size_t k = i; k <= j + 1; k++) {
-                    int off = k - j - 2 + resultLen;
-                    if (off >= 0 && node->result[off] != 0) {
-                        dbg.push_back((char)('0' + node->result[off]));
-                    }
-                    if (k < j + 1) {
-                        uint16_t c = (k == 0 || k == len + 1) ? '.' : word[k - 1];
-                        dbg.push_back((char)c);
-                    }
+                    result[offset + k] = std::max(result[offset + k], pat_buf[k]);
                 }
-                ALOGD("%d:%d %s", i, j, dbg.c_str());
-#endif
             }
         }
     }
     // Since the above calculation does not modify values outside
     // [MIN_PREFIX, len - MIN_SUFFIX], they are left as 0.
     for (size_t i = MIN_PREFIX; i < maxOffset; i++) {
-        (*result)[i] &= 1;
+        result[i] &= 1;
     }
 }
 
-Hyphenator* Hyphenator::load(const uint16_t *patternData, size_t size) {
-    Hyphenator* result = new Hyphenator;
-    for (size_t i = 0; i < size; i++) {
-        size_t end = i;
-        while (patternData[end] != '\n') end++;
-        result->addPattern(patternData + i, end - i);
-        i = end;
-    }
-    return result;
-}
-
 }  // namespace android
diff --git a/libs/minikin/LineBreaker.cpp b/libs/minikin/LineBreaker.cpp
index a832ca2..77374fe 100644
--- a/libs/minikin/LineBreaker.cpp
+++ b/libs/minikin/LineBreaker.cpp
@@ -17,6 +17,7 @@
 #define VERBOSE_DEBUG 0
 
 #include <limits>
+#include <unicode/utf16.h>
 
 #define LOG_TAG "Minikin"
 #include <cutils/log.h>
@@ -30,6 +31,7 @@ namespace android {
 
 const int CHAR_TAB = 0x0009;
 const uint16_t CHAR_SOFT_HYPHEN = 0x00AD;
+const uint16_t CHAR_ZWJ = 0x200D;
 
 // Large scores in a hierarchy; we prefer desperate breaks to an overfull line. All these
 // constants are larger than any reasonable actual width score.
@@ -123,6 +125,32 @@ static bool isLineBreakingHyphen(uint16_t c) {
             c == 0x2E40);  // DOUBLE HYPHEN
 }
 
+/**
+ * Determine whether a line break at position i within the buffer buf is valid. This
+ * represents customization beyond the ICU behavior, because plain ICU provides some
+ * line break opportunities that we don't want.
+ **/
+static bool isBreakValid(uint16_t codeUnit, const uint16_t* buf, size_t bufEnd, size_t i) {
+    if (codeUnit == CHAR_SOFT_HYPHEN) {
+        return false;
+    }
+    if (codeUnit == CHAR_ZWJ) {
+        // Possible emoji ZWJ sequence
+        uint32_t next_codepoint;
+        U16_NEXT(buf, i, bufEnd, next_codepoint);
+        if (next_codepoint == 0x2764 ||       // HEAVY BLACK HEART
+                next_codepoint == 0x1F466 ||  // BOY
+                next_codepoint == 0x1F467 ||  // GIRL
+                next_codepoint == 0x1F468 ||  // MAN
+                next_codepoint == 0x1F469 ||  // WOMAN
+                next_codepoint == 0x1F48B ||  // KISS MARK
+                next_codepoint == 0x1F5E8) {  // LEFT SPEECH BUBBLE
+            return false;
+        }
+    }
+    return true;
+}
+
 // Ordinarily, this method measures the text in the range given. However, when paint
 // is nullptr, it assumes the widths have already been calculated and stored in the
 // width buffer.
@@ -175,8 +203,9 @@ float LineBreaker::addStyleRun(MinikinPaint* paint, const FontCollection* typefa
         }
         if (i + 1 == current) {
             // Override ICU's treatment of soft hyphen as a break opportunity, because we want it
-            // to be a hyphen break, with penalty and drawing behavior.
-            if (c != CHAR_SOFT_HYPHEN) {
+            // to be a hyphen break, with penalty and drawing behavior. Also, suppress line
+            // breaks within emoji ZWJ sequences.
+            if (isBreakValid(c, mTextBuf.data(), end, i + 1)) {
                 // TODO: Add a new type of HyphenEdit for breaks whose hyphen already exists, so
                 // we can pass the whole word down to Hyphenator like the soft hyphen case.
                 bool wordEndsInHyphen = isLineBreakingHyphen(c);
author	Steve Kondik <steve@cyngn.com>	2015-12-07 16:53:23 -0800
committer	Steve Kondik <steve@cyngn.com>	2015-12-07 16:53:23 -0800
commit	4055ec7d38073b5cf93864ba25e6f12cb8720917 (patch)
tree	6c0cc09fad685d13b9e7a797842bc6a143626560 /libs/minikin
parent	9b9883f253c09a6ff1bdc56648cdec524a583cf3 (diff)
parent	86542797761632890092ef89b7fb58c2c2cdfc11 (diff)
download	android_frameworks_minikin-4055ec7d38073b5cf93864ba25e6f12cb8720917.tar.gz android_frameworks_minikin-4055ec7d38073b5cf93864ba25e6f12cb8720917.tar.bz2 android_frameworks_minikin-4055ec7d38073b5cf93864ba25e6f12cb8720917.zip