Suppress linebreaks in emoji ZWJ sequences

Due to the way emoji ZWJ sequences are defined, the ICU line breaking algorithm determines that there are valid line breaks inside the sequence. This patch suppresses these line breaks. This is an adaptation of I225ebebc0f4186e4b8f48fee399c4a62b3f0218a into the nyc-dev branch. Bug: 25433289 Change-Id: I84b50b1e6ef13d436965eab389659d02a30d100f
author: Raph Levien <raph@google.com> 2016-02-18 15:00:24 -0800
committer: Raph Levien <raph@google.com> 2016-02-18 15:00:24 -0800
commit: d3f45892c721fb1738bf02fe19a5143a320ca4bf (patch)
tree: d95d2512d2fe1f5b126fb31cb413cb583665052a
parent: aa736d00548da691e75a09a70deb886e1f68b060 (diff)
download: android_frameworks_minikin-d3f45892c721fb1738bf02fe19a5143a320ca4bf.tar.gz
android_frameworks_minikin-d3f45892c721fb1738bf02fe19a5143a320ca4bf.tar.bz2
android_frameworks_minikin-d3f45892c721fb1738bf02fe19a5143a320ca4bf.zip
2 files changed, 52 insertions, 1 deletions
diff --git a/libs/minikin/WordBreaker.cpp b/libs/minikin/WordBreaker.cpp
index ca69a50..ec84c39 100644
--- a/libs/minikin/WordBreaker.cpp
+++ b/libs/minikin/WordBreaker.cpp
@@ -25,6 +25,7 @@
 namespace android {
 
 const uint32_t CHAR_SOFT_HYPHEN = 0x00AD;
+const uint16_t CHAR_ZWJ = 0x200D;
 
 void WordBreaker::setLocale(const icu::Locale& locale) {
     UErrorCode status = U_ZERO_ERROR;
@@ -62,6 +63,32 @@ enum ScanState {
     SAW_COLON_SLASH_SLASH,
 };
 
+/**
+ * Decide whether the ICU-proposed line break at offset i of buf should be kept. codeUnit is
+ * the code unit just before i; bufEnd limits the lookahead. This customizes plain ICU, which
+ * offers breaks we don't want: after a soft hyphen, or after a ZWJ inside an emoji sequence.
+ **/
+static bool isBreakValid(uint16_t codeUnit, const uint16_t* buf, size_t bufEnd, size_t i) {
+    if (codeUnit == CHAR_SOFT_HYPHEN) {
+        return false;  // reject a break directly after a soft hyphen
+    }
+    if (codeUnit == CHAR_ZWJ) {
+        // Possible emoji ZWJ sequence: decode the full code point that follows the ZWJ
+        uint32_t next_codepoint;
+        U16_NEXT(buf, i, bufEnd, next_codepoint);  // advances only the local copy of i
+        if (next_codepoint == 0x2764 ||       // HEAVY BLACK HEART
+                next_codepoint == 0x1F466 ||  // BOY
+                next_codepoint == 0x1F467 ||  // GIRL
+                next_codepoint == 0x1F468 ||  // MAN
+                next_codepoint == 0x1F469 ||  // WOMAN
+                next_codepoint == 0x1F48B ||  // KISS MARK
+                next_codepoint == 0x1F5E8) {  // LEFT SPEECH BUBBLE
+            return false;  // ZWJ joins to a listed emoji: keep the sequence unbroken
+        }
+    }
+    return true;  // any other position: accept the break ICU reported
+}
+
 // Chicago Manual of Style recommends breaking after these characters in URLs and email addresses
 static bool breakAfter(uint16_t c) {
     return c == ':' || c == '=' || c == '&';
@@ -149,7 +176,7 @@ ssize_t WordBreaker::next() {
             result = mBreakIterator->next();
         }
     } while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize
-             && mText[result - 1] == CHAR_SOFT_HYPHEN);
+            && !isBreakValid(mText[result - 1], mText, mTextSize, result));
     mCurrent = (ssize_t)result;
     return mCurrent;
 }
diff --git a/tests/WordBreakerTests.cpp b/tests/WordBreakerTests.cpp
index 9662b2f..6c5e479 100644
--- a/tests/WordBreakerTests.cpp
+++ b/tests/WordBreakerTests.cpp
@@ -67,6 +67,30 @@ TEST_F(WordBreakerTest, softHyphen) {
     EXPECT_EQ(0, breaker.breakBadness());
 }
 
+TEST_F(WordBreakerTest, zwjEmojiSequences) {
+    uint16_t buf[] = {
+        // man + zwj + heart + zwj + man
+        0xD83D, 0xDC68, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC68,
+        // woman + zwj + heart + zwj + kiss mark + zwj + woman
+        0xD83D, 0xDC69, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC8B, 0x200D, 0xD83D, 0xDC69,
+        // eye + zwj + left speech bubble
+        0xD83D, 0xDC41, 0x200D, 0xD83D, 0xDDE8,
+    };
+    WordBreaker breaker;
+    breaker.setLocale(icu::Locale::getEnglish());
+    breaker.setText(buf, NELEM(buf));
+    EXPECT_EQ(0, breaker.current());
+    EXPECT_EQ(7, breaker.next());  // after man + zwj + heart + zwj + man
+    EXPECT_EQ(0, breaker.wordStart());
+    EXPECT_EQ(7, breaker.wordEnd());
+    EXPECT_EQ(17, breaker.next());  // after woman + zwj + heart + zwj + kiss mark + zwj + woman
+    EXPECT_EQ(7, breaker.wordStart());
+    EXPECT_EQ(17, breaker.wordEnd());
+    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end, after eye + zwj + left speech bubble
+    EXPECT_EQ(17, breaker.wordStart());
+    EXPECT_EQ(22, breaker.wordEnd());
+}
+
 TEST_F(WordBreakerTest, punct) {
     uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd',
         '!', '!'};
author	Raph Levien <raph@google.com>	2016-02-18 15:00:24 -0800
committer	Raph Levien <raph@google.com>	2016-02-18 15:00:24 -0800
commit	d3f45892c721fb1738bf02fe19a5143a320ca4bf (patch)
tree	d95d2512d2fe1f5b126fb31cb413cb583665052a
parent	aa736d00548da691e75a09a70deb886e1f68b060 (diff)
download	android_frameworks_minikin-d3f45892c721fb1738bf02fe19a5143a320ca4bf.tar.gz android_frameworks_minikin-d3f45892c721fb1738bf02fe19a5143a320ca4bf.tar.bz2 android_frameworks_minikin-d3f45892c721fb1738bf02fe19a5143a320ca4bf.zip