diff options
author | Raph Levien <raph@google.com> | 2016-02-19 17:53:48 +0000 |
---|---|---|
committer | Android (Google) Code Review <android-gerrit@google.com> | 2016-02-19 17:53:48 +0000 |
commit | 30bf8a7c05925a970f1cab87c38bec8dd97fa82e (patch) | |
tree | a2c066ece8293a937ca1aec22f8362cb3c7ed8e6 | |
parent | ea408fc18e8e78d984ebdf63703da668a15720de (diff) | |
parent | d3f45892c721fb1738bf02fe19a5143a320ca4bf (diff) | |
download | android_frameworks_minikin-30bf8a7c05925a970f1cab87c38bec8dd97fa82e.tar.gz android_frameworks_minikin-30bf8a7c05925a970f1cab87c38bec8dd97fa82e.tar.bz2 android_frameworks_minikin-30bf8a7c05925a970f1cab87c38bec8dd97fa82e.zip |
Merge "Suppress linebreaks in emoji ZWJ sequences" into nyc-dev
-rw-r--r-- | libs/minikin/WordBreaker.cpp | 29 | ||||
-rw-r--r-- | tests/WordBreakerTests.cpp | 24 |
2 files changed, 52 insertions, 1 deletions
diff --git a/libs/minikin/WordBreaker.cpp b/libs/minikin/WordBreaker.cpp index ca69a50..ec84c39 100644 --- a/libs/minikin/WordBreaker.cpp +++ b/libs/minikin/WordBreaker.cpp @@ -25,6 +25,7 @@ namespace android { const uint32_t CHAR_SOFT_HYPHEN = 0x00AD; +const uint16_t CHAR_ZWJ = 0x200D; void WordBreaker::setLocale(const icu::Locale& locale) { UErrorCode status = U_ZERO_ERROR; @@ -62,6 +63,32 @@ enum ScanState { SAW_COLON_SLASH_SLASH, }; +/** + * Determine whether a line break at position i within the buffer buf is valid. This + * represents customization beyond the ICU behavior, because plain ICU provides some + * line break opportunities that we don't want. + **/ +static bool isBreakValid(uint16_t codeUnit, const uint16_t* buf, size_t bufEnd, size_t i) { + if (codeUnit == CHAR_SOFT_HYPHEN) { + return false; + } + if (codeUnit == CHAR_ZWJ) { + // Possible emoji ZWJ sequence + uint32_t next_codepoint; + U16_NEXT(buf, i, bufEnd, next_codepoint); + if (next_codepoint == 0x2764 || // HEAVY BLACK HEART + next_codepoint == 0x1F466 || // BOY + next_codepoint == 0x1F467 || // GIRL + next_codepoint == 0x1F468 || // MAN + next_codepoint == 0x1F469 || // WOMAN + next_codepoint == 0x1F48B || // KISS MARK + next_codepoint == 0x1F5E8) { // LEFT SPEECH BUBBLE + return false; + } + } + return true; +} + // Chicago Manual of Style recommends breaking after these characters in URLs and email addresses static bool breakAfter(uint16_t c) { return c == ':' || c == '=' || c == '&'; @@ -149,7 +176,7 @@ ssize_t WordBreaker::next() { result = mBreakIterator->next(); } } while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize - && mText[result - 1] == CHAR_SOFT_HYPHEN); + && !isBreakValid(mText[result - 1], mText, mTextSize, result)); mCurrent = (ssize_t)result; return mCurrent; } diff --git a/tests/WordBreakerTests.cpp b/tests/WordBreakerTests.cpp index 9662b2f..6c5e479 100644 --- a/tests/WordBreakerTests.cpp +++ b/tests/WordBreakerTests.cpp @@ -67,6 +67,30 @@ TEST_F(WordBreakerTest, softHyphen) { EXPECT_EQ(0, breaker.breakBadness()); } +TEST_F(WordBreakerTest, zwjEmojiSequences) { + uint16_t buf[] = { + // man + zwj + heart + zwj + man + 0xD83D, 0xDC68, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC68, + // woman + zwj + heart + zwj + woman + 0xD83D, 0xDC69, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC8B, 0x200D, 0xD83D, 0xDC69, + // eye + zwj + left speech bubble + 0xD83D, 0xDC41, 0x200D, 0xD83D, 0xDDE8, + }; + WordBreaker breaker; + breaker.setLocale(icu::Locale::getEnglish()); + breaker.setText(buf, NELEM(buf)); + EXPECT_EQ(0, breaker.current()); + EXPECT_EQ(7, breaker.next()); // after man + zwj + heart + zwj + man + EXPECT_EQ(0, breaker.wordStart()); + EXPECT_EQ(7, breaker.wordEnd()); + EXPECT_EQ(17, breaker.next()); // after woman + zwj + heart + zwj + woman + EXPECT_EQ(7, breaker.wordStart()); + EXPECT_EQ(17, breaker.wordEnd()); + EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end + EXPECT_EQ(17, breaker.wordStart()); + EXPECT_EQ(22, breaker.wordEnd()); +} + TEST_F(WordBreakerTest, punct) { uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd', '!', '!'}; |