diff options
author | Raph Levien <raph@google.com> | 2016-02-18 15:00:24 -0800 |
---|---|---|
committer | Raph Levien <raph@google.com> | 2016-02-18 15:00:24 -0800 |
commit | d3f45892c721fb1738bf02fe19a5143a320ca4bf (patch) | |
tree | d95d2512d2fe1f5b126fb31cb413cb583665052a | |
parent | aa736d00548da691e75a09a70deb886e1f68b060 (diff) | |
download | android_frameworks_minikin-d3f45892c721fb1738bf02fe19a5143a320ca4bf.tar.gz android_frameworks_minikin-d3f45892c721fb1738bf02fe19a5143a320ca4bf.tar.bz2 android_frameworks_minikin-d3f45892c721fb1738bf02fe19a5143a320ca4bf.zip |
Suppress linebreaks in emoji ZWJ sequences
Due to the way emoji ZWJ sequences are defined, the ICU line breaking
algorithm determines that there are valid line breaks inside the
sequence. This patch suppresses these line breaks.
This is an adaptation of I225ebebc0f4186e4b8f48fee399c4a62b3f0218a
into the nyc-dev branch.
Bug: 25433289
Change-Id: I84b50b1e6ef13d436965eab389659d02a30d100f
-rw-r--r-- | libs/minikin/WordBreaker.cpp | 29 | ||||
-rw-r--r-- | tests/WordBreakerTests.cpp | 24 |
2 files changed, 52 insertions, 1 deletions
diff --git a/libs/minikin/WordBreaker.cpp b/libs/minikin/WordBreaker.cpp index ca69a50..ec84c39 100644 --- a/libs/minikin/WordBreaker.cpp +++ b/libs/minikin/WordBreaker.cpp @@ -25,6 +25,7 @@ namespace android { const uint32_t CHAR_SOFT_HYPHEN = 0x00AD; +const uint16_t CHAR_ZWJ = 0x200D; void WordBreaker::setLocale(const icu::Locale& locale) { UErrorCode status = U_ZERO_ERROR; @@ -62,6 +63,32 @@ enum ScanState { SAW_COLON_SLASH_SLASH, }; +/** + * Determine whether a line break at position i in buf (length bufEnd) is valid. The caller + * passes the code unit just before i as codeUnit. This customizes ICU, which provides some line + * break opportunities we don't want: after a soft hyphen, or after a ZWJ joining a listed emoji. + **/ +static bool isBreakValid(uint16_t codeUnit, const uint16_t* buf, size_t bufEnd, size_t i) { + if (codeUnit == CHAR_SOFT_HYPHEN) { + return false; + } + if (codeUnit == CHAR_ZWJ) { + // Possible emoji ZWJ sequence + uint32_t next_codepoint; + U16_NEXT(buf, i, bufEnd, next_codepoint); + if (next_codepoint == 0x2764 || // HEAVY BLACK HEART + next_codepoint == 0x1F466 || // BOY + next_codepoint == 0x1F467 || // GIRL + next_codepoint == 0x1F468 || // MAN + next_codepoint == 0x1F469 || // WOMAN + next_codepoint == 0x1F48B || // KISS MARK + next_codepoint == 0x1F5E8) { // LEFT SPEECH BUBBLE + return false; + } + } + return true; +} + // Chicago Manual of Style recommends breaking after these characters in URLs and email addresses static bool breakAfter(uint16_t c) { return c == ':' || c == '=' || c == '&'; @@ -149,7 +176,7 @@ ssize_t WordBreaker::next() { result = mBreakIterator->next(); } } while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize - && mText[result - 1] == CHAR_SOFT_HYPHEN); + && !isBreakValid(mText[result - 1], mText, mTextSize, result)); mCurrent = (ssize_t)result; return mCurrent; } diff --git a/tests/WordBreakerTests.cpp b/tests/WordBreakerTests.cpp index 9662b2f..6c5e479 100644 --- a/tests/WordBreakerTests.cpp +++ b/tests/WordBreakerTests.cpp @@ -67,6 +67,30 @@ 
TEST_F(WordBreakerTest, softHyphen) { EXPECT_EQ(0, breaker.breakBadness()); } +TEST_F(WordBreakerTest, zwjEmojiSequences) { + uint16_t buf[] = { + // man + zwj + heart + zwj + man + 0xD83D, 0xDC68, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC68, + // woman + zwj + heart + zwj + kiss mark + zwj + woman + 0xD83D, 0xDC69, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC8B, 0x200D, 0xD83D, 0xDC69, + // eye + zwj + left speech bubble + 0xD83D, 0xDC41, 0x200D, 0xD83D, 0xDDE8, + }; + WordBreaker breaker; + breaker.setLocale(icu::Locale::getEnglish()); + breaker.setText(buf, NELEM(buf)); + EXPECT_EQ(0, breaker.current()); + EXPECT_EQ(7, breaker.next()); // after man + zwj + heart + zwj + man + EXPECT_EQ(0, breaker.wordStart()); + EXPECT_EQ(7, breaker.wordEnd()); + EXPECT_EQ(17, breaker.next()); // after woman + zwj + heart + zwj + kiss mark + zwj + woman + EXPECT_EQ(7, breaker.wordStart()); + EXPECT_EQ(17, breaker.wordEnd()); + EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end + EXPECT_EQ(17, breaker.wordStart()); + EXPECT_EQ(22, breaker.wordEnd()); +} + TEST_F(WordBreakerTest, punct) { uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd', '!', '!'}; |