summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRaph Levien <raph@google.com>2016-02-18 15:00:24 -0800
committerRaph Levien <raph@google.com>2016-02-18 15:00:24 -0800
commitd3f45892c721fb1738bf02fe19a5143a320ca4bf (patch)
treed95d2512d2fe1f5b126fb31cb413cb583665052a
parentaa736d00548da691e75a09a70deb886e1f68b060 (diff)
downloadandroid_frameworks_minikin-d3f45892c721fb1738bf02fe19a5143a320ca4bf.tar.gz
android_frameworks_minikin-d3f45892c721fb1738bf02fe19a5143a320ca4bf.tar.bz2
android_frameworks_minikin-d3f45892c721fb1738bf02fe19a5143a320ca4bf.zip
Suppress linebreaks in emoji ZWJ sequences
Due to the way emoji ZWJ sequences are defined, the ICU line breaking algorithm determines that there are valid line breaks inside the sequence. This patch suppresses these line breaks.

This is an adaptation of I225ebebc0f4186e4b8f48fee399c4a62b3f0218a into the nyc-dev branch.

Bug: 25433289
Change-Id: I84b50b1e6ef13d436965eab389659d02a30d100f
-rw-r--r--libs/minikin/WordBreaker.cpp29
-rw-r--r--tests/WordBreakerTests.cpp24
2 files changed, 52 insertions, 1 deletions
diff --git a/libs/minikin/WordBreaker.cpp b/libs/minikin/WordBreaker.cpp
index ca69a50..ec84c39 100644
--- a/libs/minikin/WordBreaker.cpp
+++ b/libs/minikin/WordBreaker.cpp
@@ -25,6 +25,7 @@
namespace android {
const uint32_t CHAR_SOFT_HYPHEN = 0x00AD;
+const uint16_t CHAR_ZWJ = 0x200D;
void WordBreaker::setLocale(const icu::Locale& locale) {
UErrorCode status = U_ZERO_ERROR;
@@ -62,6 +63,32 @@ enum ScanState {
SAW_COLON_SLASH_SLASH,
};
+/**
+ * Determine whether a line break at position i within the buffer buf (bufEnd code units
+ * long) is valid, where codeUnit is the UTF-16 code unit immediately before i. This is
+ * customization beyond plain ICU, which provides some break opportunities we don't want.
+ **/
+static bool isBreakValid(uint16_t codeUnit, const uint16_t* buf, size_t bufEnd, size_t i) {
+ if (codeUnit == CHAR_SOFT_HYPHEN) {
+ return false;
+ }
+ if (codeUnit == CHAR_ZWJ) {
+ // Possible emoji ZWJ sequence: reject the break when the code point after the ZWJ is listed below
+ uint32_t next_codepoint;
+ U16_NEXT(buf, i, bufEnd, next_codepoint); // decodes the code point at i; i is a by-value copy
+ if (next_codepoint == 0x2764 || // HEAVY BLACK HEART
+ next_codepoint == 0x1F466 || // BOY
+ next_codepoint == 0x1F467 || // GIRL
+ next_codepoint == 0x1F468 || // MAN
+ next_codepoint == 0x1F469 || // WOMAN
+ next_codepoint == 0x1F48B || // KISS MARK
+ next_codepoint == 0x1F5E8) { // LEFT SPEECH BUBBLE
+ return false;
+ }
+ }
+ return true;
+}
+
// Chicago Manual of Style recommends breaking after these characters in URLs and email addresses
static bool breakAfter(uint16_t c) {
return c == ':' || c == '=' || c == '&';
@@ -149,7 +176,7 @@ ssize_t WordBreaker::next() {
result = mBreakIterator->next();
}
} while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize
- && mText[result - 1] == CHAR_SOFT_HYPHEN);
+ && !isBreakValid(mText[result - 1], mText, mTextSize, result));
mCurrent = (ssize_t)result;
return mCurrent;
}
diff --git a/tests/WordBreakerTests.cpp b/tests/WordBreakerTests.cpp
index 9662b2f..6c5e479 100644
--- a/tests/WordBreakerTests.cpp
+++ b/tests/WordBreakerTests.cpp
@@ -67,6 +67,30 @@ TEST_F(WordBreakerTest, softHyphen) {
EXPECT_EQ(0, breaker.breakBadness());
}
+TEST_F(WordBreakerTest, zwjEmojiSequences) {
+ uint16_t buf[] = {
+ // man + zwj + heart + zwj + man
+ 0xD83D, 0xDC68, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC68,
+ // woman + zwj + heart + zwj + kiss mark + zwj + woman
+ 0xD83D, 0xDC69, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC8B, 0x200D, 0xD83D, 0xDC69,
+ // eye + zwj + left speech bubble
+ 0xD83D, 0xDC41, 0x200D, 0xD83D, 0xDDE8,
+ };
+ WordBreaker breaker;
+ breaker.setLocale(icu::Locale::getEnglish());
+ breaker.setText(buf, NELEM(buf));
+ EXPECT_EQ(0, breaker.current());
+ EXPECT_EQ(7, breaker.next()); // after man + zwj + heart + zwj + man
+ EXPECT_EQ(0, breaker.wordStart());
+ EXPECT_EQ(7, breaker.wordEnd());
+ EXPECT_EQ(17, breaker.next()); // after woman + zwj + heart + zwj + kiss mark + zwj + woman
+ EXPECT_EQ(7, breaker.wordStart());
+ EXPECT_EQ(17, breaker.wordEnd());
+ EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end (after eye + zwj + left speech bubble)
+ EXPECT_EQ(17, breaker.wordStart());
+ EXPECT_EQ(22, breaker.wordEnd());
+}
+
TEST_F(WordBreakerTest, punct) {
uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd',
'!', '!'};