summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRaph Levien <raph@google.com>2016-02-19 17:53:48 +0000
committerAndroid (Google) Code Review <android-gerrit@google.com>2016-02-19 17:53:48 +0000
commit30bf8a7c05925a970f1cab87c38bec8dd97fa82e (patch)
treea2c066ece8293a937ca1aec22f8362cb3c7ed8e6
parentea408fc18e8e78d984ebdf63703da668a15720de (diff)
parentd3f45892c721fb1738bf02fe19a5143a320ca4bf (diff)
downloadandroid_frameworks_minikin-30bf8a7c05925a970f1cab87c38bec8dd97fa82e.tar.gz
android_frameworks_minikin-30bf8a7c05925a970f1cab87c38bec8dd97fa82e.tar.bz2
android_frameworks_minikin-30bf8a7c05925a970f1cab87c38bec8dd97fa82e.zip
Merge "Suppress linebreaks in emoji ZWJ sequences" into nyc-dev
-rw-r--r--libs/minikin/WordBreaker.cpp29
-rw-r--r--tests/WordBreakerTests.cpp24
2 files changed, 52 insertions, 1 deletions
diff --git a/libs/minikin/WordBreaker.cpp b/libs/minikin/WordBreaker.cpp
index ca69a50..ec84c39 100644
--- a/libs/minikin/WordBreaker.cpp
+++ b/libs/minikin/WordBreaker.cpp
@@ -25,6 +25,7 @@
namespace android {
const uint32_t CHAR_SOFT_HYPHEN = 0x00AD;
+const uint16_t CHAR_ZWJ = 0x200D;
void WordBreaker::setLocale(const icu::Locale& locale) {
UErrorCode status = U_ZERO_ERROR;
@@ -62,6 +63,32 @@ enum ScanState {
SAW_COLON_SLASH_SLASH,
};
+/**
+ * Determine whether a line break at position i within the buffer buf is valid. This
+ * represents customization beyond the ICU behavior, because plain ICU provides some
+ * line break opportunities that we don't want.
+ *
+ * A break is rejected in two cases: when the code unit before it is a soft hyphen, and
+ * when that code unit is a ZWJ (U+200D) and the code point starting at i is one of the
+ * emoji listed in the body, so that an emoji ZWJ sequence is not split. Only the code
+ * point after the ZWJ is inspected; the one before it is not checked.
+ *
+ * @param codeUnit the UTF-16 code unit immediately before the candidate break (the
+ *                 caller in WordBreaker::next() passes mText[result - 1])
+ * @param buf      the UTF-16 text in which the break was found
+ * @param bufEnd   length of buf in code units; used as the limit when decoding the
+ *                 code point that follows the break
+ * @param i        index of the candidate break within buf; taken by value, so any
+ *                 change made to it while decoding is not seen by the caller
+ * @return false if the break should be skipped, true if it may be used
+ **/
+static bool isBreakValid(uint16_t codeUnit, const uint16_t* buf, size_t bufEnd, size_t i) {
+ if (codeUnit == CHAR_SOFT_HYPHEN) {
+ return false;
+ }
+ if (codeUnit == CHAR_ZWJ) {
+ // Possible emoji ZWJ sequence: look at the code point that follows the ZWJ
+ uint32_t next_codepoint;
+ U16_NEXT(buf, i, bufEnd, next_codepoint); // ICU macro: decodes the code point at index i (may span a surrogate pair)
+ if (next_codepoint == 0x2764 || // HEAVY BLACK HEART
+ next_codepoint == 0x1F466 || // BOY
+ next_codepoint == 0x1F467 || // GIRL
+ next_codepoint == 0x1F468 || // MAN
+ next_codepoint == 0x1F469 || // WOMAN
+ next_codepoint == 0x1F48B || // KISS MARK
+ next_codepoint == 0x1F5E8) { // LEFT SPEECH BUBBLE
+ return false; // ZWJ joins into a recognized emoji: suppress the break
+ }
+ }
+ return true;
+}
+
// Chicago Manual of Style recommends breaking after these characters in URLs and email addresses
static bool breakAfter(uint16_t c) {
return c == ':' || c == '=' || c == '&';
@@ -149,7 +176,7 @@ ssize_t WordBreaker::next() {
result = mBreakIterator->next();
}
} while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize
- && mText[result - 1] == CHAR_SOFT_HYPHEN);
+ && !isBreakValid(mText[result - 1], mText, mTextSize, result));
mCurrent = (ssize_t)result;
return mCurrent;
}
diff --git a/tests/WordBreakerTests.cpp b/tests/WordBreakerTests.cpp
index 9662b2f..6c5e479 100644
--- a/tests/WordBreakerTests.cpp
+++ b/tests/WordBreakerTests.cpp
@@ -67,6 +67,30 @@ TEST_F(WordBreakerTest, softHyphen) {
EXPECT_EQ(0, breaker.breakBadness());
}
+TEST_F(WordBreakerTest, zwjEmojiSequences) { // breaks are expected only between the three ZWJ sequences, never inside one
+ uint16_t buf[] = {
+ // man + zwj + heart + zwj + man (7 code units, offsets 0-6)
+ 0xD83D, 0xDC68, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC68,
+ // woman + zwj + heart + zwj + kiss mark + zwj + woman (10 code units, offsets 7-16)
+ 0xD83D, 0xDC69, 0x200D, 0x2764, 0x200D, 0xD83D, 0xDC8B, 0x200D, 0xD83D, 0xDC69,
+ // eye + zwj + left speech bubble (5 code units, offsets 17-21)
+ 0xD83D, 0xDC41, 0x200D, 0xD83D, 0xDDE8,
+ };
+ WordBreaker breaker;
+ breaker.setLocale(icu::Locale::getEnglish());
+ breaker.setText(buf, NELEM(buf));
+ EXPECT_EQ(0, breaker.current());
+ EXPECT_EQ(7, breaker.next()); // after man + zwj + heart + zwj + man
+ EXPECT_EQ(0, breaker.wordStart());
+ EXPECT_EQ(7, breaker.wordEnd());
+ EXPECT_EQ(17, breaker.next()); // after woman + zwj + heart + zwj + kiss mark + zwj + woman
+ EXPECT_EQ(7, breaker.wordStart());
+ EXPECT_EQ(17, breaker.wordEnd());
+ EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end of buffer, after eye + zwj + left speech bubble
+ EXPECT_EQ(17, breaker.wordStart());
+ EXPECT_EQ(22, breaker.wordEnd());
+}
+
TEST_F(WordBreakerTest, punct) {
uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l' ,'o', ',', ' ', 'w', 'o', 'r', 'l', 'd',
'!', '!'};