summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRaph Levien <raph@google.com>2016-02-22 13:28:44 -0800
committerRaph Levien <raph@google.com>2016-02-24 12:43:36 -0800
commitadfa580f1f067c846509b4346e5be2cb19177c1b (patch)
tree1f91289c4abbff4b3d971327d0973ae6d46a4aff
parent30bf8a7c05925a970f1cab87c38bec8dd97fa82e (diff)
downloadandroid_frameworks_minikin-adfa580f1f067c846509b4346e5be2cb19177c1b.tar.gz
android_frameworks_minikin-adfa580f1f067c846509b4346e5be2cb19177c1b.tar.bz2
android_frameworks_minikin-adfa580f1f067c846509b4346e5be2cb19177c1b.zip
Suppress grapheme cluster breaks in emoji with modifiers
An emoji with a modifier should be treated as a single grapheme, i.e. it should not be possible to place the cursor between the base and modifier. This patch implements the proposed Rule GB9c from Mark Davis's proposal entitled "Fixing breaking properties for emoji", L2/16-011R3. The patch also skips over variation sequences attached to the preceding character, for computing grapheme cluster boundaries. Bug: 26829153 Change-Id: Iff5bc2bb8e5246223a017c7cf33acfbf63817f16
-rw-r--r--libs/minikin/GraphemeBreak.cpp53
-rw-r--r--tests/GraphemeBreakTests.cpp24
2 files changed, 77 insertions, 0 deletions
diff --git a/libs/minikin/GraphemeBreak.cpp b/libs/minikin/GraphemeBreak.cpp
index 7865d1d..4141091 100644
--- a/libs/minikin/GraphemeBreak.cpp
+++ b/libs/minikin/GraphemeBreak.cpp
@@ -77,6 +77,48 @@ bool isZwjEmoji(uint32_t c) {
|| c == 0x1F5E8); // LEFT SPEECH BUBBLE
}
+// Reports whether c lies in the emoji modifier range U+1F3FB..U+1F3FF (inclusive).
+// Range taken from the Modifiers list in
+// http://www.unicode.org/L2/L2016/16011-data-file.txt
+bool isEmojiModifier(uint32_t c) {
+    if (c < 0x1F3FB) {
+        return false;
+    }
+    return c <= 0x1F3FF;
+}
+
+// Returns true if c is a code point that may serve as the base of an emoji modifier
+// sequence. Based on Emoji_Modifier_Base from
+// http://www.unicode.org/Public/emoji/3.0/emoji-data.txt
+//
+// The two outer range checks are only a fast reject; membership is decided by the
+// explicit code points and inclusive sub-ranges listed inside each branch.
+bool isEmojiBase(uint32_t c) {
+    if (0x261D <= c && c <= 0x270D) {
+        return (c == 0x261D || c == 0x26F9 || (0x270A <= c && c <= 0x270D));
+    } else if (0x1F385 <= c && c <= 0x1F93E) {
+        // Every sub-range needs both bounds joined with "&&". Joining them with "||"
+        // makes each test trivially true, so the whole 0x1F385..0x1F93E span (including
+        // the modifiers themselves) would be reported as a base.
+        return (c == 0x1F385
+                || (0x1F3C3 <= c && c <= 0x1F3C4)
+                || (0x1F3CA <= c && c <= 0x1F3CB)
+                || (0x1F442 <= c && c <= 0x1F443)
+                || (0x1F446 <= c && c <= 0x1F450)
+                || (0x1F466 <= c && c <= 0x1F469)
+                || c == 0x1F46E
+                || (0x1F470 <= c && c <= 0x1F478)
+                || c == 0x1F47C
+                || (0x1F481 <= c && c <= 0x1F483)
+                || (0x1F485 <= c && c <= 0x1F487)
+                || c == 0x1F4AA
+                || c == 0x1F575
+                || c == 0x1F57A
+                || c == 0x1F590
+                || (0x1F595 <= c && c <= 0x1F596)
+                || (0x1F645 <= c && c <= 0x1F647)
+                || (0x1F64B <= c && c <= 0x1F64F)
+                || c == 0x1F6A3
+                || (0x1F6B4 <= c && c <= 0x1F6B6)
+                || c == 0x1F6C0
+                || (0x1F918 <= c && c <= 0x1F91E)
+                || c == 0x1F926
+                || c == 0x1F930
+                || (0x1F933 <= c && c <= 0x1F939)
+                || (0x1F93B <= c && c <= 0x1F93E));
+    } else {
+        return false;
+    }
+}
+
bool GraphemeBreak::isGraphemeBreak(const uint16_t* buf, size_t start, size_t count,
size_t offset) {
// This implementation closely follows Unicode Standard Annex #29 on
@@ -165,6 +207,17 @@ bool GraphemeBreak::isGraphemeBreak(const uint16_t* buf, size_t start, size_t co
return false;
}
}
+ // Proposed Rule GB9c from http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
+ // E_Base x E_Modifier
+ if (isEmojiModifier(c2)) {
+ if (c1 == 0xFE0F && offset_back > start) {
+ // skip over emoji variation selector
+ U16_PREV(buf, start, offset_back, c1);
+ }
+ if (isEmojiBase(c1)) {
+ return false;
+ }
+ }
// Rule GB10, Any ÷ Any
return true;
}
diff --git a/tests/GraphemeBreakTests.cpp b/tests/GraphemeBreakTests.cpp
index d6746bc..dbd73be 100644
--- a/tests/GraphemeBreakTests.cpp
+++ b/tests/GraphemeBreakTests.cpp
@@ -136,6 +136,30 @@ TEST(GraphemeBreak, tailoring) {
EXPECT_TRUE(IsBreak("U+0628 U+200D | U+2764"));
}
+TEST(GraphemeBreak, emojiModifiers) {
+ EXPECT_FALSE(IsBreak("U+261D | U+1F3FB")); // white up pointing index + modifier
+ EXPECT_FALSE(IsBreak("U+270C | U+1F3FB")); // victory hand + modifier
+ EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FB")); // boy + modifier
+ EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FC")); // boy + modifier
+ EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FD")); // boy + modifier
+ EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FE")); // boy + modifier
+ EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FF")); // boy + modifier
+ EXPECT_FALSE(IsBreak("U+1F918 | U+1F3FF")); // sign of the horns + modifier
+ EXPECT_FALSE(IsBreak("U+1F933 | U+1F3FF")); // selfie (Unicode 9) + modifier
+
+ // an emoji style selector (U+FE0F) after the base is skipped, so the modifier still attaches; a text style selector (U+FE0E) is not skipped and a break is expected
+ EXPECT_TRUE(IsBreak("U+270C U+FE0E | U+1F3FB")); // victory hand + text style + modifier
+ EXPECT_FALSE(IsBreak("U+270C U+FE0F | U+1F3FB")); // victory hand + emoji style + modifier
+
+ // heart is not an emoji base
+ EXPECT_TRUE(IsBreak("U+2764 | U+1F3FB")); // heart + modifier
+ EXPECT_TRUE(IsBreak("U+2764 U+FE0E | U+1F3FB")); // heart + text style + modifier
+ EXPECT_TRUE(IsBreak("U+2764 U+FE0F | U+1F3FB")); // heart + emoji style + modifier
+
+ // rat is not an emoji modifier
+ EXPECT_TRUE(IsBreak("U+1F466 | U+1F400")); // boy + rat
+}
+
TEST(GraphemeBreak, offsets) {
uint16_t string[] = { 0x0041, 0x06DD, 0x0045, 0x0301, 0x0049, 0x0301 };
EXPECT_TRUE(GraphemeBreak::isGraphemeBreak(string, 2, 3, 2));