From 77f488345316fba46c271fc04bea470819ae1712 Mon Sep 17 00:00:00 2001
From: Seigo Nonaka <nona@google.com>
Date: Tue, 19 Apr 2016 17:14:27 +0900
Subject: Do not break before and after ZWJ.

The emoji list is generated from external/unicode/emoji-data.txt

Bug: 28248662
Change-Id: Ie49b3782505665d62c24371ca23d317ae5e9c5f7
---
 libs/minikin/Android.mk             |  16 +++++-
 libs/minikin/GraphemeBreak.cpp      |  17 +-----
 libs/minikin/MinikinInternal.cpp    |   6 +++
 libs/minikin/MinikinInternal.h      |   3 ++
 libs/minikin/WordBreaker.cpp        |  15 ++----
 libs/minikin/unicode_emoji_h_gen.py | 105 ++++++++++++++++++++++++++++++++++++
 tests/Android.mk                    |   1 +
 tests/GraphemeBreakTests.cpp        |   4 ++
 tests/MinikinInternalTest.cpp       |  34 ++++++++++++
 tests/WordBreakerTests.cpp          |   7 ++-
 10 files changed, 179 insertions(+), 29 deletions(-)
 create mode 100644 libs/minikin/unicode_emoji_h_gen.py
 create mode 100644 tests/MinikinInternalTest.cpp

diff --git a/libs/minikin/Android.mk b/libs/minikin/Android.mk
index 2b5ff06..9d82579 100644
--- a/libs/minikin/Android.mk
+++ b/libs/minikin/Android.mk
@@ -15,7 +15,20 @@
 LOCAL_PATH := $(call my-dir)
 
 include $(CLEAR_VARS)
+# Generate the emoji code point header from external/unicode/emoji-data.txt.
+UNICODE_EMOJI_H_GEN_PY := $(LOCAL_PATH)/unicode_emoji_h_gen.py
+UNICODE_EMOJI_DATA := $(TOP)/external/unicode/emoji-data.txt
+
+UNICODE_EMOJI_H := $(intermediates)/generated/UnicodeData.h
+$(UNICODE_EMOJI_H): $(UNICODE_EMOJI_H_GEN_PY) $(UNICODE_EMOJI_DATA)
+$(LOCAL_PATH)/MinikinInternal.cpp: $(UNICODE_EMOJI_H)
+$(UNICODE_EMOJI_H): PRIVATE_CUSTOM_TOOL := python $(UNICODE_EMOJI_H_GEN_PY) \
+    -i $(UNICODE_EMOJI_DATA) \
+    -o $(UNICODE_EMOJI_H)
+$(UNICODE_EMOJI_H):
+		$(transform-generated-source)
 
+include $(CLEAR_VARS)
 minikin_src_files := \
     AnalyzeStyle.cpp \
     CmapCoverage.cpp \
@@ -40,7 +53,8 @@ minikin_src_files := \
 minikin_c_includes := \
     external/harfbuzz_ng/src \
     external/freetype/include \
-    frameworks/minikin/include
+    frameworks/minikin/include \
+    $(intermediates)
 
 minikin_shared_libraries := \
     libharfbuzz_ng \
diff --git a/libs/minikin/GraphemeBreak.cpp b/libs/minikin/GraphemeBreak.cpp
index 1f361ba..45dd0ff 100644
--- a/libs/minikin/GraphemeBreak.cpp
+++ b/libs/minikin/GraphemeBreak.cpp
@@ -66,19 +66,6 @@ bool isPureKiller(uint32_t c) {
             || c == 0xA953 || c == 0xABED || c == 0x11134 || c == 0x112EA || c == 0x1172B);
 }
 
-// Returns true if the character appears before or after zwj in a zwj emoji sequence. See
-// http://www.unicode.org/emoji/charts/emoji-zwj-sequences.html
-bool isZwjEmoji(uint32_t c) {
-    return (c == 0x2764       // HEAVY BLACK HEART
-            || c == 0x1F468   // MAN
-            || c == 0x1F469   // WOMAN
-            || c == 0x1F48B   // KISS MARK
-            || c == 0x1F466   // BOY
-            || c == 0x1F467   // GIRL
-            || c == 0x1F441   // EYE
-            || c == 0x1F5E8); // LEFT SPEECH BUBBLE
-}
-
 bool GraphemeBreak::isGraphemeBreak(const uint16_t* buf, size_t start, size_t count,
         size_t offset) {
     // This implementation closely follows Unicode Standard Annex #29 on
@@ -163,7 +150,7 @@ bool GraphemeBreak::isGraphemeBreak(const uint16_t* buf, size_t start, size_t co
         return false;
     }
     // Tailoring: make emoji sequences with ZWJ a single grapheme cluster
-    if (c1 == 0x200D && isZwjEmoji(c2) && offset_back > start) {
+    if (c1 == 0x200D && isEmoji(c2) && offset_back > start) {
         // look at character before ZWJ to see that both can participate in an emoji zwj sequence
         uint32_t c0 = 0;
         U16_PREV(buf, start, offset_back, c0);
@@ -171,7 +158,7 @@ bool GraphemeBreak::isGraphemeBreak(const uint16_t* buf, size_t start, size_t co
             // skip over emoji variation selector
             U16_PREV(buf, start, offset_back, c0);
         }
-        if (isZwjEmoji(c0)) {
+        if (isEmoji(c0)) {
             return false;
         }
     }
diff --git a/libs/minikin/MinikinInternal.cpp b/libs/minikin/MinikinInternal.cpp
index e00f639..7fcc7b7 100644
--- a/libs/minikin/MinikinInternal.cpp
+++ b/libs/minikin/MinikinInternal.cpp
@@ -18,6 +18,7 @@
 
 #include "MinikinInternal.h"
 #include "HbFontCache.h"
+#include "generated/UnicodeData.h"
 
 #include <cutils/log.h>
 
@@ -31,6 +32,11 @@ void assertMinikinLocked() {
 #endif
 }
 
+bool isEmoji(uint32_t c) {
+    using generated::EMOJI_LIST;  // Binary search below assumes ascending order.
+    return std::binary_search(EMOJI_LIST, EMOJI_LIST + sizeof(EMOJI_LIST) / sizeof(*EMOJI_LIST), c);
+}
+
 // Based on Modifiers from http://www.unicode.org/L2/L2016/16011-data-file.txt
 bool isEmojiModifier(uint32_t c) {
     return (0x1F3FB <= c && c <= 0x1F3FF);
diff --git a/libs/minikin/MinikinInternal.h b/libs/minikin/MinikinInternal.h
index 709fb9f..88cc947 100644
--- a/libs/minikin/MinikinInternal.h
+++ b/libs/minikin/MinikinInternal.h
@@ -36,6 +36,9 @@ extern Mutex gMinikinLock;
 // Aborts if gMinikinLock is not acquired. Do nothing on the release build.
 void assertMinikinLocked();
 
+// Returns true if c has the Emoji property in external/unicode/emoji-data.txt.
+bool isEmoji(uint32_t c);
+
 // Returns true if c is emoji modifier base.
 bool isEmojiBase(uint32_t c);
 
diff --git a/libs/minikin/WordBreaker.cpp b/libs/minikin/WordBreaker.cpp
index d420a6a..34e7a93 100644
--- a/libs/minikin/WordBreaker.cpp
+++ b/libs/minikin/WordBreaker.cpp
@@ -90,18 +90,9 @@ static bool isBreakValid(const uint16_t* buf, size_t bufEnd, size_t i) {
         }
     }
 
-    // Known emoji ZWJ sequences
-    if (codePoint == CHAR_ZWJ) {
-        // Possible emoji ZWJ sequence
-        if (next_codepoint == 0x2764 ||       // HEAVY BLACK HEART
-                next_codepoint == 0x1F466 ||  // BOY
-                next_codepoint == 0x1F467 ||  // GIRL
-                next_codepoint == 0x1F468 ||  // MAN
-                next_codepoint == 0x1F469 ||  // WOMAN
-                next_codepoint == 0x1F48B ||  // KISS MARK
-                next_codepoint == 0x1F5E8) {  // LEFT SPEECH BUBBLE
-            return false;
-        }
+    // Emoji ZWJ sequences.
+    if (codePoint == CHAR_ZWJ && isEmoji(next_codepoint)) {
+        return false;
     }
 
     // Proposed Rule LB30b from http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
diff --git a/libs/minikin/unicode_emoji_h_gen.py b/libs/minikin/unicode_emoji_h_gen.py
new file mode 100644
index 0000000..7233ef6
--- /dev/null
+++ b/libs/minikin/unicode_emoji_h_gen.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+#
+# Copyright (C) 2016 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Generate header file for unicode data."""
+
+import optparse
+import sys
+
+# Skeleton of the generated C++ header; @@@EMOJI_DATA@@@ is replaced by the array rows.
+UNICODE_EMOJI_TEMPLATE="""
+/* file generated by frameworks/minikin/libs/minikin/Android.mk */
+#ifndef MINIKIN_UNICODE_EMOJI_H
+#define MINIKIN_UNICODE_EMOJI_H
+
+#include <stdint.h>
+
+namespace android {
+namespace generated {
+
+const uint32_t EMOJI_LIST[] = {
+@@@EMOJI_DATA@@@
+};
+
+}  // namespace generated
+}  // namespace android
+
+#endif  // MINIKIN_UNICODE_EMOJI_H
+"""
+
+
+def _create_opt_parser():
+  """Returns an OptionParser accepting the -i/--input and -o/--output paths."""
+  parser = optparse.OptionParser()
+  parser.add_option('-i', '--input', type='str', action='store',
+                    help='path to input emoji-data.txt')
+  parser.add_option('-o', '--output', type='str', action='store', help='path to output header')
+  return parser
+
+
+def _read_emoji_data(emoji_data_file_path):
+  """Returns the sorted code points with the Emoji property in emoji-data.txt."""
+  result = set()
+  with open(emoji_data_file_path) as emoji_data_file:
+    for line in emoji_data_file:
+      line = line.split('#', 1)[0].strip()  # Drop comments and whitespace.
+      if not line:
+        continue  # Skip empty or comment-only lines.
+
+      fields = [field.strip() for field in line.split(';')]
+      if len(fields) < 2 or fields[1] != 'Emoji':
+        # Do not stop here: skipping keeps the result independent of the order
+        # in which the property sections appear in the data file.
+        continue
+
+      # A field is either a single code point or an inclusive "start..end" range.
+      cp_start, _, cp_end = fields[0].partition('..')
+      first = int(cp_start, 16)
+      last = int(cp_end, 16) if cp_end else first
+      result.update(range(first, last + 1))
+  # isEmoji() runs std::binary_search over the generated array, so sort it.
+  return sorted(result)
+
+
+def _generate_header_contents(emoji_list):
+  """Returns the header text with emoji_list as hex literals wrapped at 100 columns."""
+  hex_list = ['0x%04X' % code_point for code_point in emoji_list]
+  if not hex_list:
+    raise ValueError('no Emoji code points to emit; check the input data file')
+  indent = ' ' * 4
+  joiner = ', '
+
+  rows = []
+  row = indent + hex_list[0]
+  for hex_str in hex_list[1:]:
+    if len(row) + len(joiner) + len(hex_str) >= 100:
+      rows.append(row + ',')  # Row is full: terminate it and start a new one.
+      row = indent + hex_str
+    else:
+      row = row + joiner + hex_str
+  rows.append(row)
+  return UNICODE_EMOJI_TEMPLATE.replace('@@@EMOJI_DATA@@@', '\n'.join(rows))
+
+
+if __name__ == '__main__':
+  opt_parser = _create_opt_parser()
+  opts, _ = opt_parser.parse_args()
+  if not opts.input or not opts.output:
+    # Without this, a missing flag surfaces as an opaque open(None) TypeError.
+    opt_parser.error('both --input and --output are required')
+  header = _generate_header_contents(_read_emoji_data(opts.input))
+  with open(opts.output, 'w') as header_file:
+    header_file.write(header)
diff --git a/tests/Android.mk b/tests/Android.mk
index e6586d7..b33631e 100644
--- a/tests/Android.mk
+++ b/tests/Android.mk
@@ -77,6 +77,7 @@ LOCAL_SRC_FILES += \
     FontTestUtils.cpp \
     HbFontCacheTest.cpp \
     MinikinFontForTest.cpp \
+    MinikinInternalTest.cpp \
     GraphemeBreakTests.cpp \
     LayoutUtilsTest.cpp \
     UnicodeUtils.cpp \
diff --git a/tests/GraphemeBreakTests.cpp b/tests/GraphemeBreakTests.cpp
index 3bfa5ec..cec5308 100644
--- a/tests/GraphemeBreakTests.cpp
+++ b/tests/GraphemeBreakTests.cpp
@@ -148,6 +148,10 @@ TEST(GraphemeBreak, tailoring) {
     EXPECT_FALSE(IsBreak("U+1F469 U+200D U+1F469 U+200D U+1F467 U+200D | U+1F466"));
     EXPECT_FALSE(IsBreak("U+1F441 U+200D | U+1F5E8"));
 
+    // Do not break before or after ZWJ when it joins any two emoji characters.
+    EXPECT_FALSE(IsBreak("U+1F431 | U+200D U+1F464"));
+    EXPECT_FALSE(IsBreak("U+1F431 U+200D | U+1F464"));
+
     // ARABIC LETTER BEH + ZWJ + heart, not a zwj emoji sequence, so we preserve the break
     EXPECT_TRUE(IsBreak("U+0628 U+200D | U+2764"));
 }
diff --git a/tests/MinikinInternalTest.cpp b/tests/MinikinInternalTest.cpp
new file mode 100644
index 0000000..9c1a1e5
--- /dev/null
+++ b/tests/MinikinInternalTest.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2016 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "MinikinInternal.h"
+
+namespace android {
+
+TEST(MinikinInternalTest, isEmojiTest) {  // Spot-checks isEmoji() for members and non-members.
+    EXPECT_TRUE(isEmoji(0x0023));  // NUMBER SIGN
+    EXPECT_TRUE(isEmoji(0x0035));  // DIGIT FIVE
+    EXPECT_TRUE(isEmoji(0x1F0CF));  // PLAYING CARD BLACK JOKER
+    EXPECT_TRUE(isEmoji(0x1F1E9));  // REGIONAL INDICATOR SYMBOL LETTER D
+
+    EXPECT_FALSE(isEmoji(0x0000));  // <control> (NULL)
+    EXPECT_FALSE(isEmoji(0x0061));  // LATIN SMALL LETTER A
+    EXPECT_FALSE(isEmoji(0x29E3D));  // A CJK ideograph in the supplementary planes.
+}
+
+}  // namespace android
diff --git a/tests/WordBreakerTests.cpp b/tests/WordBreakerTests.cpp
index 480c57d..9fa9da3 100644
--- a/tests/WordBreakerTests.cpp
+++ b/tests/WordBreakerTests.cpp
@@ -93,6 +93,8 @@ TEST_F(WordBreakerTest, zwjEmojiSequences) {
         UTF16(0x1F469), 0x200D, 0x2764, 0x200D, UTF16(0x1F48B), 0x200D, UTF16(0x1F469),
         // eye + zwj + left speech bubble
         UTF16(0x1F441), 0x200D, UTF16(0x1F5E8),
+        // CAT FACE + zwj + BUST IN SILHOUETTE
+        UTF16(0x1F431), 0x200D, UTF16(0x1F464),
     };
     WordBreaker breaker;
     breaker.setLocale(icu::Locale::getEnglish());
@@ -104,9 +106,12 @@ TEST_F(WordBreakerTest, zwjEmojiSequences) {
     EXPECT_EQ(17, breaker.next());  // after woman + zwj + heart + zwj + woman
     EXPECT_EQ(7, breaker.wordStart());
     EXPECT_EQ(17, breaker.wordEnd());
-    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
+    EXPECT_EQ(22, breaker.next());  // after eye + zwj + left speech bubble
     EXPECT_EQ(17, breaker.wordStart());
     EXPECT_EQ(22, breaker.wordEnd());
+    EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
+    EXPECT_EQ(22, breaker.wordStart());
+    EXPECT_EQ(27, breaker.wordEnd());
 }
 
 TEST_F(WordBreakerTest, emojiWithModifier) {
-- 
cgit v1.2.3