From 77f488345316fba46c271fc04bea470819ae1712 Mon Sep 17 00:00:00 2001 From: Seigo Nonaka Date: Tue, 19 Apr 2016 17:14:27 +0900 Subject: Do not break before and after ZWJ. The emoji list is generated from external/unicode/emoji-data.txt Bug: 28248662 Change-Id: Ie49b3782505665d62c24371ca23d317ae5e9c5f7 --- libs/minikin/Android.mk | 16 +++++- libs/minikin/GraphemeBreak.cpp | 17 +----- libs/minikin/MinikinInternal.cpp | 6 +++ libs/minikin/MinikinInternal.h | 3 ++ libs/minikin/WordBreaker.cpp | 15 ++---- libs/minikin/unicode_emoji_h_gen.py | 105 ++++++++++++++++++++++++++++++++++++ tests/Android.mk | 1 + tests/GraphemeBreakTests.cpp | 4 ++ tests/MinikinInternalTest.cpp | 34 ++++++++++++ tests/WordBreakerTests.cpp | 7 ++- 10 files changed, 179 insertions(+), 29 deletions(-) create mode 100644 libs/minikin/unicode_emoji_h_gen.py create mode 100644 tests/MinikinInternalTest.cpp diff --git a/libs/minikin/Android.mk b/libs/minikin/Android.mk index 2b5ff06..9d82579 100644 --- a/libs/minikin/Android.mk +++ b/libs/minikin/Android.mk @@ -15,7 +15,20 @@ LOCAL_PATH := $(call my-dir) include $(CLEAR_VARS) +# Generate unicode emoji data from UCD. +UNICODE_EMOJI_H_GEN_PY := $(LOCAL_PATH)/unicode_emoji_h_gen.py +UNICODE_EMOJI_DATA := $(TOP)/external/unicode/emoji-data.txt + +UNICODE_EMOJI_H := $(intermediates)/generated/UnicodeData.h +$(UNICODE_EMOJI_H): $(UNICODE_EMOJI_H_GEN_PY) $(UNICODE_EMOJI_DATA) +$(LOCAL_PATH)/MinikinInternal.cpp: $(UNICODE_EMOJI_H) +$(UNICODE_EMOJI_H): PRIVATE_CUSTOM_TOOL := python $(UNICODE_EMOJI_H_GEN_PY) \ + -i $(UNICODE_EMOJI_DATA) \ + -o $(UNICODE_EMOJI_H) +$(UNICODE_EMOJI_H): + $(transform-generated-source) +include $(CLEAR_VARS) minikin_src_files := \ AnalyzeStyle.cpp \ CmapCoverage.cpp \ @@ -40,7 +53,8 @@ minikin_src_files := \ minikin_c_includes := \ external/harfbuzz_ng/src \ external/freetype/include \ - frameworks/minikin/include + frameworks/minikin/include \ + $(intermediates) minikin_shared_libraries := \ libharfbuzz_ng \ diff --git a/libs/minikin/GraphemeBreak.cpp b/libs/minikin/GraphemeBreak.cpp index 1f361ba..45dd0ff 100644 --- a/libs/minikin/GraphemeBreak.cpp +++ b/libs/minikin/GraphemeBreak.cpp @@ -66,19 +66,6 @@ bool isPureKiller(uint32_t c) { || c == 0xA953 || c == 0xABED || c == 0x11134 || c == 0x112EA || c == 0x1172B); } -// Returns true if the character appears before or after zwj in a zwj emoji sequence. See -// http://www.unicode.org/emoji/charts/emoji-zwj-sequences.html -bool isZwjEmoji(uint32_t c) { - return (c == 0x2764 // HEAVY BLACK HEART - || c == 0x1F468 // MAN - || c == 0x1F469 // WOMAN - || c == 0x1F48B // KISS MARK - || c == 0x1F466 // BOY - || c == 0x1F467 // GIRL - || c == 0x1F441 // EYE - || c == 0x1F5E8); // LEFT SPEECH BUBBLE -} - bool GraphemeBreak::isGraphemeBreak(const uint16_t* buf, size_t start, size_t count, size_t offset) { // This implementation closely follows Unicode Standard Annex #29 on @@ -163,7 +150,7 @@ bool GraphemeBreak::isGraphemeBreak(const uint16_t* buf, size_t start, size_t co return false; } // Tailoring: make emoji sequences with ZWJ a single grapheme cluster - if (c1 == 0x200D && isZwjEmoji(c2) && offset_back > start) { + if (c1 == 0x200D && isEmoji(c2) && offset_back > start) { // look at character before ZWJ to see that both can participate in an emoji zwj sequence uint32_t c0 = 0; U16_PREV(buf, start, offset_back, c0); @@ -171,7 +158,7 @@ bool GraphemeBreak::isGraphemeBreak(const uint16_t* buf, size_t start, size_t co // skip over emoji variation selector U16_PREV(buf, start, offset_back, c0); } - if (isZwjEmoji(c0)) { + if (isEmoji(c0)) { return false; } } diff --git a/libs/minikin/MinikinInternal.cpp b/libs/minikin/MinikinInternal.cpp index e00f639..7fcc7b7 100644 --- a/libs/minikin/MinikinInternal.cpp +++ b/libs/minikin/MinikinInternal.cpp @@ -18,6 +18,7 @@ #include "MinikinInternal.h" #include "HbFontCache.h" +#include "generated/UnicodeData.h" #include @@ -31,6 +32,11 @@ void assertMinikinLocked() { #endif } +bool isEmoji(uint32_t c) { + const size_t length = sizeof(generated::EMOJI_LIST) / sizeof(generated::EMOJI_LIST[0]); + return std::binary_search(generated::EMOJI_LIST, generated::EMOJI_LIST + length, c); +} + // Based on Modifiers from http://www.unicode.org/L2/L2016/16011-data-file.txt bool isEmojiModifier(uint32_t c) { return (0x1F3FB <= c && c <= 0x1F3FF); diff --git a/libs/minikin/MinikinInternal.h b/libs/minikin/MinikinInternal.h index 709fb9f..88cc947 100644 --- a/libs/minikin/MinikinInternal.h +++ b/libs/minikin/MinikinInternal.h @@ -36,6 +36,9 @@ extern Mutex gMinikinLock; // Aborts if gMinikinLock is not acquired. Do nothing on the release build. void assertMinikinLocked(); +// Returns true if c is emoji. +bool isEmoji(uint32_t c); + // Returns true if c is emoji modifier base. bool isEmojiBase(uint32_t c); diff --git a/libs/minikin/WordBreaker.cpp b/libs/minikin/WordBreaker.cpp index d420a6a..34e7a93 100644 --- a/libs/minikin/WordBreaker.cpp +++ b/libs/minikin/WordBreaker.cpp @@ -90,18 +90,9 @@ static bool isBreakValid(const uint16_t* buf, size_t bufEnd, size_t i) { } } - // Known emoji ZWJ sequences - if (codePoint == CHAR_ZWJ) { - // Possible emoji ZWJ sequence - if (next_codepoint == 0x2764 || // HEAVY BLACK HEART - next_codepoint == 0x1F466 || // BOY - next_codepoint == 0x1F467 || // GIRL - next_codepoint == 0x1F468 || // MAN - next_codepoint == 0x1F469 || // WOMAN - next_codepoint == 0x1F48B || // KISS MARK - next_codepoint == 0x1F5E8) { // LEFT SPEECH BUBBLE - return false; - } + // Emoji ZWJ sequences. + if (codePoint == CHAR_ZWJ && isEmoji(next_codepoint)) { + return false; } // Proposed Rule LB30b from http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf diff --git a/libs/minikin/unicode_emoji_h_gen.py b/libs/minikin/unicode_emoji_h_gen.py new file mode 100644 index 0000000..7233ef6 --- /dev/null +++ b/libs/minikin/unicode_emoji_h_gen.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python +# +# Copyright (C) 2016 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Generate header file for unicode data.""" + +import optparse +import sys + + +UNICODE_EMOJI_TEMPLATE=""" +/* file generated by frameworks/minikin/lib/minikin/Android.mk */ +#ifndef MINIKIN_UNICODE_EMOJI_H +#define MINIKIN_UNICODE_EMOJI_H + +#include + +namespace android { +namespace generated { + +int32_t EMOJI_LIST[] = { +@@@EMOJI_DATA@@@ +}; + +} // namespace generated +} // namespace android + +#endif // MINIKIN_UNICODE_EMOJI_H +""" + + +def _create_opt_parser(): + parser = optparse.OptionParser() + parser.add_option('-i', '--input', type='str', action='store', + help='path to input emoji-data.txt') + parser.add_option('-o', '--output', type='str', action='store', + help='path to output UnicodeEmoji.h') + return parser + + +def _read_emoji_data(emoji_data_file_path): + result = [] + with open(emoji_data_file_path) as emoji_data_file: + for line in emoji_data_file: + if '#' in line: + line = line[:line.index('#')] # Drop comments. + if not line.strip(): + continue # Skip empty line. + + code_points, prop = line.split(';') + code_points = code_points.strip() + prop = prop.strip() + if prop != 'Emoji': + break # Only collect Emoji property code points + + if '..' in code_points: # code point range + cp_start, cp_end = code_points.split('..') + result.extend(xrange(int(cp_start, 16), int(cp_end, 16) + 1)) + else: + code_point = int(code_points, 16) + result.append(code_point) + return result + + +def _generate_header_contents(emoji_list): + INDENT = ' ' * 4 + JOINER = ', ' + + hex_list = ['0x%04X' % x for x in emoji_list] + lines = [] + tmp_line = '%s%s' % (INDENT, hex_list[0]) + for hex_str in hex_list[1:]: + if len(tmp_line) + len(JOINER) + len(hex_str) >= 100: + lines.append(tmp_line + ',') + tmp_line = '%s%s' % (INDENT, hex_str) + else: + tmp_line = '%s%s%s' % (tmp_line, JOINER, hex_str) + lines.append(tmp_line) + + template = UNICODE_EMOJI_TEMPLATE + template = template.replace('@@@EMOJI_DATA@@@', '\n'.join(lines)) + return template + + +if __name__ == '__main__': + opt_parser = _create_opt_parser() + opts, _ = opt_parser.parse_args() + + emoji_list = _read_emoji_data(opts.input) + header = _generate_header_contents(emoji_list) + with open(opts.output, 'w') as header_file: + header_file.write(header) + diff --git a/tests/Android.mk b/tests/Android.mk index e6586d7..b33631e 100644 --- a/tests/Android.mk +++ b/tests/Android.mk @@ -77,6 +77,7 @@ LOCAL_SRC_FILES += \ FontTestUtils.cpp \ HbFontCacheTest.cpp \ MinikinFontForTest.cpp \ + MinikinInternalTest.cpp \ GraphemeBreakTests.cpp \ LayoutUtilsTest.cpp \ UnicodeUtils.cpp \ diff --git a/tests/GraphemeBreakTests.cpp b/tests/GraphemeBreakTests.cpp index 3bfa5ec..cec5308 100644 --- a/tests/GraphemeBreakTests.cpp +++ b/tests/GraphemeBreakTests.cpp @@ -148,6 +148,10 @@ TEST(GraphemeBreak, tailoring) { EXPECT_FALSE(IsBreak("U+1F469 U+200D U+1F469 U+200D U+1F467 U+200D | U+1F466")); EXPECT_FALSE(IsBreak("U+1F441 U+200D | U+1F5E8")); + // Do not break before and after zwj with all kind of emoji characters. + EXPECT_FALSE(IsBreak("U+1F431 | U+200D U+1F464")); + EXPECT_FALSE(IsBreak("U+1F431 U+200D | U+1F464")); + // ARABIC LETTER BEH + ZWJ + heart, not a zwj emoji sequence, so we preserve the break EXPECT_TRUE(IsBreak("U+0628 U+200D | U+2764")); } diff --git a/tests/MinikinInternalTest.cpp b/tests/MinikinInternalTest.cpp new file mode 100644 index 0000000..9c1a1e5 --- /dev/null +++ b/tests/MinikinInternalTest.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (C) 2016 The Android Open Source Project + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include "MinikinInternal.h" + +namespace android { + +TEST(MinikinInternalTest, isEmojiTest) { + EXPECT_TRUE(isEmoji(0x0023)); // NUMBER SIGN + EXPECT_TRUE(isEmoji(0x0035)); // DIGIT FIVE + EXPECT_TRUE(isEmoji(0x1F0CF)); // PLAYING CARD BLACK JOKER + EXPECT_TRUE(isEmoji(0x1F1E9)); // REGIONAL INDICATOR SYMBOL LETTER D + + EXPECT_FALSE(isEmoji(0x0000)); // + EXPECT_FALSE(isEmoji(0x0061)); // LATIN SMALL LETTER A + EXPECT_FALSE(isEmoji(0x29E3D)); // A han character. +} + +} // namespace android diff --git a/tests/WordBreakerTests.cpp b/tests/WordBreakerTests.cpp index 480c57d..9fa9da3 100644 --- a/tests/WordBreakerTests.cpp +++ b/tests/WordBreakerTests.cpp @@ -93,6 +93,8 @@ TEST_F(WordBreakerTest, zwjEmojiSequences) { UTF16(0x1F469), 0x200D, 0x2764, 0x200D, UTF16(0x1F48B), 0x200D, UTF16(0x1F469), // eye + zwj + left speech bubble UTF16(0x1F441), 0x200D, UTF16(0x1F5E8), + // CAT FACE + zwj + BUST IN SILHOUETTE + UTF16(0x1F431), 0x200D, UTF16(0x1F464), }; WordBreaker breaker; breaker.setLocale(icu::Locale::getEnglish()); @@ -104,9 +106,12 @@ TEST_F(WordBreakerTest, zwjEmojiSequences) { EXPECT_EQ(17, breaker.next()); // after woman + zwj + heart + zwj + woman EXPECT_EQ(7, breaker.wordStart()); EXPECT_EQ(17, breaker.wordEnd()); - EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end + EXPECT_EQ(22, breaker.next()); // after eye + zwj + left speech bubble EXPECT_EQ(17, breaker.wordStart()); EXPECT_EQ(22, breaker.wordEnd()); + EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end + EXPECT_EQ(22, breaker.wordStart()); + EXPECT_EQ(27, breaker.wordEnd()); } TEST_F(WordBreakerTest, emojiWithModifier) { -- cgit v1.2.3