From 450e96c8170c3d59a5896e734c90d3f9def505f8 Mon Sep 17 00:00:00 2001 From: Seigo Nonaka Date: Wed, 2 Mar 2016 13:53:54 -0800 Subject: Break regional indicators at even numbered code points. Bug: 23288449 Change-Id: If1419ff9e44e8e640616979bae88311f414b42a1 --- libs/minikin/GraphemeBreak.cpp | 27 ++++++++++++++++++--------- tests/GraphemeBreakTests.cpp | 16 ++++++++++++++++ 2 files changed, 34 insertions(+), 9 deletions(-) diff --git a/libs/minikin/GraphemeBreak.cpp b/libs/minikin/GraphemeBreak.cpp index ef323d5..1f361ba 100644 --- a/libs/minikin/GraphemeBreak.cpp +++ b/libs/minikin/GraphemeBreak.cpp @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -124,17 +125,25 @@ bool GraphemeBreak::isGraphemeBreak(const uint16_t* buf, size_t start, size_t co if ((p1 == U_GCB_LVT || p1 == U_GCB_T) && p2 == U_GCB_T) { return false; } - // Rule GB8a, Regional_Indicator x Regional_Indicator + // Rule GB8a that looks at even-odd cases. // - // Known limitation: This is overly conservative, and returns no grapheme breaks between two - // flags, such as in the character sequence "U+1F1FA U+1F1F8 [potential break] U+1F1FA U+1F1F8". - // Also, it assumes that all combinations of Regional Indicators produce a flag, where they - // don't. - // - // There is no easy solution for doing this correctly, except for querying the font and doing - // some lookback. + // sot (RI RI)* RI x RI + // [^RI] (RI RI)* RI x RI + // RI ÷ RI if (p1 == U_GCB_REGIONAL_INDICATOR && p2 == U_GCB_REGIONAL_INDICATOR) { - return false; + // Look at up to 1000 code units. + start = std::max((ssize_t)start, (ssize_t)offset_back - 1000); + while (offset_back > start) { + U16_PREV(buf, start, offset_back, c1); + if (tailoredGraphemeClusterBreak(c1) != U_GCB_REGIONAL_INDICATOR) { + offset_back += U16_LENGTH(c1); + break; + } + } + + // Note that the offset has moved forward 2 code units by U16_NEXT. + // The number 4 comes from the number of code units in a whole flag. 
+ return (offset - 2 - offset_back) % 4 == 0; } // Rule GB9, x Extend; Rule GB9a, x SpacingMark; Rule GB9b, Prepend x if (p2 == U_GCB_EXTEND || p2 == U_GCB_SPACING_MARK || p1 == U_GCB_PREPEND) { diff --git a/tests/GraphemeBreakTests.cpp b/tests/GraphemeBreakTests.cpp index 7e17203..3bfa5ec 100644 --- a/tests/GraphemeBreakTests.cpp +++ b/tests/GraphemeBreakTests.cpp @@ -84,6 +84,22 @@ TEST(GraphemeBreak, rules) { // Rule GB8a, Regional_Indicator x Regional_Indicator EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8")); + EXPECT_TRUE(IsBreak("U+1F1FA U+1F1F8 | U+1F1FA U+1F1F8")); // Regional indicator pair (flag) + EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8 U+1F1FA U+1F1F8")); // Regional indicator pair (flag) + EXPECT_FALSE(IsBreak("U+1F1FA U+1F1F8 U+1F1FA | U+1F1F8")); // Regional indicator pair (flag) + + EXPECT_TRUE(IsBreak("U+1F1FA U+1F1F8 | U+1F1FA")); // Regional indicator pair (flag) + EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8 U+1F1FA")); // Regional indicator pair (flag) + + EXPECT_TRUE(IsBreak("'a' U+1F1FA U+1F1F8 | U+1F1FA")); // Regional indicator pair (flag) + EXPECT_FALSE(IsBreak("'a' U+1F1FA | U+1F1F8 U+1F1FA")); // Regional indicator pair (flag) + + EXPECT_TRUE( + IsBreak("'a' U+1F1FA U+1F1F8 | U+1F1FA U+1F1F8")); // Regional indicator pair (flag) + EXPECT_FALSE( + IsBreak("'a' U+1F1FA | U+1F1F8 U+1F1FA U+1F1F8")); // Regional indicator pair (flag) + EXPECT_FALSE( + IsBreak("'a' U+1F1FA U+1F1F8 U+1F1FA | U+1F1F8")); // Regional indicator pair (flag) // Rule GB9, x Extend EXPECT_FALSE(IsBreak("'a' | U+0301")); // combining accent -- cgit v1.2.3