summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- libs/minikin/GraphemeBreak.cpp 27
-rw-r--r-- tests/GraphemeBreakTests.cpp 16
2 files changed, 34 insertions, 9 deletions
diff --git a/libs/minikin/GraphemeBreak.cpp b/libs/minikin/GraphemeBreak.cpp
index ef323d5..1f361ba 100644
--- a/libs/minikin/GraphemeBreak.cpp
+++ b/libs/minikin/GraphemeBreak.cpp
@@ -15,6 +15,7 @@
*/
#include <stdint.h>
+#include <algorithm>
#include <unicode/uchar.h>
#include <unicode/utf16.h>
@@ -124,17 +125,25 @@ bool GraphemeBreak::isGraphemeBreak(const uint16_t* buf, size_t start, size_t co
if ((p1 == U_GCB_LVT || p1 == U_GCB_T) && p2 == U_GCB_T) {
return false;
}
- // Rule GB8a, Regional_Indicator x Regional_Indicator
+ // Rule GB8a, which looks at even-odd cases.
//
- // Known limitation: This is overly conservative, and returns no grapheme breaks between two
- // flags, such as in the character sequence "U+1F1FA U+1F1F8 [potential break] U+1F1FA U+1F1F8".
- // Also, it assumes that all combinations of Regional Indicators produce a flag, where they
- // don't.
- //
- // There is no easy solution for doing this correctly, except for querying the font and doing
- // some lookback.
+ // sot (RI RI)* RI x RI
+ // [^RI] (RI RI)* RI x RI
+ // RI ÷ RI
if (p1 == U_GCB_REGIONAL_INDICATOR && p2 == U_GCB_REGIONAL_INDICATOR) {
- return false;
+ // Look at up to 1000 code units.
+ start = std::max((ssize_t)start, (ssize_t)offset_back - 1000);
+ while (offset_back > start) {
+ U16_PREV(buf, start, offset_back, c1);
+ if (tailoredGraphemeClusterBreak(c1) != U_GCB_REGIONAL_INDICATOR) {
+ offset_back += U16_LENGTH(c1);
+ break;
+ }
+ }
+
+ // Note that the offset has moved forward by 2 code units due to U16_NEXT.
+ // The number 4 comes from the number of code units in a whole flag.
+ return (offset - 2 - offset_back) % 4 == 0;
}
// Rule GB9, x Extend; Rule GB9a, x SpacingMark; Rule GB9b, Prepend x
if (p2 == U_GCB_EXTEND || p2 == U_GCB_SPACING_MARK || p1 == U_GCB_PREPEND) {
diff --git a/tests/GraphemeBreakTests.cpp b/tests/GraphemeBreakTests.cpp
index 7e17203..3bfa5ec 100644
--- a/tests/GraphemeBreakTests.cpp
+++ b/tests/GraphemeBreakTests.cpp
@@ -84,6 +84,22 @@ TEST(GraphemeBreak, rules) {
// Rule GB8a, Regional_Indicator x Regional_Indicator
EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8"));
+ EXPECT_TRUE(IsBreak("U+1F1FA U+1F1F8 | U+1F1FA U+1F1F8")); // Regional indicator pair (flag)
+ EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8 U+1F1FA U+1F1F8")); // Regional indicator pair (flag)
+ EXPECT_FALSE(IsBreak("U+1F1FA U+1F1F8 U+1F1FA | U+1F1F8")); // Regional indicator pair (flag)
+
+ EXPECT_TRUE(IsBreak("U+1F1FA U+1F1F8 | U+1F1FA")); // Regional indicator pair (flag)
+ EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8 U+1F1FA")); // Regional indicator pair (flag)
+
+ EXPECT_TRUE(IsBreak("'a' U+1F1FA U+1F1F8 | U+1F1FA")); // Regional indicator pair (flag)
+ EXPECT_FALSE(IsBreak("'a' U+1F1FA | U+1F1F8 U+1F1FA")); // Regional indicator pair (flag)
+
+ EXPECT_TRUE(
+ IsBreak("'a' U+1F1FA U+1F1F8 | U+1F1FA U+1F1F8")); // Regional indicator pair (flag)
+ EXPECT_FALSE(
+ IsBreak("'a' U+1F1FA | U+1F1F8 U+1F1FA U+1F1F8")); // Regional indicator pair (flag)
+ EXPECT_FALSE(
+ IsBreak("'a' U+1F1FA U+1F1F8 U+1F1FA | U+1F1F8")); // Regional indicator pair (flag)
// Rule GB9, x Extend
EXPECT_FALSE(IsBreak("'a' | U+0301")); // combining accent