/* * Copyright (C) 2014 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include namespace android { // Returns true if the character appears before or after zwj in a zwj emoji sequence. See // http://www.unicode.org/emoji/charts/emoji-zwj-sequences.html bool isZwjEmoji(uint32_t c) { return (c == 0x2764 // HEAVY BLACK HEART || c == 0x1F468 // MAN || c == 0x1F469 // WOMAN || c == 0x1F48B // KISS MARK || c == 0x1F466 // BOY || c == 0x1F467 // GIRL || c == 0x1F441 // EYE || c == 0x1F5E8); // LEFT SPEECH BUBBLE } bool GraphemeBreak::isGraphemeBreak(const uint16_t* buf, size_t start, size_t count, size_t offset) { // This implementation closely follows Unicode Standard Annex #29 on // Unicode Text Segmentation (http://www.unicode.org/reports/tr29/), // implementing a tailored version of extended grapheme clusters. // The GB rules refer to section 3.1.1, Grapheme Cluster Boundary Rules. // Rule GB1, sot /; Rule GB2, / eot if (offset <= start || offset >= start + count) { return true; } if (U16_IS_TRAIL(buf[offset])) { // Don't break a surrogate pair return false; } uint32_t c1 = 0; uint32_t c2 = 0; size_t offset_back = offset; U16_PREV(buf, start, offset_back, c1); U16_NEXT(buf, offset, start + count, c2); int32_t p1 = u_getIntPropertyValue(c1, UCHAR_GRAPHEME_CLUSTER_BREAK); int32_t p2 = u_getIntPropertyValue(c2, UCHAR_GRAPHEME_CLUSTER_BREAK); // Rule GB3, CR x LF if (p1 == U_GCB_CR && p2 == U_GCB_LF) { return false; } // Rule GB4, (Control | CR | LF) / if (p1 == U_GCB_CONTROL || p1 == U_GCB_CR || p1 == U_GCB_LF) { return true; } // Rule GB5, / (Control | CR | LF) if (p2 == U_GCB_CONTROL || p2 == U_GCB_CR || p2 == U_GCB_LF) { // exclude zero-width control characters from breaking (tailoring of UAX #29) if (c2 == 0x00ad || (c2 >= 0x200b && c2 <= 0x200f) || (c2 >= 0x2028 && c2 <= 0x202e) || (c2 >= 0x2060 && c2 <= 0x206f)) { return false; } return true; } // Rule GB6, L x ( L | V | LV | LVT ) if (p1 == U_GCB_L && (p2 == U_GCB_L || p2 == U_GCB_V || p2 == U_GCB_LV || p2 == U_GCB_LVT)) { return false; } // Rule GB7, ( LV | V ) x ( V | T ) if ((p1 == U_GCB_LV || p1 == U_GCB_V) && (p2 == U_GCB_V || p2 == U_GCB_T)) { return false; } // Rule GB8, ( LVT | T ) x T if ((p1 == U_GCB_L || p1 == U_GCB_T) && p2 == U_GCB_T) { return false; } // Rule GB8a, Regional_Indicator x Regional_Indicator if (p1 == U_GCB_REGIONAL_INDICATOR && p2 == U_GCB_REGIONAL_INDICATOR) { return false; } // Rule GB9, x Extend; Rule GB9a, x SpacingMark if (p2 == U_GCB_EXTEND || p2 == U_GCB_SPACING_MARK) { if (c2 == 0xe33) { // most other implementations break THAI CHARACTER SARA AM // (tailoring of UAX #29) return true; } return false; } // Cluster indic syllables together (tailoring of UAX #29) if (u_getIntPropertyValue(c1, UCHAR_CANONICAL_COMBINING_CLASS) == 9 // virama && u_getIntPropertyValue(c2, UCHAR_GENERAL_CATEGORY) == U_OTHER_LETTER) { return false; } // Tailoring: make emoji sequences with ZWJ a single grapheme cluster if (c1 == 0x200D && isZwjEmoji(c2) && offset_back > start) { // look at character before ZWJ to see that both can participate in an emoji zwj sequence uint32_t c0 = 0; U16_PREV(buf, start, offset_back, c0); if (c0 == 0xFE0F && offset_back > start) { // skip over emoji variation selector U16_PREV(buf, start, offset_back, c0); } if (isZwjEmoji(c0)) { return false; } } // Rule GB10, Any / Any return true; } size_t GraphemeBreak::getTextRunCursor(const uint16_t* buf, size_t start, size_t count, size_t offset, MoveOpt opt) { switch (opt) { case AFTER: if (offset < start + count) { offset++; } // fall through case AT_OR_AFTER: while (!isGraphemeBreak(buf, start, count, offset)) { offset++; } break; case BEFORE: if (offset > start) { offset--; } // fall through case AT_OR_BEFORE: while (!isGraphemeBreak(buf, start, count, offset)) { offset--; } break; case AT: if (!isGraphemeBreak(buf, start, count, offset)) { offset = (size_t)-1; } break; } return offset; } } // namespace android