libs/minikin/GraphemeBreak.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158

/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <stdint.h>
#include <unicode/uchar.h>
#include <unicode/utf16.h>

#include <minikin/GraphemeBreak.h>

namespace android {

// Returns true if the character appears before or after zwj in a zwj emoji sequence. See
// http://www.unicode.org/emoji/charts/emoji-zwj-sequences.html
bool isZwjEmoji(uint32_t c) {
    return (c == 0x2764       // HEAVY BLACK HEART
            || c == 0x1F468   // MAN
            || c == 0x1F469   // WOMAN
            || c == 0x1F48B   // KISS MARK
            || c == 0x1F466   // BOY
            || c == 0x1F467   // GIRL
            || c == 0x1F441   // EYE
            || c == 0x1F5E8); // LEFT SPEECH BUBBLE
}

bool GraphemeBreak::isGraphemeBreak(const uint16_t* buf, size_t start, size_t count,
        size_t offset) {
    // This implementation closely follows Unicode Standard Annex #29 on
    // Unicode Text Segmentation (http://www.unicode.org/reports/tr29/),
    // implementing a tailored version of extended grapheme clusters.
    // The GB rules refer to section 3.1.1, Grapheme Cluster Boundary Rules.

    // Rule GB1, sot /; Rule GB2, / eot
    if (offset <= start || offset >= start + count) {
        return true;
    }
    if (U16_IS_TRAIL(buf[offset])) {
        // Don't break a surrogate pair
        return false;
    }
    uint32_t c1 = 0;
    uint32_t c2 = 0;
    size_t offset_back = offset;
    U16_PREV(buf, start, offset_back, c1);
    U16_NEXT(buf, offset, start + count, c2);
    int32_t p1 = u_getIntPropertyValue(c1, UCHAR_GRAPHEME_CLUSTER_BREAK);
    int32_t p2 = u_getIntPropertyValue(c2, UCHAR_GRAPHEME_CLUSTER_BREAK);
    // Rule GB3, CR x LF
    if (p1 == U_GCB_CR && p2 == U_GCB_LF) {
        return false;
    }
    // Rule GB4, (Control | CR | LF) /
    if (p1 == U_GCB_CONTROL || p1 == U_GCB_CR || p1 == U_GCB_LF) {
        return true;
    }
    // Rule GB5, / (Control | CR | LF)
    if (p2 == U_GCB_CONTROL || p2 == U_GCB_CR || p2 == U_GCB_LF) {
        // exclude zero-width control characters from breaking (tailoring of UAX #29)
        if (c2 == 0x00ad
                || (c2 >= 0x200b && c2 <= 0x200f)
                || (c2 >= 0x2028 && c2 <= 0x202e)
                || (c2 >= 0x2060 && c2 <= 0x206f)) {
            return false;
        }
        return true;
    }
    // Rule GB6, L x ( L | V | LV | LVT )
    if (p1 == U_GCB_L && (p2 == U_GCB_L || p2 == U_GCB_V || p2 == U_GCB_LV || p2 == U_GCB_LVT)) {
        return false;
    }
    // Rule GB7, ( LV | V ) x ( V | T )
    if ((p1 == U_GCB_LV || p1 == U_GCB_V) && (p2 == U_GCB_V || p2 == U_GCB_T)) {
        return false;
    }
    // Rule GB8, ( LVT | T ) x T
    if ((p1 == U_GCB_L || p1 == U_GCB_T) && p2 == U_GCB_T) {
        return false;
    }
    // Rule GB8a, Regional_Indicator x Regional_Indicator
    if (p1 == U_GCB_REGIONAL_INDICATOR && p2 == U_GCB_REGIONAL_INDICATOR) {
        return false;
    }
    // Rule GB9, x Extend; Rule GB9a, x SpacingMark
    if (p2 == U_GCB_EXTEND || p2 == U_GCB_SPACING_MARK) {
        if (c2 == 0xe33) {
            // most other implementations break THAI CHARACTER SARA AM
            // (tailoring of UAX #29)
            return true;
        }
        return false;
    }
    // Cluster indic syllables together (tailoring of UAX #29)
    if (u_getIntPropertyValue(c1, UCHAR_CANONICAL_COMBINING_CLASS) == 9  // virama
            && u_getIntPropertyValue(c2, UCHAR_GENERAL_CATEGORY) == U_OTHER_LETTER) {
        return false;
    }
    // Tailoring: make emoji sequences with ZWJ a single grapheme cluster
    if (c1 == 0x200D && isZwjEmoji(c2) && offset_back > start) {
        // look at character before ZWJ to see that both can participate in an emoji zwj sequence
        uint32_t c0 = 0;
        U16_PREV(buf, start, offset_back, c0);
        if (c0 == 0xFE0F && offset_back > start) {
            // skip over emoji variation selector
            U16_PREV(buf, start, offset_back, c0);
        }
        if (isZwjEmoji(c0)) {
            return false;
        }
    }
    // Rule GB10, Any / Any
    return true;
}

size_t GraphemeBreak::getTextRunCursor(const uint16_t* buf, size_t start, size_t count,
        size_t offset, MoveOpt opt) {
    switch (opt) {
    case AFTER:
        if (offset < start + count) {
            offset++;
        }
        // fall through
    case AT_OR_AFTER:
        while (!isGraphemeBreak(buf, start, count, offset)) {
            offset++;
        }
        break;
    case BEFORE:
        if (offset > start) {
            offset--;
        }
        // fall through
    case AT_OR_BEFORE:
        while (!isGraphemeBreak(buf, start, count, offset)) {
            offset--;
        }
        break;
    case AT:
        if (!isGraphemeBreak(buf, start, count, offset)) {
            offset = (size_t)-1;
        }
        break;
    }
    return offset;
}

}  // namespace android