summaryrefslogtreecommitdiffstats
path: root/libs/minikin/WordBreaker.cpp
blob: 34e7a932c9593e57527835c908cb0f9ab7766b59 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "Minikin"
#include <cutils/log.h>

#include <minikin/WordBreaker.h>
#include "MinikinInternal.h"

#include <unicode/uchar.h>
#include <unicode/utf16.h>

namespace android {

const uint32_t CHAR_SOFT_HYPHEN = 0x00AD;
const uint32_t CHAR_ZWJ = 0x200D;

void WordBreaker::setLocale(const icu::Locale& locale) {
    UErrorCode status = U_ZERO_ERROR;
    mBreakIterator.reset(icu::BreakIterator::createLineInstance(locale, status));
    // TODO: handle failure status
    if (mText != nullptr) {
        mBreakIterator->setText(&mUText, status);
    }
    mIteratorWasReset = true;
}

void WordBreaker::setText(const uint16_t* data, size_t size) {
    mText = data;
    mTextSize = size;
    mIteratorWasReset = false;
    mLast = 0;
    mCurrent = 0;
    mScanOffset = 0;
    mInEmailOrUrl = false;
    UErrorCode status = U_ZERO_ERROR;
    utext_openUChars(&mUText, data, size, &status);
    mBreakIterator->setText(&mUText, status);
    mBreakIterator->first();
}

ssize_t WordBreaker::current() const {
    return mCurrent;
}

enum ScanState {
    START,
    SAW_AT,
    SAW_COLON,
    SAW_COLON_SLASH,
    SAW_COLON_SLASH_SLASH,
};

/**
 * Determine whether a line break at position i within the buffer buf is valid. This
 * represents customization beyond the ICU behavior, because plain ICU provides some
 * line break opportunities that we don't want.
 **/
static bool isBreakValid(const uint16_t* buf, size_t bufEnd, size_t i) {
    uint32_t codePoint;
    size_t prev_offset = i;
    U16_PREV(buf, 0, prev_offset, codePoint);
    if (codePoint == CHAR_SOFT_HYPHEN) {
        return false;
    }
    uint32_t next_codepoint;
    size_t next_offset = i;
    U16_NEXT(buf, next_offset, bufEnd, next_codepoint);

    // Proposed change to LB24 from http://www.unicode.org/L2/L2016/16043r-line-break-pr-po.txt
    //(AL | HL) × (PR | PO)
    int32_t lineBreak = u_getIntPropertyValue(codePoint, UCHAR_LINE_BREAK);
    if (lineBreak == U_LB_ALPHABETIC || lineBreak == U_LB_HEBREW_LETTER) {
        lineBreak = u_getIntPropertyValue(next_codepoint, UCHAR_LINE_BREAK);
        if (lineBreak == U_LB_PREFIX_NUMERIC || lineBreak == U_LB_POSTFIX_NUMERIC) {
            return false;
        }
    }

    // Emoji ZWJ sequences.
    if (codePoint == CHAR_ZWJ && isEmoji(next_codepoint)) {
        return false;
    }

    // Proposed Rule LB30b from http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
    // EB x EM
    if (isEmojiModifier(next_codepoint)) {
        if (codePoint == 0xFE0F && prev_offset > 0) {
            // skip over emoji variation selector
            U16_PREV(buf, 0, prev_offset, codePoint);
        }
        if (isEmojiBase(codePoint)) {
            return false;
        }
    }
    return true;
}

// Chicago Manual of Style recommends breaking after these characters in URLs and email addresses
static bool breakAfter(uint16_t c) {
    return c == ':' || c == '=' || c == '&';
}

// Chicago Manual of Style recommends breaking before these characters in URLs and email addresses
static bool breakBefore(uint16_t c) {
    return c == '~' || c == '.' || c == ',' || c == '-' || c == '_' || c == '?' || c == '#'
            || c == '%' || c == '=' || c == '&';
}

ssize_t WordBreaker::next() {
    mLast = mCurrent;

    // scan forward from current ICU position for email address or URL
    if (mLast >= mScanOffset) {
        ScanState state = START;
        size_t i;
        for (i = mLast; i < mTextSize; i++) {
            uint16_t c = mText[i];
            // scan only ASCII characters, stop at space
            if (!(' ' < c && c <= 0x007E)) {
                break;
            }
            if (state == START && c == '@') {
                state = SAW_AT;
            } else if (state == START && c == ':') {
                state = SAW_COLON;
            } else if (state == SAW_COLON || state == SAW_COLON_SLASH) {
                if (c == '/') {
                    state = static_cast<ScanState>((int)state + 1);  // next state adds a slash
                } else {
                    state = START;
                }
            }
        }
        if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) {
            if (!mBreakIterator->isBoundary(i)) {
                i = mBreakIterator->following(i);
            }
            mInEmailOrUrl = true;
            mIteratorWasReset = true;
        } else {
            mInEmailOrUrl = false;
        }
        mScanOffset = i;
    }

    if (mInEmailOrUrl) {
        // special rules for email addresses and URL's as per Chicago Manual of Style (16th ed.)
        uint16_t lastChar = mText[mLast];
        ssize_t i;
        for (i = mLast + 1; i < mScanOffset; i++) {
            if (breakAfter(lastChar)) {
                break;
            }
            // break after double slash
            if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') {
                break;
            }
            uint16_t thisChar = mText[i];
            // never break after hyphen
            if (lastChar != '-') {
                if (breakBefore(thisChar)) {
                    break;
                }
                // break before single slash
                if (thisChar == '/' && lastChar != '/' &&
                            !(i + 1 < mScanOffset && mText[i + 1] == '/')) {
                    break;
                }
            }
            lastChar = thisChar;
        }
        mCurrent = i;
        return mCurrent;
    }

    int32_t result;
    do {
        if (mIteratorWasReset) {
            result = mBreakIterator->following(mCurrent);
            mIteratorWasReset = false;
        } else {
            result = mBreakIterator->next();
        }
    } while (result != icu::BreakIterator::DONE && (size_t)result != mTextSize
            && !isBreakValid(mText, mTextSize, result));
    mCurrent = (ssize_t)result;
    return mCurrent;
}

ssize_t WordBreaker::wordStart() const {
    if (mInEmailOrUrl) {
        return mLast;
    }
    ssize_t result = mLast;
    while (result < mCurrent) {
        UChar32 c;
        ssize_t ix = result;
        U16_NEXT(mText, ix, mCurrent, c);
        int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
        // strip leading punctuation, defined as OP and QU line breaking classes,
        // see UAX #14
        if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
            break;
        }
        result = ix;
    }
    return result;
}

ssize_t WordBreaker::wordEnd() const {
    if (mInEmailOrUrl) {
        return mLast;
    }
    ssize_t result = mCurrent;
    while (result > mLast) {
        UChar32 c;
        ssize_t ix = result;
        U16_PREV(mText, mLast, ix, c);
        int32_t gc_mask = U_GET_GC_MASK(c);
        // strip trailing space and punctuation
        if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK)) == 0) {
            break;
        }
        result = ix;
    }
    return result;
}

int WordBreaker::breakBadness() const {
    return (mInEmailOrUrl && mCurrent < mScanOffset) ? 1 : 0;
}

void WordBreaker::finish() {
    mText = nullptr;
    // Note: calling utext_close multiply is safe
    utext_close(&mUText);
}

}  // namespace android