summaryrefslogtreecommitdiffstats
path: root/libs/minikin/Hyphenator.cpp
blob: c5eb60b8e50c0e3460f40dc70625fc0b0cfb821e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <vector>
#include <memory>
#include <algorithm>
#include <string>
#include <unicode/uchar.h>

// HACK: for reading pattern file
#include <fcntl.h>

#define LOG_TAG "Minikin"
#include "utils/Log.h"

#include "minikin/Hyphenator.h"

using std::vector;

namespace android {

static const uint16_t CHAR_SOFT_HYPHEN = 0x00AD;

// The following are structs that correspond to tables inside the hyb file format

struct AlphabetTable0 {
    uint32_t version;
    uint32_t min_codepoint;
    uint32_t max_codepoint;
    uint8_t data[1];  // actually flexible array, size is known at runtime
};

struct AlphabetTable1 {
    uint32_t version;
    uint32_t n_entries;
    uint32_t data[1]; // actually flexible array, size is known at runtime

    static uint32_t codepoint(uint32_t entry) { return entry >> 11; }
    static uint32_t value(uint32_t entry) { return entry & 0x7ff; }
};

struct Trie {
    uint32_t version;
    uint32_t char_mask;
    uint32_t link_shift;
    uint32_t link_mask;
    uint32_t pattern_shift;
    uint32_t n_entries;
    uint32_t data[1];  // actually flexible array, size is known at runtime
};

struct Pattern {
    uint32_t version;
    uint32_t n_entries;
    uint32_t pattern_offset;
    uint32_t pattern_size;
    uint32_t data[1];  // actually flexible array, size is known at runtime

    // accessors
    static uint32_t len(uint32_t entry) { return entry >> 26; }
    static uint32_t shift(uint32_t entry) { return (entry >> 20) & 0x3f; }
    const uint8_t* buf(uint32_t entry) const {
        return reinterpret_cast<const uint8_t*>(this) + pattern_offset + (entry & 0xfffff);
    }
};

struct Header {
    uint32_t magic;
    uint32_t version;
    uint32_t alphabet_offset;
    uint32_t trie_offset;
    uint32_t pattern_offset;
    uint32_t file_size;

    // accessors
    const uint8_t* bytes() const { return reinterpret_cast<const uint8_t*>(this); }
    uint32_t alphabetVersion() const {
        return *reinterpret_cast<const uint32_t*>(bytes() + alphabet_offset);
    }
    const AlphabetTable0* alphabetTable0() const {
        return reinterpret_cast<const AlphabetTable0*>(bytes() + alphabet_offset);
    }
    const AlphabetTable1* alphabetTable1() const {
        return reinterpret_cast<const AlphabetTable1*>(bytes() + alphabet_offset);
    }
    const Trie* trieTable() const {
        return reinterpret_cast<const Trie*>(bytes() + trie_offset);
    }
    const Pattern* patternTable() const {
        return reinterpret_cast<const Pattern*>(bytes() + pattern_offset);
    }
};

Hyphenator* Hyphenator::loadBinary(const uint8_t* patternData) {
    Hyphenator* result = new Hyphenator;
    result->patternData = patternData;
    return result;
}

void Hyphenator::hyphenate(vector<uint8_t>* result, const uint16_t* word, size_t len) {
    result->clear();
    result->resize(len);
    const size_t paddedLen = len + 2;  // start and stop code each count for 1
    if (patternData != nullptr &&
            (int)len >= MIN_PREFIX + MIN_SUFFIX && paddedLen <= MAX_HYPHENATED_SIZE) {
        uint16_t alpha_codes[MAX_HYPHENATED_SIZE];
        if (alphabetLookup(alpha_codes, word, len)) {
            hyphenateFromCodes(result->data(), alpha_codes, paddedLen);
            return;
        }
        // TODO: try NFC normalization
        // TODO: handle non-BMP Unicode (requires remapping of offsets)
    }
    hyphenateSoft(result->data(), word, len);
}

// If any soft hyphen is present in the word, use soft hyphens to decide hyphenation,
// as recommended in UAX #14 (Use of Soft Hyphen)
void Hyphenator::hyphenateSoft(uint8_t* result, const uint16_t* word, size_t len) {
    result[0] = 0;
    for (size_t i = 1; i < len; i++) {
        result[i] = word[i - 1] == CHAR_SOFT_HYPHEN;
     }
}

bool Hyphenator::alphabetLookup(uint16_t* alpha_codes, const uint16_t* word, size_t len) {
    const Header* header = getHeader();
    // TODO: check header magic
    uint32_t alphabetVersion = header->alphabetVersion();
    if (alphabetVersion == 0) {
        const AlphabetTable0* alphabet = header->alphabetTable0();
        uint32_t min_codepoint = alphabet->min_codepoint;
        uint32_t max_codepoint = alphabet->max_codepoint;
        alpha_codes[0] = 0;  // word start
        for (size_t i = 0; i < len; i++) {
            uint16_t c = word[i];
            if (c < min_codepoint || c >= max_codepoint) {
                return false;
            }
            uint8_t code = alphabet->data[c - min_codepoint];
            if (code == 0) {
                return false;
            }
            alpha_codes[i + 1] = code;
        }
        alpha_codes[len + 1] = 0;  // word termination
        return true;
    } else if (alphabetVersion == 1) {
        const AlphabetTable1* alphabet = header->alphabetTable1();
        size_t n_entries = alphabet->n_entries;
        const uint32_t* begin = alphabet->data;
        const uint32_t* end = begin + n_entries;
        alpha_codes[0] = 0;
        for (size_t i = 0; i < len; i++) {
            uint16_t c = word[i];
            auto p = std::lower_bound(begin, end, c << 11);
            if (p == end) {
                return false;
            }
            uint32_t entry = *p;
            if (AlphabetTable1::codepoint(entry) != c) {
                return false;
            }
            alpha_codes[i + 1] = AlphabetTable1::value(entry);
        }
        alpha_codes[len + 1] = 0;
        return true;
    }
    return false;
}

/**
 * Internal implementation, after conversion to codes. All case folding and normalization
 * has been done by now, and all characters have been found in the alphabet.
 * Note: len here is the padded length including 0 codes at start and end.
 **/
void Hyphenator::hyphenateFromCodes(uint8_t* result, const uint16_t* codes, size_t len) {
    const Header* header = getHeader();
    const Trie* trie = header->trieTable();
    const Pattern* pattern = header->patternTable();
    uint32_t char_mask = trie->char_mask;
    uint32_t link_shift = trie->link_shift;
    uint32_t link_mask = trie->link_mask;
    uint32_t pattern_shift = trie->pattern_shift;
    size_t maxOffset = len - MIN_SUFFIX - 1;
    for (size_t i = 0; i < len - 1; i++) {
        uint32_t node = 0;  // index into Trie table
        for (size_t j = i; j < len; j++) {
            uint16_t c = codes[j];
            uint32_t entry = trie->data[node + c];
            if ((entry & char_mask) == c) {
                node = (entry & link_mask) >> link_shift;
            } else {
                break;
            }
            uint32_t pat_ix = trie->data[node] >> pattern_shift;
            // pat_ix contains a 3-tuple of length, shift (number of trailing zeros), and an offset
            // into the buf pool. This is the pattern for the substring (i..j) we just matched,
            // which we combine (via point-wise max) into the result vector.
            if (pat_ix != 0) {
                uint32_t pat_entry = pattern->data[pat_ix];
                int pat_len = Pattern::len(pat_entry);
                int pat_shift = Pattern::shift(pat_entry);
                const uint8_t* pat_buf = pattern->buf(pat_entry);
                int offset = j + 1 - (pat_len + pat_shift);
                // offset is the index within result that lines up with the start of pat_buf
                int start = std::max(MIN_PREFIX - offset, 0);
                int end = std::min(pat_len, (int)maxOffset - offset);
                for (int k = start; k < end; k++) {
                    result[offset + k] = std::max(result[offset + k], pat_buf[k]);
                }
            }
        }
    }
    // Since the above calculation does not modify values outside
    // [MIN_PREFIX, len - MIN_SUFFIX], they are left as 0.
    for (size_t i = MIN_PREFIX; i < maxOffset; i++) {
        result[i] &= 1;
    }
}

}  // namespace android