tests/UnicodeUtils.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96

/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <cctype>
#include <cstdlib>

#include <gtest/gtest.h>
#include <unicode/utf.h>

// Parses a textual description of a UTF-16 string into buf.
//
// src is a sequence of tokens, optionally separated by spaces:
//   U+XXXX  a code point written as 4 to 6 hex digits ("u+" is accepted too); values above
//           U+FFFF are stored as a lead/trail surrogate pair
//   'c'     a single ASCII character
//   |       marks an offset; may appear at most once
// Example: "U+1F431 | 'h' 'i'".
//
// buf, buf_size: output buffer and its capacity in 16-bit code units.
// result_size:   must be non-null; receives the number of code units written.
// offset:        if non-null, src must contain "|" and the output index at that point is stored
//                here; if null, src must not contain "|".
//
// Size is returned in an out parameter because gtest needs a void return for ASSERT to work.
// On malformed input a fatal gtest failure is recorded and the function returns early, so the
// out parameters may be left unwritten.
void ParseUnicode(uint16_t* buf, size_t buf_size, const char* src, size_t* result_size,
        size_t* offset) {
    size_t input_ix = 0;
    size_t output_ix = 0;
    bool seen_offset = false;

    while (src[input_ix] != 0) {
        switch (src[input_ix]) {
        case '\'': {
            // single ASCII char
            input_ix++;
            ASSERT_NE(src[input_ix], 0);
            // Range-check the quoted character (not the opening quote), and do it on the unsigned
            // byte value: plain char may be signed, which would make bytes >= 0x80 negative.
            const unsigned int ascii = static_cast<unsigned char>(src[input_ix]);
            ASSERT_LT(ascii, 0x80u);
            ASSERT_LT(output_ix, buf_size);
            buf[output_ix++] = static_cast<uint16_t>(ascii);
            input_ix++;
            ASSERT_EQ(src[input_ix], '\'');
            input_ix++;
            break;
        }
        case 'u':
        case 'U': {
            // Unicode codepoint in hex syntax
            input_ix++;
            ASSERT_EQ(src[input_ix], '+');
            input_ix++;
            // Count the hex digits ourselves: strtoul would also swallow leading whitespace, a
            // sign or a "0x" prefix, and those must not count towards the digit limits.
            const char* digits = src + input_ix;
            size_t num_hex_digits = 0;
            while (std::isxdigit(static_cast<unsigned char>(digits[num_hex_digits]))) {
                num_hex_digits++;
            }
            ASSERT_GE(num_hex_digits, 4u);  // also triggers on invalid number syntax, digits = 0
            ASSERT_LE(num_hex_digits, 6u);
            // With at least 4 leading hex digits there is no prefix for strtoul to skip, so it
            // consumes exactly the digits counted above.
            unsigned long int codepoint = strtoul(digits, nullptr, 16);
            ASSERT_LE(codepoint, 0x10FFFFu);
            input_ix += num_hex_digits;
            if (U16_LENGTH(codepoint) == 1) {
                ASSERT_LE(output_ix + 1, buf_size);
                buf[output_ix++] = static_cast<uint16_t>(codepoint);
            } else {
                // UTF-16 encoding
                ASSERT_LE(output_ix + 2, buf_size);
                buf[output_ix++] = U16_LEAD(codepoint);
                buf[output_ix++] = U16_TRAIL(codepoint);
            }
            break;
        }
        case ' ':
            input_ix++;
            break;
        case '|':
            ASSERT_FALSE(seen_offset);
            ASSERT_NE(offset, nullptr);
            *offset = output_ix;
            seen_offset = true;
            input_ix++;
            break;
        default:
            FAIL();  // unexpected character
        }
    }
    ASSERT_NE(result_size, nullptr);
    *result_size = output_ix;
    // A caller that asked for an offset must have supplied a "|" in src.
    ASSERT_TRUE(seen_offset || offset == nullptr);
}

// Checks that ParseUnicode handles a BMP code point, a supplementary code point (emitted as a
// surrogate pair), the "|" offset marker and a quoted ASCII character.
TEST(UnicodeUtils, parse) {
    const size_t BUF_SIZE = 256;
    uint16_t buf[BUF_SIZE] = {};
    size_t offset = 0;
    size_t size = 0;
    // ParseUnicode reports errors through fatal gtest assertions, which only return from the
    // helper itself; stop here in that case instead of inspecting outputs it never wrote.
    ASSERT_NO_FATAL_FAILURE(
            ParseUnicode(buf, BUF_SIZE, "U+000D U+1F431 | 'a'", &size, &offset));
    EXPECT_EQ(size, 4u);
    EXPECT_EQ(offset, 3u);
    EXPECT_EQ(buf[0], 0x000D);
    EXPECT_EQ(buf[1], 0xD83D);
    EXPECT_EQ(buf[2], 0xDC31);
    EXPECT_EQ(buf[3], 'a');
}