diff options
Diffstat (limited to 'runtime/utf.cc')
-rw-r--r-- | runtime/utf.cc | 44 |
1 files changed, 38 insertions, 6 deletions
diff --git a/runtime/utf.cc b/runtime/utf.cc index 7ff296bf0..39c8d153d 100644 --- a/runtime/utf.cc +++ b/runtime/utf.cc @@ -38,15 +38,30 @@ size_t CountModifiedUtf8Chars(const char* utf8) { // two-byte encoding continue; } - // three-byte encoding utf8++; + if ((ic & 0x10) == 0) { + // three-byte encoding + continue; + } + + // four-byte encoding: needs to be converted into a surrogate + // pair. + utf8++; + len++; } return len; } void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, const char* utf8_data_in) { while (*utf8_data_in != '\0') { - *utf16_data_out++ = GetUtf16FromUtf8(&utf8_data_in); + const uint32_t ch = GetUtf16FromUtf8(&utf8_data_in); + const uint16_t leading = GetLeadingUtf16Char(ch); + const uint16_t trailing = GetTrailingUtf16Char(ch); + + *utf16_data_out++ = leading; + if (trailing != 0) { + *utf16_data_out++ = trailing; + } } } @@ -102,12 +117,29 @@ int CompareModifiedUtf8ToUtf16AsCodePointValues(const char* utf8, const uint16_t return 1; } - int c1 = GetUtf16FromUtf8(&utf8); - int c2 = *utf16++; + const uint32_t pair = GetUtf16FromUtf8(&utf8); + + // First compare the leading utf16 char. + const uint16_t lhs = GetLeadingUtf16Char(pair); + const uint16_t rhs = *utf16++; --utf16_length; + if (lhs != rhs) { + return lhs > rhs ? 1 : -1; + } - if (c1 != c2) { - return c1 > c2 ? 1 : -1; + // Then compare the trailing utf16 char. First check if there + // are any characters left to consume. + const uint16_t lhs2 = GetTrailingUtf16Char(pair); + if (lhs2 != 0) { + if (utf16_length == 0) { + return 1; + } + + const uint16_t rhs2 = *utf16++; + --utf16_length; + if (lhs2 != rhs2) { + return lhs2 > rhs2 ? 1 : -1; + } } } } |