diff options
author | Narayan Kamath <narayan@google.com> | 2015-01-29 20:06:46 +0000 |
---|---|---|
committer | Narayan Kamath <narayan@google.com> | 2015-02-12 11:54:37 +0000 |
commit | a5afcfc73141e5e378d79a326d02c5c2039fb025 (patch) | |
tree | 424add9558fb816c4f1d2f4edd128f4f2a086d9a /runtime/utf-inl.h | |
parent | 5a3399deaf448c8434d9ba0916ff799b1b791d95 (diff) | |
download | art-a5afcfc73141e5e378d79a326d02c5c2039fb025.tar.gz art-a5afcfc73141e5e378d79a326d02c5c2039fb025.tar.bz2 art-a5afcfc73141e5e378d79a326d02c5c2039fb025.zip |
Be more lenient with 4 byte UTF-8 sequences.
Accept 4 byte sequences and convert them into surrogate
pairs instead of expecting 2 separate 3 byte sequences
each encoding one half of a surrogate pair.
Note that in addition to supporting 4 byte sequences in
strings from JNI, we also tolerate them in dex files. This
is mainly for consistency; it does not amount to a claim of
official support for 4 byte sequences in dex files.
bug: 18848397
bug: https://code.google.com/p/android/issues/detail?id=81341
Change-Id: Ibc98d29e59d98803e640f2489ea4c56912a59b29
Diffstat (limited to 'runtime/utf-inl.h')
-rw-r--r-- | runtime/utf-inl.h | 100 |
1 file changed, 49 insertions, 51 deletions
diff --git a/runtime/utf-inl.h b/runtime/utf-inl.h index 1373d1704..b2d6765fb 100644 --- a/runtime/utf-inl.h +++ b/runtime/utf-inl.h @@ -21,26 +21,57 @@ namespace art { -inline uint16_t GetUtf16FromUtf8(const char** utf8_data_in) { - uint8_t one = *(*utf8_data_in)++; +inline uint16_t GetTrailingUtf16Char(uint32_t maybe_pair) { + return static_cast<uint16_t>(maybe_pair >> 16); +} + +inline uint16_t GetLeadingUtf16Char(uint32_t maybe_pair) { + return static_cast<uint16_t>(maybe_pair & 0x0000FFFF); +} + +inline uint32_t GetUtf16FromUtf8(const char** utf8_data_in) { + const uint8_t one = *(*utf8_data_in)++; if ((one & 0x80) == 0) { // one-byte encoding return one; } - // two- or three-byte encoding - uint8_t two = *(*utf8_data_in)++; + + const uint8_t two = *(*utf8_data_in)++; if ((one & 0x20) == 0) { // two-byte encoding return ((one & 0x1f) << 6) | (two & 0x3f); } - // three-byte encoding - uint8_t three = *(*utf8_data_in)++; - return ((one & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f); + + const uint8_t three = *(*utf8_data_in)++; + if ((one & 0x10) == 0) { + return ((one & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f); + } + + // Four byte encodings need special handling. We'll have + // to convert them into a surrogate pair. + const uint8_t four = *(*utf8_data_in)++; + + // Since this is a 4 byte UTF-8 sequence, it will lie between + // U+10000 and U+1FFFFF. + // + // TODO: What do we do about values in (U+10FFFF, U+1FFFFF) ? The + // spec says they're invalid but nobody appears to check for them. + const uint32_t code_point = ((one & 0x0f) << 18) | ((two & 0x3f) << 12) + | ((three & 0x3f) << 6) | (four & 0x3f); + + uint32_t surrogate_pair = 0; + // Step two: Write out the high (leading) surrogate to the bottom 16 bits + // of the of the 32 bit type. + surrogate_pair |= ((code_point >> 10) + 0xd7c0) & 0xffff; + // Step three : Write out the low (trailing) surrogate to the top 16 bits. 
+ surrogate_pair |= ((code_point & 0x03ff) + 0xdc00) << 16; + + return surrogate_pair; } inline int CompareModifiedUtf8ToModifiedUtf8AsUtf16CodePointValues(const char* utf8_1, const char* utf8_2) { - uint16_t c1, c2; + uint32_t c1, c2; do { c1 = *utf8_1; c2 = *utf8_2; @@ -50,50 +81,17 @@ inline int CompareModifiedUtf8ToModifiedUtf8AsUtf16CodePointValues(const char* u } else if (c2 == 0) { return 1; } - // Assume 1-byte value and handle all cases first. - utf8_1++; - utf8_2++; - if ((c1 & 0x80) == 0) { - if (c1 == c2) { - // Matching 1-byte values. - continue; - } else { - // Non-matching values. - if ((c2 & 0x80) == 0) { - // 1-byte value, do nothing. - } else if ((c2 & 0x20) == 0) { - // 2-byte value. - c2 = ((c2 & 0x1f) << 6) | (*utf8_2 & 0x3f); - } else { - // 3-byte value. - c2 = ((c2 & 0x0f) << 12) | ((utf8_2[0] & 0x3f) << 6) | (utf8_2[1] & 0x3f); - } - return static_cast<int>(c1) - static_cast<int>(c2); - } - } - // Non-matching or multi-byte values. - if ((c1 & 0x20) == 0) { - // 2-byte value. - c1 = ((c1 & 0x1f) << 6) | (*utf8_1 & 0x3f); - utf8_1++; - } else { - // 3-byte value. - c1 = ((c1 & 0x0f) << 12) | ((utf8_1[0] & 0x3f) << 6) | (utf8_1[1] & 0x3f); - utf8_1 += 2; - } - if ((c2 & 0x80) == 0) { - // 1-byte value, do nothing. - } else if ((c2 & 0x20) == 0) { - // 2-byte value. - c2 = ((c2 & 0x1f) << 6) | (*utf8_2 & 0x3f); - utf8_2++; - } else { - // 3-byte value. - c2 = ((c2 & 0x0f) << 12) | ((utf8_2[0] & 0x3f) << 6) | (utf8_2[1] & 0x3f); - utf8_2 += 2; - } + + c1 = GetUtf16FromUtf8(&utf8_1); + c2 = GetUtf16FromUtf8(&utf8_2); } while (c1 == c2); - return static_cast<int>(c1) - static_cast<int>(c2); + + const uint32_t leading_surrogate_diff = GetLeadingUtf16Char(c1) - GetLeadingUtf16Char(c2); + if (leading_surrogate_diff != 0) { + return static_cast<int>(leading_surrogate_diff); + } + + return GetTrailingUtf16Char(c1) - GetTrailingUtf16Char(c2); } } // namespace art |