diff options
author | Narayan Kamath <narayan@google.com> | 2015-05-06 14:55:43 +0100 |
---|---|---|
committer | Narayan Kamath <narayan@google.com> | 2015-05-07 11:22:36 +0100 |
commit | 3ba8671d60061359fd833f60f7a9dca14878cc0b (patch) | |
tree | 47cc0f1c2aeb9bb9c4a5878297c075fbcbe3fd38 /runtime/utils.cc | |
parent | 5c8fe3028655b2f1fcab77080272f071cc0d8bc4 (diff) | |
download | art-3ba8671d60061359fd833f60f7a9dca14878cc0b.tar.gz art-3ba8671d60061359fd833f60f7a9dca14878cc0b.tar.bz2 art-3ba8671d60061359fd833f60f7a9dca14878cc0b.zip |
Fix broken checks in IsValidPartOfMemberNameUtf8Slow.
GetUtf16FromUtf8 returns a surrogate pair only if it encounters
a 4-byte UTF-8 sequence. For a three-byte UTF-8 sequence it returns only
the leading or the trailing half of a pair, so we need to check for that
case explicitly.
bug: 20844537
Change-Id: Icb660fae77ac8a852fc768e6c1cd5766117e68e4
Diffstat (limited to 'runtime/utils.cc')
-rw-r--r-- | runtime/utils.cc | 52 |
1 files changed, 31 insertions, 21 deletions
diff --git a/runtime/utils.cc b/runtime/utils.cc index 650214f67..7986cdcbf 100644 --- a/runtime/utils.cc +++ b/runtime/utils.cc @@ -827,14 +827,21 @@ bool IsValidPartOfMemberNameUtf8Slow(const char** pUtf8Ptr) { */ const uint32_t pair = GetUtf16FromUtf8(pUtf8Ptr); - const uint16_t leading = GetLeadingUtf16Char(pair); - const uint32_t trailing = GetTrailingUtf16Char(pair); - if (trailing == 0) { - // Perform follow-up tests based on the high 8 bits of the - // lower surrogate. - switch (leading >> 8) { + // We have a surrogate pair resulting from a valid 4 byte UTF sequence. + // No further checks are necessary because 4 byte sequences span code + // points [U+10000, U+1FFFFF], which are valid codepoints in a dex + // identifier. Furthermore, GetUtf16FromUtf8 guarantees that each of + // the surrogate halves are valid and well formed in this instance. + if (GetTrailingUtf16Char(pair) != 0) { + return true; + } + + + // We've encountered a one, two or three byte UTF-8 sequence. The + // three byte UTF-8 sequence could be one half of a surrogate pair. + switch (leading >> 8) { case 0x00: // It's only valid if it's above the ISO-8859-1 high space (0xa0). return (leading > 0x00a0); @@ -842,9 +849,14 @@ bool IsValidPartOfMemberNameUtf8Slow(const char** pUtf8Ptr) { case 0xd9: case 0xda: case 0xdb: - // It looks like a leading surrogate but we didn't find a trailing - // surrogate if we're here. - return false; + { + // We found a three byte sequence encoding one half of a surrogate. + // Look for the other half. + const uint32_t pair2 = GetUtf16FromUtf8(pUtf8Ptr); + const uint16_t trailing = GetLeadingUtf16Char(pair2); + + return (GetTrailingUtf16Char(pair2) == 0) && (0xdc00 <= trailing && trailing <= 0xdfff); + } case 0xdc: case 0xdd: case 0xde: @@ -855,21 +867,19 @@ bool IsValidPartOfMemberNameUtf8Slow(const char** pUtf8Ptr) { case 0xff: // It's in the range that has spaces, controls, and specials. 
switch (leading & 0xfff8) { - case 0x2000: - case 0x2008: - case 0x2028: - case 0xfff0: - case 0xfff8: - return false; + case 0x2000: + case 0x2008: + case 0x2028: + case 0xfff0: + case 0xfff8: + return false; } - break; - } - - return true; + return true; + default: + return true; } - // We have a surrogate pair. Check that trailing surrogate is well formed. - return (trailing >= 0xdc00 && trailing <= 0xdfff); + UNREACHABLE(); } /* Return whether the pointed-at modified-UTF-8 encoded character is |