diff options
Diffstat (limited to 'runtime')
-rw-r--r-- | runtime/utf.h | 6 | ||||
-rw-r--r-- | runtime/utils.cc | 52 | ||||
-rw-r--r-- | runtime/utils_test.cc | 23 |
3 files changed, 57 insertions, 24 deletions
diff --git a/runtime/utf.h b/runtime/utf.h index dd38afa17..7f05248c2 100644 --- a/runtime/utf.h +++ b/runtime/utf.h @@ -87,9 +87,9 @@ size_t ComputeModifiedUtf8Hash(const char* chars); /* * Retrieve the next UTF-16 character or surrogate pair from a UTF-8 string. * single byte, 2-byte and 3-byte UTF-8 sequences result in a single UTF-16 - * character whereas 4-byte UTF-8 sequences result in a surrogate pair. Use - * GetLeadingUtf16Char and GetTrailingUtf16Char to process the return value - * of this function. + * character (possibly one half of a surrogate) whereas 4-byte UTF-8 sequences + * result in a surrogate pair. Use GetLeadingUtf16Char and GetTrailingUtf16Char + * to process the return value of this function. * * Advances "*utf8_data_in" to the start of the next character. * diff --git a/runtime/utils.cc b/runtime/utils.cc index 650214f67..7986cdcbf 100644 --- a/runtime/utils.cc +++ b/runtime/utils.cc @@ -827,14 +827,21 @@ bool IsValidPartOfMemberNameUtf8Slow(const char** pUtf8Ptr) { */ const uint32_t pair = GetUtf16FromUtf8(pUtf8Ptr); - const uint16_t leading = GetLeadingUtf16Char(pair); - const uint32_t trailing = GetTrailingUtf16Char(pair); - if (trailing == 0) { - // Perform follow-up tests based on the high 8 bits of the - // lower surrogate. - switch (leading >> 8) { + // We have a surrogate pair resulting from a valid 4 byte UTF sequence. + // No further checks are necessary because 4 byte sequences span code + // points [U+10000, U+1FFFFF], which are valid codepoints in a dex + // identifier. Furthermore, GetUtf16FromUtf8 guarantees that each of + // the surrogate halves are valid and well formed in this instance. + if (GetTrailingUtf16Char(pair) != 0) { + return true; + } + + + // We've encountered a one, two or three byte UTF-8 sequence. The + // three byte UTF-8 sequence could be one half of a surrogate pair. + switch (leading >> 8) { case 0x00: // It's only valid if it's above the ISO-8859-1 high space (0xa0). return (leading > 0x00a0); @@ -842,9 +849,14 @@ bool IsValidPartOfMemberNameUtf8Slow(const char** pUtf8Ptr) { case 0xd9: case 0xda: case 0xdb: - // It looks like a leading surrogate but we didn't find a trailing - // surrogate if we're here. - return false; + { + // We found a three byte sequence encoding one half of a surrogate. + // Look for the other half. + const uint32_t pair2 = GetUtf16FromUtf8(pUtf8Ptr); + const uint16_t trailing = GetLeadingUtf16Char(pair2); + + return (GetTrailingUtf16Char(pair2) == 0) && (0xdc00 <= trailing && trailing <= 0xdfff); + } case 0xdc: case 0xdd: case 0xde: @@ -855,21 +867,19 @@ bool IsValidPartOfMemberNameUtf8Slow(const char** pUtf8Ptr) { case 0xff: // It's in the range that has spaces, controls, and specials. switch (leading & 0xfff8) { - case 0x2000: - case 0x2008: - case 0x2028: - case 0xfff0: - case 0xfff8: - return false; + case 0x2000: + case 0x2008: + case 0x2028: + case 0xfff0: + case 0xfff8: + return false; } - break; - } - - return true; + return true; + default: + return true; } - // We have a surrogate pair. Check that trailing surrogate is well formed. - return (trailing >= 0xdc00 && trailing <= 0xdfff); + UNREACHABLE(); } /* Return whether the pointed-at modified-UTF-8 encoded character is diff --git a/runtime/utils_test.cc b/runtime/utils_test.cc index 195de0c12..869d30512 100644 --- a/runtime/utils_test.cc +++ b/runtime/utils_test.cc @@ -521,4 +521,27 @@ TEST_F(UtilsTest, TestSleep) { EXPECT_GT(NanoTime() - start, MsToNs(1000)); } +TEST_F(UtilsTest, IsValidDescriptor) { + std::vector<uint8_t> descriptor( + { 'L', 'a', '/', 'b', '$', 0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80, ';', 0x00 }); + EXPECT_TRUE(IsValidDescriptor(reinterpret_cast<char*>(&descriptor[0]))); + + std::vector<uint8_t> unpaired_surrogate( + { 'L', 'a', '/', 'b', '$', 0xed, 0xa0, 0x80, ';', 0x00 }); + EXPECT_FALSE(IsValidDescriptor(reinterpret_cast<char*>(&unpaired_surrogate[0]))); + + std::vector<uint8_t> unpaired_surrogate_at_end( + { 'L', 'a', '/', 'b', '$', 0xed, 0xa0, 0x80, 0x00 }); + EXPECT_FALSE(IsValidDescriptor(reinterpret_cast<char*>(&unpaired_surrogate_at_end[0]))); + + std::vector<uint8_t> invalid_surrogate( + { 'L', 'a', '/', 'b', '$', 0xed, 0xb0, 0x80, ';', 0x00 }); + EXPECT_FALSE(IsValidDescriptor(reinterpret_cast<char*>(&invalid_surrogate[0]))); + + std::vector<uint8_t> unpaired_surrogate_with_multibyte_sequence( + { 'L', 'a', '/', 'b', '$', 0xed, 0xb0, 0x80, 0xf0, 0x9f, 0x8f, 0xa0, ';', 0x00 }); + EXPECT_FALSE( + IsValidDescriptor(reinterpret_cast<char*>(&unpaired_surrogate_with_multibyte_sequence[0]))); +} + } // namespace art |