diff options
author | Narayan Kamath <narayan@google.com> | 2015-02-13 11:49:22 +0000 |
---|---|---|
committer | Narayan Kamath <narayan@google.com> | 2015-03-25 08:46:51 +0000 |
commit | e16dad1d6388b0305f13e2171308a77f42e7c682 (patch) | |
tree | a56d002ba622ecce7f1310e8fac56adca6547d5a /runtime/utf_test.cc | |
parent | ebfd062af849a915bd75eebd81c6fdea15b1d8d5 (diff) | |
download | art-e16dad1d6388b0305f13e2171308a77f42e7c682.tar.gz art-e16dad1d6388b0305f13e2171308a77f42e7c682.tar.bz2 art-e16dad1d6388b0305f13e2171308a77f42e7c682.zip |
Emit 4 byte UTF-sequences in place of encoded surrogate pairs.
Symmetric with a5afcfc73141e5e378d79a326d0 which converts 4 byte UTF-8
sequences to surrogate pairs.
bug: 18848397
Change-Id: I42adc275b7e0df0cbbd9d8a799e8b0447d8f5cae
Diffstat (limited to 'runtime/utf_test.cc')
-rw-r--r-- | runtime/utf_test.cc | 50 |
1 files changed, 50 insertions, 0 deletions
```diff
diff --git a/runtime/utf_test.cc b/runtime/utf_test.cc
index 8048bbdbe0..94a6ea57e2 100644
--- a/runtime/utf_test.cc
+++ b/runtime/utf_test.cc
@@ -19,6 +19,8 @@
 #include "common_runtime_test.h"
 #include "utf-inl.h"
 
+#include <vector>
+
 namespace art {
 
 class UtfTest : public CommonRuntimeTest {};
@@ -110,4 +112,52 @@ TEST_F(UtfTest, CountModifiedUtf8Chars) {
   EXPECT_EQ(2u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kSurrogateEncoding)));
 }
 
+static void AssertConversion(const std::vector<uint16_t> input,
+                             const std::vector<uint8_t> expected) {
+  ASSERT_EQ(expected.size(), CountUtf8Bytes(&input[0], input.size()));
+
+  std::vector<uint8_t> output(expected.size());
+  ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(&output[0]), &input[0], input.size());
+  EXPECT_EQ(expected, output);
+}
+
+TEST_F(UtfTest, CountAndConvertUtf8Bytes) {
+  // Surrogate pairs will be converted into 4 byte sequences.
+  AssertConversion({ 0xd801, 0xdc00 }, { 0xf0, 0x90, 0x90, 0x80 });
+
+  // Three byte encodings that are below & above the leading surrogate
+  // range respectively.
+  AssertConversion({ 0xdef0 }, { 0xed, 0xbb, 0xb0 });
+  AssertConversion({ 0xdcff }, { 0xed, 0xb3, 0xbf });
+  // Two byte encoding.
+  AssertConversion({ 0x0101 }, { 0xc4, 0x81 });
+
+  // Two byte special case : 0 must use an overlong encoding.
+  AssertConversion({ 0x0101, 0x0000 }, { 0xc4, 0x81, 0xc0, 0x80 });
+
+  // One byte encoding.
+  AssertConversion({ 'h', 'e', 'l', 'l', 'o' }, { 0x68, 0x65, 0x6c, 0x6c, 0x6f });
+
+  AssertConversion({
+      0xd802, 0xdc02, // Surrogate pair
+      0xdef0, 0xdcff, // Three byte encodings
+      0x0101, 0x0000, // Two byte encodings
+      'p' , 'p' // One byte encoding
+  }, {
+      0xf0, 0x90, 0xa0, 0x82,
+      0xed, 0xbb, 0xb0, 0xed, 0xb3, 0xbf,
+      0xc4, 0x81, 0xc0, 0x80,
+      0x70, 0x70
+  });
+}
+
+TEST_F(UtfTest, CountAndConvertUtf8Bytes_UnpairedSurrogate) {
+  // Unpaired trailing surrogate at the end of input.
+  AssertConversion({ 'h', 'e', 0xd801 }, { 'h', 'e', 0xed, 0xa0, 0x81 });
+  // Unpaired (or incorrectly paired) surrogates in the middle of the input.
+  AssertConversion({ 'h', 0xd801, 'e' }, { 'h', 0xed, 0xa0, 0x81, 'e' });
+  AssertConversion({ 'h', 0xd801, 0xd801, 'e' }, { 'h', 0xed, 0xa0, 0x81, 0xed, 0xa0, 0x81, 'e' });
+  AssertConversion({ 'h', 0xdc00, 0xdc00, 'e' }, { 'h', 0xed, 0xb0, 0x80, 0xed, 0xb0, 0x80, 'e' });
+}
+
 } // namespace art
```