diff options
author | Narayan Kamath <narayan@google.com> | 2015-02-13 11:49:22 +0000 |
---|---|---|
committer | Narayan Kamath <narayan@google.com> | 2015-03-25 08:46:51 +0000 |
commit | e16dad1d6388b0305f13e2171308a77f42e7c682 (patch) | |
tree | a56d002ba622ecce7f1310e8fac56adca6547d5a | |
parent | ebfd062af849a915bd75eebd81c6fdea15b1d8d5 (diff) | |
download | android_art-e16dad1d6388b0305f13e2171308a77f42e7c682.tar.gz android_art-e16dad1d6388b0305f13e2171308a77f42e7c682.tar.bz2 android_art-e16dad1d6388b0305f13e2171308a77f42e7c682.zip |
Emit 4-byte UTF-8 sequences in place of encoded surrogate pairs.
Symmetric with a5afcfc73141e5e378d79a326d0, which converts 4-byte UTF-8
sequences to surrogate pairs.
bug: 18848397
Change-Id: I42adc275b7e0df0cbbd9d8a799e8b0447d8f5cae
-rw-r--r-- | runtime/jni_internal_test.cc | 36 | ||||
-rw-r--r-- | runtime/utf.cc | 53 | ||||
-rw-r--r-- | runtime/utf_test.cc | 50 |
3 files changed, 122 insertions, 17 deletions
diff --git a/runtime/jni_internal_test.cc b/runtime/jni_internal_test.cc index 10482142da..5516eab4f8 100644 --- a/runtime/jni_internal_test.cc +++ b/runtime/jni_internal_test.cc @@ -1355,24 +1355,38 @@ TEST_F(JniInternalTest, NewStringUTF) { s = env_->NewStringUTF("\xed\xa0\x81\xed\xb0\x80"); EXPECT_NE(s, nullptr); EXPECT_EQ(2, env_->GetStringLength(s)); - // Note that this uses 2 x 3 byte UTF sequences, one - // for each half of the surrogate pair. - EXPECT_EQ(6, env_->GetStringUTFLength(s)); + + // The surrogate pair gets encoded into a 4 byte UTF sequence.. + EXPECT_EQ(4, env_->GetStringUTFLength(s)); const char* chars = env_->GetStringUTFChars(s, nullptr); - EXPECT_STREQ("\xed\xa0\x81\xed\xb0\x80", chars); + EXPECT_STREQ("\xf0\x90\x90\x80", chars); env_->ReleaseStringUTFChars(s, chars); + // .. but is stored as is in the utf-16 representation. + const jchar* jchars = env_->GetStringChars(s, nullptr); + EXPECT_EQ(0xd801, jchars[0]); + EXPECT_EQ(0xdc00, jchars[1]); + env_->ReleaseStringChars(s, jchars); + // 4 byte UTF sequence appended to an encoded surrogate pair. s = env_->NewStringUTF("\xed\xa0\x81\xed\xb0\x80 \xf0\x9f\x8f\xa0"); EXPECT_NE(s, nullptr); + + // The 4 byte sequence {0xf0, 0x9f, 0x8f, 0xa0} is converted into a surrogate + // pair {0xd83c, 0xdfe0}. EXPECT_EQ(5, env_->GetStringLength(s)); - EXPECT_EQ(13, env_->GetStringUTFLength(s)); + jchars = env_->GetStringChars(s, nullptr); + // The first surrogate pair, encoded as such in the input. + EXPECT_EQ(0xd801, jchars[0]); + EXPECT_EQ(0xdc00, jchars[1]); + // The second surrogate pair, from the 4 byte UTF sequence in the input. 
+ EXPECT_EQ(0xd83c, jchars[3]); + EXPECT_EQ(0xdfe0, jchars[4]); + env_->ReleaseStringChars(s, jchars); + + EXPECT_EQ(9, env_->GetStringUTFLength(s)); chars = env_->GetStringUTFChars(s, nullptr); - // The 4 byte sequence {0xf0, 0x9f, 0x8f, 0xa0} is converted into a surrogate - // pair {0xd83c, 0xdfe0} which is then converted into a two three byte - // sequences {0xed 0xa0, 0xbc} and {0xed, 0xbf, 0xa0}, one for each half of - // the surrogate pair. - EXPECT_STREQ("\xed\xa0\x81\xed\xb0\x80 \xed\xa0\xbc\xed\xbf\xa0", chars); + EXPECT_STREQ("\xf0\x90\x90\x80 \xf0\x9f\x8f\xa0", chars); env_->ReleaseStringUTFChars(s, chars); // A string with 1, 2, 3 and 4 byte UTF sequences with spaces @@ -1380,7 +1394,7 @@ TEST_F(JniInternalTest, NewStringUTF) { s = env_->NewStringUTF("\x24 \xc2\xa2 \xe2\x82\xac \xf0\x9f\x8f\xa0"); EXPECT_NE(s, nullptr); EXPECT_EQ(8, env_->GetStringLength(s)); - EXPECT_EQ(15, env_->GetStringUTFLength(s)); + EXPECT_EQ(13, env_->GetStringUTFLength(s)); } TEST_F(JniInternalTest, NewString) { diff --git a/runtime/utf.cc b/runtime/utf.cc index 39c8d153d5..3d13c3e492 100644 --- a/runtime/utf.cc +++ b/runtime/utf.cc @@ -67,15 +67,39 @@ void ConvertModifiedUtf8ToUtf16(uint16_t* utf16_data_out, const char* utf8_data_ void ConvertUtf16ToModifiedUtf8(char* utf8_out, const uint16_t* utf16_in, size_t char_count) { while (char_count--) { - uint16_t ch = *utf16_in++; + const uint16_t ch = *utf16_in++; if (ch > 0 && ch <= 0x7f) { *utf8_out++ = ch; } else { + // char_count == 0 here implies we've encountered an unpaired + // surrogate and we have no choice but to encode it as 3-byte UTF + // sequence. Note that unpaired surrogates can occur as a part of + // "normal" operation. + if ((ch >= 0xd800 && ch <= 0xdbff) && (char_count > 0)) { + const uint16_t ch2 = *utf16_in; + + // Check if the other half of the pair is within the expected + // range. If it isn't, we will have to emit both "halves" as + // separate 3 byte sequences. 
+ if (ch2 >= 0xdc00 && ch2 <= 0xdfff) { + utf16_in++; + char_count--; + const uint32_t code_point = (ch << 10) + ch2 - 0x035fdc00; + *utf8_out++ = (code_point >> 18) | 0xf0; + *utf8_out++ = ((code_point >> 12) & 0x3f) | 0x80; + *utf8_out++ = ((code_point >> 6) & 0x3f) | 0x80; + *utf8_out++ = (code_point & 0x3f) | 0x80; + continue; + } + } + if (ch > 0x07ff) { + // Three byte encoding. *utf8_out++ = (ch >> 12) | 0xe0; *utf8_out++ = ((ch >> 6) & 0x3f) | 0x80; *utf8_out++ = (ch & 0x3f) | 0x80; } else /*(ch > 0x7f || ch == 0)*/ { + // Two byte encoding. *utf8_out++ = (ch >> 6) | 0xc0; *utf8_out++ = (ch & 0x3f) | 0x80; } @@ -147,15 +171,32 @@ int CompareModifiedUtf8ToUtf16AsCodePointValues(const char* utf8, const uint16_t size_t CountUtf8Bytes(const uint16_t* chars, size_t char_count) { size_t result = 0; while (char_count--) { - uint16_t ch = *chars++; + const uint16_t ch = *chars++; if (ch > 0 && ch <= 0x7f) { ++result; - } else { - if (ch > 0x7ff) { - result += 3; + } else if (ch >= 0xd800 && ch <= 0xdbff) { + if (char_count > 0) { + const uint16_t ch2 = *chars; + // If we find a properly paired surrogate, we emit it as a 4 byte + // UTF sequence. If we find an unpaired leading or trailing surrogate, + // we emit it as a 3 byte sequence like would have done earlier. + if (ch2 >= 0xdc00 && ch2 <= 0xdfff) { + chars++; + char_count--; + + result += 4; + } else { + result += 3; + } } else { - result += 2; + // This implies we found an unpaired trailing surrogate at the end + // of a string. 
+ result += 3; } + } else if (ch > 0x7ff) { + result += 3; + } else { + result += 2; } } return result; diff --git a/runtime/utf_test.cc b/runtime/utf_test.cc index 8048bbdbe0..94a6ea57e2 100644 --- a/runtime/utf_test.cc +++ b/runtime/utf_test.cc @@ -19,6 +19,8 @@ #include "common_runtime_test.h" #include "utf-inl.h" +#include <vector> + namespace art { class UtfTest : public CommonRuntimeTest {}; @@ -110,4 +112,52 @@ TEST_F(UtfTest, CountModifiedUtf8Chars) { EXPECT_EQ(2u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kSurrogateEncoding))); } +static void AssertConversion(const std::vector<uint16_t> input, + const std::vector<uint8_t> expected) { + ASSERT_EQ(expected.size(), CountUtf8Bytes(&input[0], input.size())); + + std::vector<uint8_t> output(expected.size()); + ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(&output[0]), &input[0], input.size()); + EXPECT_EQ(expected, output); +} + +TEST_F(UtfTest, CountAndConvertUtf8Bytes) { + // Surrogate pairs will be converted into 4 byte sequences. + AssertConversion({ 0xd801, 0xdc00 }, { 0xf0, 0x90, 0x90, 0x80 }); + + // Three byte encodings that are below & above the leading surrogate + // range respectively. + AssertConversion({ 0xdef0 }, { 0xed, 0xbb, 0xb0 }); + AssertConversion({ 0xdcff }, { 0xed, 0xb3, 0xbf }); + // Two byte encoding. + AssertConversion({ 0x0101 }, { 0xc4, 0x81 }); + + // Two byte special case : 0 must use an overlong encoding. + AssertConversion({ 0x0101, 0x0000 }, { 0xc4, 0x81, 0xc0, 0x80 }); + + // One byte encoding. 
+ AssertConversion({ 'h', 'e', 'l', 'l', 'o' }, { 0x68, 0x65, 0x6c, 0x6c, 0x6f }); + + AssertConversion({ + 0xd802, 0xdc02, // Surrogate pair + 0xdef0, 0xdcff, // Three byte encodings + 0x0101, 0x0000, // Two byte encodings + 'p' , 'p' // One byte encoding + }, { + 0xf0, 0x90, 0xa0, 0x82, + 0xed, 0xbb, 0xb0, 0xed, 0xb3, 0xbf, + 0xc4, 0x81, 0xc0, 0x80, + 0x70, 0x70 + }); +} + +TEST_F(UtfTest, CountAndConvertUtf8Bytes_UnpairedSurrogate) { + // Unpaired trailing surrogate at the end of input. + AssertConversion({ 'h', 'e', 0xd801 }, { 'h', 'e', 0xed, 0xa0, 0x81 }); + // Unpaired (or incorrectly paired) surrogates in the middle of the input. + AssertConversion({ 'h', 0xd801, 'e' }, { 'h', 0xed, 0xa0, 0x81, 'e' }); + AssertConversion({ 'h', 0xd801, 0xd801, 'e' }, { 'h', 0xed, 0xa0, 0x81, 0xed, 0xa0, 0x81, 'e' }); + AssertConversion({ 'h', 0xdc00, 0xdc00, 'e' }, { 'h', 0xed, 0xb0, 0x80, 0xed, 0xb0, 0x80, 'e' }); +} + } // namespace art |