Emit 4 byte UTF-8 sequences in place of encoded surrogate pairs.

Symmetric with a5afcfc73141e5e378d79a326d0 which converts 4 byte UTF-8 sequences to surrogate pairs. bug: 18848397 Change-Id: I42adc275b7e0df0cbbd9d8a799e8b0447d8f5cae
author: Narayan Kamath <narayan@google.com> 2015-02-13 11:49:22 +0000
committer: Narayan Kamath <narayan@google.com> 2015-03-25 08:46:51 +0000
commit: e16dad1d6388b0305f13e2171308a77f42e7c682 (patch)
tree: a56d002ba622ecce7f1310e8fac56adca6547d5a /runtime/utf_test.cc
parent: ebfd062af849a915bd75eebd81c6fdea15b1d8d5 (diff)
download: art-e16dad1d6388b0305f13e2171308a77f42e7c682.tar.gz
art-e16dad1d6388b0305f13e2171308a77f42e7c682.tar.bz2
art-e16dad1d6388b0305f13e2171308a77f42e7c682.zip
1 files changed, 50 insertions, 0 deletions
diff --git a/runtime/utf_test.cc b/runtime/utf_test.cc
index 8048bbdbe0..94a6ea57e2 100644
--- a/runtime/utf_test.cc
+++ b/runtime/utf_test.cc
@@ -19,6 +19,8 @@
 #include "common_runtime_test.h"
 #include "utf-inl.h"
 
+#include <vector>
+
 namespace art {
 
 class UtfTest : public CommonRuntimeTest {};
@@ -110,4 +112,52 @@ TEST_F(UtfTest, CountModifiedUtf8Chars) {
   EXPECT_EQ(2u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kSurrogateEncoding)));
 }
 
+static void AssertConversion(const std::vector<uint16_t> input,      // 16-bit input units.
+                             const std::vector<uint8_t> expected) {  // Bytes the output must match.
+  ASSERT_EQ(expected.size(), CountUtf8Bytes(&input[0], input.size()));  // Needs non-empty input.
+
+  std::vector<uint8_t> output(expected.size());  // Sized to exactly expected.size() bytes.
+  ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(&output[0]), &input[0], input.size());
+  EXPECT_EQ(expected, output);  // Compares the converted bytes with the expectation.
+}
+
+TEST_F(UtfTest, CountAndConvertUtf8Bytes) {
+  // Surrogate pairs will be converted into 4 byte sequences.
+  AssertConversion({ 0xd801, 0xdc00 }, { 0xf0, 0x90, 0x90, 0x80 });

+  // Three byte encodings of unpaired code units that lie in the trailing
+  // surrogate range (0xdc00 - 0xdfff), i.e. above the leading range.
+  AssertConversion({ 0xdef0 }, { 0xed, 0xbb, 0xb0 });
+  AssertConversion({ 0xdcff }, { 0xed, 0xb3, 0xbf });
+  // Two byte encoding.
+  AssertConversion({ 0x0101 }, { 0xc4, 0x81 });
+
+  // Two byte special case : 0 must use an overlong encoding.
+  AssertConversion({ 0x0101, 0x0000 }, { 0xc4, 0x81, 0xc0, 0x80 });
+
+  // One byte encoding.
+  AssertConversion({ 'h', 'e', 'l', 'l', 'o' }, { 0x68, 0x65, 0x6c, 0x6c, 0x6f });
+
+  AssertConversion({
+      0xd802, 0xdc02,  // Surrogate pair
+      0xdef0, 0xdcff,  // Three byte encodings
+      0x0101, 0x0000,  // Two byte encodings
+      'p'   , 'p'      // One byte encoding
+    }, {
+      0xf0, 0x90, 0xa0, 0x82,
+      0xed, 0xbb, 0xb0, 0xed, 0xb3, 0xbf,
+      0xc4, 0x81, 0xc0, 0x80,
+      0x70, 0x70
+    });
+}
+
+TEST_F(UtfTest, CountAndConvertUtf8Bytes_UnpairedSurrogate) {
+  // Unpaired leading surrogate at the end of input.
+  AssertConversion({ 'h', 'e', 0xd801 }, { 'h', 'e', 0xed, 0xa0, 0x81 });
+  // Unpaired (or incorrectly paired) surrogates in the middle of the input.
+  AssertConversion({ 'h', 0xd801, 'e' }, { 'h', 0xed, 0xa0, 0x81, 'e' });
+  AssertConversion({ 'h', 0xd801, 0xd801, 'e' }, { 'h', 0xed, 0xa0, 0x81, 0xed, 0xa0, 0x81, 'e' });
+  AssertConversion({ 'h', 0xdc00, 0xdc00, 'e' }, { 'h', 0xed, 0xb0, 0x80, 0xed, 0xb0, 0x80, 'e' });
+}
+
 }  // namespace art
author	Narayan Kamath <narayan@google.com>	2015-02-13 11:49:22 +0000
committer	Narayan Kamath <narayan@google.com>	2015-03-25 08:46:51 +0000
commit	e16dad1d6388b0305f13e2171308a77f42e7c682 (patch)
tree	a56d002ba622ecce7f1310e8fac56adca6547d5a /runtime/utf_test.cc
parent	ebfd062af849a915bd75eebd81c6fdea15b1d8d5 (diff)
download	art-e16dad1d6388b0305f13e2171308a77f42e7c682.tar.gz art-e16dad1d6388b0305f13e2171308a77f42e7c682.tar.bz2 art-e16dad1d6388b0305f13e2171308a77f42e7c682.zip