summaryrefslogtreecommitdiffstats
path: root/runtime/utf_test.cc
diff options
context:
space:
mode:
author: Narayan Kamath <narayan@google.com> 2015-02-13 11:49:22 +0000
committer: Narayan Kamath <narayan@google.com> 2015-03-25 08:46:51 +0000
commit: e16dad1d6388b0305f13e2171308a77f42e7c682 (patch)
tree: a56d002ba622ecce7f1310e8fac56adca6547d5a /runtime/utf_test.cc
parent: ebfd062af849a915bd75eebd81c6fdea15b1d8d5 (diff)
downloadart-e16dad1d6388b0305f13e2171308a77f42e7c682.tar.gz
art-e16dad1d6388b0305f13e2171308a77f42e7c682.tar.bz2
art-e16dad1d6388b0305f13e2171308a77f42e7c682.zip
Emit 4 byte UTF-sequences in place of encoded surrogate pairs.
Symmetric with a5afcfc73141e5e378d79a326d0 which converts 4 byte UTF-8 sequences to surrogate pairs. bug: 18848397 Change-Id: I42adc275b7e0df0cbbd9d8a799e8b0447d8f5cae
Diffstat (limited to 'runtime/utf_test.cc')
-rw-r--r-- runtime/utf_test.cc 50
1 files changed, 50 insertions, 0 deletions
diff --git a/runtime/utf_test.cc b/runtime/utf_test.cc
index 8048bbdbe0..94a6ea57e2 100644
--- a/runtime/utf_test.cc
+++ b/runtime/utf_test.cc
@@ -19,6 +19,8 @@
#include "common_runtime_test.h"
#include "utf-inl.h"
+#include <vector>
+
namespace art {
class UtfTest : public CommonRuntimeTest {};
@@ -110,4 +112,52 @@ TEST_F(UtfTest, CountModifiedUtf8Chars) {
EXPECT_EQ(2u, CountModifiedUtf8Chars(reinterpret_cast<const char *>(kSurrogateEncoding)));
}
+// Asserts that |input| encodes to exactly |expected|, checking both byte count and bytes.
+static void AssertConversion(const std::vector<uint16_t>& input,
+ const std::vector<uint8_t>& expected) {
+ ASSERT_EQ(expected.size(), CountUtf8Bytes(input.data(), input.size()));
+ std::vector<uint8_t> output(expected.size());
+ ConvertUtf16ToModifiedUtf8(reinterpret_cast<char*>(output.data()), input.data(), input.size());
+ EXPECT_EQ(expected, output);
+}
+
+TEST_F(UtfTest, CountAndConvertUtf8Bytes) {
+ // Surrogate pairs will be converted into 4 byte sequences.
+ AssertConversion({ 0xd801, 0xdc00 }, { 0xf0, 0x90, 0x90, 0x80 });
+
+ // Unpaired trailing surrogates, which sit above the leading surrogate
+ // range, are emitted as three byte encodings.
+ AssertConversion({ 0xdef0 }, { 0xed, 0xbb, 0xb0 });
+ AssertConversion({ 0xdcff }, { 0xed, 0xb3, 0xbf });
+ // Two byte encoding.
+ AssertConversion({ 0x0101 }, { 0xc4, 0x81 });
+
+ // Two byte special case : 0 must use an overlong encoding.
+ AssertConversion({ 0x0101, 0x0000 }, { 0xc4, 0x81, 0xc0, 0x80 });
+
+ // One byte encoding.
+ AssertConversion({ 'h', 'e', 'l', 'l', 'o' }, { 0x68, 0x65, 0x6c, 0x6c, 0x6f });
+
+ AssertConversion({
+ 0xd802, 0xdc02, // Surrogate pair
+ 0xdef0, 0xdcff, // Three byte encodings
+ 0x0101, 0x0000, // Two byte encodings
+ 'p' , 'p' // One byte encoding
+ }, {
+ 0xf0, 0x90, 0xa0, 0x82,
+ 0xed, 0xbb, 0xb0, 0xed, 0xb3, 0xbf,
+ 0xc4, 0x81, 0xc0, 0x80,
+ 0x70, 0x70
+ });
+}
+
+TEST_F(UtfTest, CountAndConvertUtf8Bytes_UnpairedSurrogate) {
+ // Unpaired leading surrogate at the end of input.
+ AssertConversion({ 'h', 'e', 0xd801 }, { 'h', 'e', 0xed, 0xa0, 0x81 });
+ // Unpaired (or incorrectly paired) surrogates in the middle of the input.
+ AssertConversion({ 'h', 0xd801, 'e' }, { 'h', 0xed, 0xa0, 0x81, 'e' });
+ AssertConversion({ 'h', 0xd801, 0xd801, 'e' }, { 'h', 0xed, 0xa0, 0x81, 0xed, 0xa0, 0x81, 'e' });
+ AssertConversion({ 'h', 0xdc00, 0xdc00, 'e' }, { 'h', 0xed, 0xb0, 0x80, 0xed, 0xb0, 0x80, 'e' });
+}
+
} // namespace art