summary | refs | log | tree | commit | diff | stats
path: root/runtime/utf-inl.h
diff options
context:
space:
mode:
author: Narayan Kamath <narayan@google.com> 2015-01-29 20:06:46 +0000
committer: Narayan Kamath <narayan@google.com> 2015-02-12 11:54:37 +0000
commit: a5afcfc73141e5e378d79a326d02c5c2039fb025 (patch)
tree: 424add9558fb816c4f1d2f4edd128f4f2a086d9a /runtime/utf-inl.h
parent: 5a3399deaf448c8434d9ba0916ff799b1b791d95 (diff)
download: art-a5afcfc73141e5e378d79a326d02c5c2039fb025.tar.gz
art-a5afcfc73141e5e378d79a326d02c5c2039fb025.tar.bz2
art-a5afcfc73141e5e378d79a326d02c5c2039fb025.zip
Be more lenient with 4 byte UTF-8 sequences.
Accept 4-byte sequences and convert them into surrogate pairs, instead of expecting two separate 3-byte sequences that each encode one half of a surrogate pair.

Note that in addition to supporting 4-byte sequences in strings from JNI, we also tolerate them in dex files. This is mainly for consistency, and there's no need to claim any sort of official support.

bug: 18848397
bug: https://code.google.com/p/android/issues/detail?id=81341
Change-Id: Ibc98d29e59d98803e640f2489ea4c56912a59b29
Diffstat (limited to 'runtime/utf-inl.h')
-rw-r--r-- runtime/utf-inl.h 100
1 files changed, 49 insertions, 51 deletions
diff --git a/runtime/utf-inl.h b/runtime/utf-inl.h
index 1373d1704..b2d6765fb 100644
--- a/runtime/utf-inl.h
+++ b/runtime/utf-inl.h
@@ -21,26 +21,57 @@
namespace art {
-inline uint16_t GetUtf16FromUtf8(const char** utf8_data_in) {
- uint8_t one = *(*utf8_data_in)++;
+inline uint16_t GetTrailingUtf16Char(uint32_t maybe_pair) {
+ return static_cast<uint16_t>(maybe_pair >> 16);
+}
+
+inline uint16_t GetLeadingUtf16Char(uint32_t maybe_pair) {
+ return static_cast<uint16_t>(maybe_pair & 0x0000FFFF);
+}
+
+inline uint32_t GetUtf16FromUtf8(const char** utf8_data_in) {
+ const uint8_t one = *(*utf8_data_in)++;
if ((one & 0x80) == 0) {
// one-byte encoding
return one;
}
- // two- or three-byte encoding
- uint8_t two = *(*utf8_data_in)++;
+
+ const uint8_t two = *(*utf8_data_in)++;
if ((one & 0x20) == 0) {
// two-byte encoding
return ((one & 0x1f) << 6) | (two & 0x3f);
}
- // three-byte encoding
- uint8_t three = *(*utf8_data_in)++;
- return ((one & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f);
+
+ const uint8_t three = *(*utf8_data_in)++;
+ if ((one & 0x10) == 0) {
+ return ((one & 0x0f) << 12) | ((two & 0x3f) << 6) | (three & 0x3f);
+ }
+
+ // Four byte encodings need special handling. We'll have
+ // to convert them into a surrogate pair.
+ const uint8_t four = *(*utf8_data_in)++;
+
+ // Since this is a 4 byte UTF-8 sequence, it will lie between
+ // U+10000 and U+1FFFFF.
+ //
+ // TODO: What do we do about values in (U+10FFFF, U+1FFFFF) ? The
+ // spec says they're invalid but nobody appears to check for them.
+ const uint32_t code_point = ((one & 0x0f) << 18) | ((two & 0x3f) << 12)
+ | ((three & 0x3f) << 6) | (four & 0x3f);
+
+ uint32_t surrogate_pair = 0;
+ // Step two: Write out the high (leading) surrogate to the bottom 16 bits
+ // of the 32 bit type.
+ surrogate_pair |= ((code_point >> 10) + 0xd7c0) & 0xffff;
+ // Step three : Write out the low (trailing) surrogate to the top 16 bits.
+ surrogate_pair |= ((code_point & 0x03ff) + 0xdc00) << 16;
+
+ return surrogate_pair;
}
inline int CompareModifiedUtf8ToModifiedUtf8AsUtf16CodePointValues(const char* utf8_1,
const char* utf8_2) {
- uint16_t c1, c2;
+ uint32_t c1, c2;
do {
c1 = *utf8_1;
c2 = *utf8_2;
@@ -50,50 +81,17 @@ inline int CompareModifiedUtf8ToModifiedUtf8AsUtf16CodePointValues(const char* u
} else if (c2 == 0) {
return 1;
}
- // Assume 1-byte value and handle all cases first.
- utf8_1++;
- utf8_2++;
- if ((c1 & 0x80) == 0) {
- if (c1 == c2) {
- // Matching 1-byte values.
- continue;
- } else {
- // Non-matching values.
- if ((c2 & 0x80) == 0) {
- // 1-byte value, do nothing.
- } else if ((c2 & 0x20) == 0) {
- // 2-byte value.
- c2 = ((c2 & 0x1f) << 6) | (*utf8_2 & 0x3f);
- } else {
- // 3-byte value.
- c2 = ((c2 & 0x0f) << 12) | ((utf8_2[0] & 0x3f) << 6) | (utf8_2[1] & 0x3f);
- }
- return static_cast<int>(c1) - static_cast<int>(c2);
- }
- }
- // Non-matching or multi-byte values.
- if ((c1 & 0x20) == 0) {
- // 2-byte value.
- c1 = ((c1 & 0x1f) << 6) | (*utf8_1 & 0x3f);
- utf8_1++;
- } else {
- // 3-byte value.
- c1 = ((c1 & 0x0f) << 12) | ((utf8_1[0] & 0x3f) << 6) | (utf8_1[1] & 0x3f);
- utf8_1 += 2;
- }
- if ((c2 & 0x80) == 0) {
- // 1-byte value, do nothing.
- } else if ((c2 & 0x20) == 0) {
- // 2-byte value.
- c2 = ((c2 & 0x1f) << 6) | (*utf8_2 & 0x3f);
- utf8_2++;
- } else {
- // 3-byte value.
- c2 = ((c2 & 0x0f) << 12) | ((utf8_2[0] & 0x3f) << 6) | (utf8_2[1] & 0x3f);
- utf8_2 += 2;
- }
+
+ c1 = GetUtf16FromUtf8(&utf8_1);
+ c2 = GetUtf16FromUtf8(&utf8_2);
} while (c1 == c2);
- return static_cast<int>(c1) - static_cast<int>(c2);
+
+ const uint32_t leading_surrogate_diff = GetLeadingUtf16Char(c1) - GetLeadingUtf16Char(c2);
+ if (leading_surrogate_diff != 0) {
+ return static_cast<int>(leading_surrogate_diff);
+ }
+
+ return GetTrailingUtf16Char(c1) - GetTrailingUtf16Char(c2);
}
} // namespace art