/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Validate and manipulate MUTF-8 encoded string data.
 */

#include "DexUtf.h"

/*
 * Compare two '\0'-terminated modified UTF-8 strings by the 16-bit values
 * that dexGetUtf16FromUtf8() decodes from them, one value at a time.
 * Different byte encodings that decode to the same value therefore compare
 * equal; only a literal '\0' byte terminates a string.
 *
 * Returns 0 if the strings are equal, -1 if s1 is a proper prefix of s2,
 * 1 if s2 is a proper prefix of s1, and otherwise the difference between
 * the first pair of decoded values that differ (sign as for strcmp()).
 */
int dexUtf8Cmp(const char* s1, const char* s2) {
    /* Walk both strings in lockstep while neither has hit its terminator. */
    while ((*s1 != '\0') && (*s2 != '\0')) {
        int left = dexGetUtf16FromUtf8(&s1);
        int right = dexGetUtf16FromUtf8(&s2);

        if (left != right) {
            return left - right;
        }
    }

    /* At least one string is exhausted; the longer one sorts later. */
    if (*s1 != '\0') {
        return 1;
    }

    return (*s2 == '\0') ? 0 : -1;
}

/*
 * Bit vector used by dexIsValidMemberNameUtf8(): one word per block of 32
 * low-ASCII values, with a set bit marking a character that is valid in a
 * member name.
 */
u4 DEX_MEMBER_VALID_LOW_ASCII[4] = {
    0x00000000, /* 00..1f: low control characters; nothing valid */
    0x03ff2010, /* 20..3f: digits and symbols; valid: '0'..'9', '$', '-' */
    0x87fffffe, /* 40..5f: uppercase etc.; valid: 'A'..'Z', '_' */
    0x07fffffe  /* 60..7f: lowercase etc.; valid: 'a'..'z' */
};

/*
 * Helper for dexIsValidMemberNameUtf8(); do not call directly.
 *
 * Handles a multibyte-encoded character: decodes it through
 * dexGetUtf16FromUtf8() (advancing *pUtf8Ptr) and returns whether it may
 * appear in a member name. Everything is accepted except:
 *   (a) an improperly encoded low value,
 *   (b) an improper surrogate pair,
 *   (c) an encoded '\0',
 *   (d) a high control character, or
 *   (e) a high space, layout, or special character (U+00a0,
 *       U+2000..U+200f, U+2028..U+202f, U+fff0..U+ffff).
 * These rules come from the dex format document.
 */
bool dexIsValidMemberNameUtf8_0(const char** pUtf8Ptr) {
    const u2 unit = dexGetUtf16FromUtf8(pUtf8Ptr);
    const int highByte = unit >> 8;

    if (highByte == 0x00) {
        /* Valid only above the ISO-8859-1 high space (0xa0). */
        return (unit > 0x00a0);
    }

    if ((highByte >= 0xd8) && (highByte <= 0xdb)) {
        /* Leading surrogate: the next unit must be a trailing surrogate. */
        const u2 trail = dexGetUtf16FromUtf8(pUtf8Ptr);
        return (trail >= 0xdc00) && (trail <= 0xdfff);
    }

    if ((highByte >= 0xdc) && (highByte <= 0xdf)) {
        /* A trailing surrogate with no leading surrogate before it. */
        return false;
    }

    if ((highByte == 0x20) || (highByte == 0xff)) {
        /*
         * These pages hold the excluded spaces, controls, and specials.
         * Masking off the low three bits checks eight code points at once.
         */
        const int octet = unit & 0xfff8;

        if ((octet == 0x2000) || (octet == 0x2008) || (octet == 0x2028) ||
            (octet == 0xfff0) || (octet == 0xfff8)) {
            return false;
        }
    }

    return true;
}

/*
 * Return whether the given string is a valid field or method name.
 *
 * The empty string is rejected. A name may start with '<', in which case it
 * must also end with '>' as its very last character; '<' and '>' are not
 * accepted anywhere else. Every other character is checked with
 * dexIsValidMemberNameUtf8().
 */
bool dexIsValidMemberName(const char* s) {
    if (*s == '\0') {
        /* The empty string is not a valid name. */
        return false;
    }

    /* A leading '<' obliges the name to finish with '>'. */
    const bool bracketed = (*s == '<');
    if (bracketed) {
        s++;
    }

    /* Consume ordinary name characters up to the terminator or a '>'. */
    while ((*s != '\0') && (*s != '>')) {
        if (!dexIsValidMemberNameUtf8(&s)) {
            return false;
        }
    }

    if (*s == '>') {
        /* Only legal as the final character of a '<'-prefixed name. */
        return bracketed && (s[1] == '\0');
    }

    /* Reached the end of the string: fine unless a '>' is still owed. */
    return !bracketed;
}

/*
 * Helper for validating type descriptors and class names, parametric with
 * respect to type vs. class and dot vs. slash.
 *
 * s            - the '\0'-terminated string to check
 * isClassName  - true to accept an unadorned class name, false to require
 *                a type descriptor; ignored (treated as false) when the
 *                string starts with one or more '[' characters
 * dotSeparator - true if package components must be separated by '.',
 *                false if they must be separated by '/'
 *
 * Returns true if the string is well formed under those rules.
 */
static bool isValidTypeDescriptorOrClassName(const char* s,
        bool isClassName, bool dotSeparator) {
    /* Count and skip the leading array-dimension markers. */
    int dimensions = 0;
    for (; *s == '['; s++) {
        dimensions++;
    }

    if (dimensions > 255) {
        /* Arrays may have no more than 255 dimensions. */
        return false;
    }

    /*
     * An array looks the same whether a class name or a type descriptor
     * was asked for, so anything with dimensions is parsed as a descriptor.
     */
    const bool wantClassName = isClassName && (dimensions == 0);

    if (!wantClassName) {
        /*
         * Descriptor: either a single-character primitive type, or a class
         * name bracketed by 'L' and ';' that is checked further below.
         */
        const char tag = *(s++);

        switch (tag) {
            case 'L': {
                /* Class name follows; fall out to the component scan. */
                break;
            }
            case 'V': {
                /* Plain void is fine, but there are no arrays of void. */
                return (dimensions == 0) && (*s == '\0');
            }
            case 'B':
            case 'C':
            case 'D':
            case 'F':
            case 'I':
            case 'J':
            case 'S':
            case 'Z': {
                /* Primitive types: nothing may follow the type character. */
                return (*s == '\0');
            }
            default: {
                /* Not a recognized descriptor character. */
                return false;
            }
        }
    }

    /*
     * Either the introducing 'L' has just been consumed, or an unadorned
     * class name is expected. Scan the separator-delimited components.
     */
    bool componentEmpty = true; /* at the start, or just after a separator */

    for (;;) {
        const u1 c = (u1) *s;

        if (c == '\0') {
            /*
             * Premature end for a type descriptor, but a valid end for a
             * class name provided the last component is not empty (this
             * also rejects the empty string "").
             */
            return wantClassName && !componentEmpty;
        }

        if (c == ';') {
            /*
             * Never valid inside a class name. For a descriptor it must be
             * the last character and must not follow an empty component
             * (this also rejects "L;").
             */
            return !wantClassName && !componentEmpty && (s[1] == '\0');
        }

        if ((c == '/') || (c == '.')) {
            if (dotSeparator != (c == '.')) {
                /* The wrong separator character. */
                return false;
            }
            if (componentEmpty) {
                /* Separator at the start, or two separators in a row. */
                return false;
            }
            componentEmpty = true;
            s++;
            continue;
        }

        /* Ordinary character: it must be legal in a member name. */
        if (!dexIsValidMemberNameUtf8(&s)) {
            return false;
        }
        componentEmpty = false;
    }
}

/*
 * Return whether the given string is a valid type descriptor
 * (slash-separated class components).
 */
bool dexIsValidTypeDescriptor(const char* s) {
    return isValidTypeDescriptorOrClassName(s, false, false);
}

/* (documented in header) */
bool dexIsValidClassName(const char* s, bool dotSeparator) {
    return isValidTypeDescriptorOrClassName(s, true, dotSeparator);
}

/*
 * Return whether the given string is a valid reference descriptor: it must
 * pass dexIsValidTypeDescriptor() and describe a class or an array rather
 * than a primitive type.
 */
bool dexIsReferenceDescriptor(const char* s) {
    return dexIsValidTypeDescriptor(s) && ((s[0] == 'L') || (s[0] == '['));
}

/*
 * Return whether the given string is a valid class descriptor: it must
 * pass dexIsValidTypeDescriptor() and describe a class, not an array or a
 * primitive type.
 */
bool dexIsClassDescriptor(const char* s) {
    return dexIsValidTypeDescriptor(s) && (s[0] == 'L');
}

/*
 * Return whether the given string is a valid field type descriptor: it must
 * pass dexIsValidTypeDescriptor() and be anything other than "void".
 */
bool dexIsFieldDescriptor(const char* s) {
    return dexIsValidTypeDescriptor(s) && (s[0] != 'V');
}