diff options
Diffstat (limited to 'libc/bionic/wchar.cpp')
-rw-r--r-- | libc/bionic/wchar.cpp | 208 |
1 files changed, 15 insertions, 193 deletions
diff --git a/libc/bionic/wchar.cpp b/libc/bionic/wchar.cpp index 5da882fc8..acb27617a 100644 --- a/libc/bionic/wchar.cpp +++ b/libc/bionic/wchar.cpp @@ -27,9 +27,12 @@ */ #include <errno.h> -#include <string.h> #include <sys/param.h> +#include <string.h> #include <wchar.h> +#include <uchar.h> + +#include "private/bionic_mbstate.h" // // This file is basically OpenBSD's citrus_utf8.c but rewritten to not require a @@ -50,36 +53,6 @@ // function pointers. // -#define ERR_ILLEGAL_SEQUENCE static_cast<size_t>(-1) -#define ERR_INCOMPLETE_SEQUENCE static_cast<size_t>(-2) - -static size_t mbstate_bytes_so_far(const mbstate_t* ps) { - return - (ps->__seq[2] != 0) ? 3 : - (ps->__seq[1] != 0) ? 2 : - (ps->__seq[0] != 0) ? 1 : 0; -} - -static void mbstate_set_byte(mbstate_t* ps, int i, char byte) { - ps->__seq[i] = static_cast<uint8_t>(byte); -} - -static uint8_t mbstate_get_byte(const mbstate_t* ps, int n) { - return ps->__seq[n]; -} - -static size_t reset_and_return_illegal(int _errno, mbstate_t* ps) { - errno = _errno; - *(reinterpret_cast<uint32_t*>(ps->__seq)) = 0; - return ERR_ILLEGAL_SEQUENCE; -} - -static size_t reset_and_return(int _return, mbstate_t* ps) { - *(reinterpret_cast<uint32_t*>(ps->__seq)) = 0; - return _return; -} - - int mbsinit(const mbstate_t* ps) { return (ps == NULL || (*(reinterpret_cast<const uint32_t*>(ps->__seq)) == 0)); } @@ -88,104 +61,8 @@ size_t mbrtowc(wchar_t* pwc, const char* s, size_t n, mbstate_t* ps) { static mbstate_t __private_state; mbstate_t* state = (ps == NULL) ? &__private_state : ps; - // We should never get to a state which has all 4 bytes of the sequence set. - // Full state verification is done when decoding the sequence (after we have - // all the bytes). - if (mbstate_get_byte(state, 3) != 0) { - return reset_and_return_illegal(EINVAL, state); - } - - if (s == NULL) { - s = ""; - n = 1; - pwc = NULL; - } - - if (n == 0) { - return 0; - } - - uint8_t ch; - if (mbsinit(state) && (((ch = static_cast<uint8_t>(*s)) & ~0x7f) == 0)) { - // Fast path for plain ASCII characters. - if (pwc != NULL) { - *pwc = ch; - } - return (ch != '\0' ? 1 : 0); - } - - // Determine the number of octets that make up this character - // from the first octet, and a mask that extracts the - // interesting bits of the first octet. We already know - // the character is at least two bytes long. - size_t length; - int mask; - - // We also specify a lower bound for the character code to - // detect redundant, non-"shortest form" encodings. For - // example, the sequence C0 80 is _not_ a legal representation - // of the null character. This enforces a 1-to-1 mapping - // between character codes and their multibyte representations. - wchar_t lower_bound; - - // The first byte in the state (if any) tells the length. - size_t bytes_so_far = mbstate_bytes_so_far(state); - ch = bytes_so_far > 0 ? mbstate_get_byte(state, 0) : static_cast<uint8_t>(*s); - if ((ch & 0x80) == 0) { - mask = 0x7f; - length = 1; - lower_bound = 0; - } else if ((ch & 0xe0) == 0xc0) { - mask = 0x1f; - length = 2; - lower_bound = 0x80; - } else if ((ch & 0xf0) == 0xe0) { - mask = 0x0f; - length = 3; - lower_bound = 0x800; - } else if ((ch & 0xf8) == 0xf0) { - mask = 0x07; - length = 4; - lower_bound = 0x10000; - } else { - // Malformed input; input is not UTF-8. See RFC 3629. - return reset_and_return_illegal(EILSEQ, state); - } - - // Fill in the state. - size_t bytes_wanted = length - bytes_so_far; - size_t i; - for (i = 0; i < MIN(bytes_wanted, n); i++) { - if (!mbsinit(state) && ((*s & 0xc0) != 0x80)) { - // Malformed input; bad characters in the middle of a character. - return reset_and_return_illegal(EILSEQ, state); - } - mbstate_set_byte(state, bytes_so_far + i, *s++); - } - if (i < bytes_wanted) { - return ERR_INCOMPLETE_SEQUENCE; - } - - // Decode the octet sequence representing the character in chunks - // of 6 bits, most significant first. - wchar_t wch = mbstate_get_byte(state, 0) & mask; - for (i = 1; i < length; i++) { - wch <<= 6; - wch |= mbstate_get_byte(state, i) & 0x3f; - } - - if (wch < lower_bound) { - // Malformed input; redundant encoding. - return reset_and_return_illegal(EILSEQ, state); - } - if ((wch >= 0xd800 && wch <= 0xdfff) || wch == 0xfffe || wch == 0xffff) { - // Malformed input; invalid code points. - return reset_and_return_illegal(EILSEQ, state); - } - if (pwc != NULL) { - *pwc = wch; - } - return reset_and_return(wch == L'\0' ? 0 : bytes_wanted, state); + // Our wchar_t is UTF-32 + return mbrtoc32(reinterpret_cast<char32_t*>(pwc), s, n, state); } size_t mbsnrtowcs(wchar_t* dst, const char** src, size_t nmc, size_t len, mbstate_t* ps) { @@ -212,10 +89,10 @@ size_t mbsnrtowcs(wchar_t* dst, const char** src, size_t nmc, size_t len, mbstat r = 1; } else { r = mbrtowc(NULL, *src + i, nmc - i, state); - if (r == ERR_ILLEGAL_SEQUENCE) { + if (r == __MB_ERR_ILLEGAL_SEQUENCE) { return reset_and_return_illegal(EILSEQ, state); } - if (r == ERR_INCOMPLETE_SEQUENCE) { + if (r == __MB_ERR_INCOMPLETE_SEQUENCE) { return reset_and_return_illegal(EILSEQ, state); } if (r == 0) { @@ -246,11 +123,11 @@ size_t mbsnrtowcs(wchar_t* dst, const char** src, size_t nmc, size_t len, mbstat r = 1; } else { r = mbrtowc(dst + o, *src + i, nmc - i, state); - if (r == ERR_ILLEGAL_SEQUENCE) { + if (r == __MB_ERR_ILLEGAL_SEQUENCE) { *src += i; return reset_and_return_illegal(EILSEQ, state); } - if (r == ERR_INCOMPLETE_SEQUENCE) { + if (r == __MB_ERR_INCOMPLETE_SEQUENCE) { *src += nmc; return reset_and_return(EILSEQ, state); } @@ -272,63 +149,8 @@ size_t wcrtomb(char* s, wchar_t wc, mbstate_t* ps) { static mbstate_t __private_state; mbstate_t* state = (ps == NULL) ? &__private_state : ps; - if (s == NULL) { - // Equivalent to wcrtomb(buf, L'\0', ps). - return reset_and_return(1, state); - } - - // POSIX states that if wc is a null wide character, a null byte shall be - // stored, preceded by any shift sequence needed to restore the initial shift - // state. Since shift states are not supported, only the null byte is stored. - if (wc == L'\0') { - *s = '\0'; - reset_and_return(1, state); - } - - if (!mbsinit(state)) { - return reset_and_return_illegal(EILSEQ, state); - } - - if ((wc & ~0x7f) == 0) { - // Fast path for plain ASCII characters. - *s = wc; - return 1; - } - - // Determine the number of octets needed to represent this character. - // We always output the shortest sequence possible. Also specify the - // first few bits of the first octet, which contains the information - // about the sequence length. - uint8_t lead; - size_t length; - if ((wc & ~0x7f) == 0) { - lead = 0; - length = 1; - } else if ((wc & ~0x7ff) == 0) { - lead = 0xc0; - length = 2; - } else if ((wc & ~0xffff) == 0) { - lead = 0xe0; - length = 3; - } else if ((wc & ~0x1fffff) == 0) { - lead = 0xf0; - length = 4; - } else { - errno = EILSEQ; - return ERR_ILLEGAL_SEQUENCE; - } - - // Output the octets representing the character in chunks - // of 6 bits, least significant last. The first octet is - // a special case because it contains the sequence length - // information. - for (size_t i = length - 1; i > 0; i--) { - s[i] = (wc & 0x3f) | 0x80; - wc >>= 6; - } - *s = (wc & 0xff) | lead; - - return length; + // Our wchar_t is UTF-32 + return c32rtomb(s, static_cast<char32_t>(wc), state); } size_t wcsnrtombs(char* dst, const wchar_t** src, size_t nwc, size_t len, mbstate_t* ps) { @@ -352,7 +174,7 @@ size_t wcsnrtombs(char* dst, const wchar_t** src, size_t nwc, size_t len, mbstat r = 1; } else { r = wcrtomb(buf, wc, state); - if (r == ERR_ILLEGAL_SEQUENCE) { + if (r == __MB_ERR_ILLEGAL_SEQUENCE) { return r; } } @@ -373,14 +195,14 @@ size_t wcsnrtombs(char* dst, const wchar_t** src, size_t nwc, size_t len, mbstat } else if (len - o >= sizeof(buf)) { // Enough space to translate in-place. r = wcrtomb(dst + o, wc, state); - if (r == ERR_ILLEGAL_SEQUENCE) { + if (r == __MB_ERR_ILLEGAL_SEQUENCE) { *src += i; return r; } } else { // May not be enough space; use temp buffer. r = wcrtomb(buf, wc, state); - if (r == ERR_ILLEGAL_SEQUENCE) { + if (r == __MB_ERR_ILLEGAL_SEQUENCE) { *src += i; return r; } |