diff options
author | Daniel Veillard <veillard@redhat.com> | 2012-07-13 19:51:15 +0800 |
---|---|---|
committer | Daniel Veillard <veillard@redhat.com> | 2012-07-23 14:24:26 +0800 |
commit | 18d0db25037978a6a274f55a2c058ad82331965a (patch) | |
tree | c5ea37f8c3dac5677bde55cc174223d741c6b618 | |
parent | ade10f2c57b4bd5c3812b96bce1144d8fa1d189e (diff) | |
download | android_external_libxml2-18d0db25037978a6a274f55a2c058ad82331965a.tar.gz android_external_libxml2-18d0db25037978a6a274f55a2c058ad82331965a.tar.bz2 android_external_libxml2-18d0db25037978a6a274f55a2c058ad82331965a.zip |
Adding new encoding function to deal with the new structures
* encoding.c: adds xmlCharEncFirstLineInput, xmlCharEncInput and
xmlCharEncOutput
* enc.h: the functions are not made public but added to this new header
-rw-r--r-- | enc.h | 29 | ||||
-rw-r--r-- | encoding.c | 483 |
2 files changed, 508 insertions, 4 deletions
@@ -0,0 +1,29 @@ +/* + * enc.h: Internal Interfaces for encoding in libxml2 + * + * See Copyright for the status of this software. + * + * daniel@veillard.com + */ + +#ifndef __XML_ENC_H__ +#define __XML_ENC_H__ + +#include <libxml/tree.h> + +#ifdef __cplusplus +extern "C" { +#endif + +int xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out, + xmlBufferPtr in, int len); +int xmlCharEncFirstLineInput(xmlParserInputBufferPtr input, int len); +int xmlCharEncInput(xmlParserInputBufferPtr input); +int xmlCharEncOutput(xmlOutputBufferPtr output, int init); + +#ifdef __cplusplus +} +#endif +#endif /* __XML_ENC_H__ */ + + @@ -24,6 +24,7 @@ #include "libxml.h" #include <string.h> +#include <limits.h> #ifdef HAVE_CTYPE_H #include <ctype.h> @@ -44,6 +45,9 @@ #include <libxml/globals.h> #include <libxml/xmlerror.h> +#include "buf.h" +#include "enc.h" + static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL; static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL; @@ -1897,9 +1901,6 @@ xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen, * The real API used by libxml for on-the-fly conversion * * * ************************************************************************/ -int -xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out, - xmlBufferPtr in, int len); /** * xmlCharEncFirstLineInt: @@ -1946,7 +1947,7 @@ xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out, toconv = 180; } if (toconv * 2 >= written) { - xmlBufferGrow(out, toconv); + xmlBufferGrow(out, toconv * 2); written = out->size - out->use - 1; } @@ -2029,6 +2030,251 @@ xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out, } /** + * xmlCharEncInput: + * @input: a parser input buffer + * @len: number of bytes to convert for the first line, or -1 + * + * Front-end for the encoding handler input function, but handle only + * the very first line. Point is that this is based on autodetection + * of the encoding and once that first line is converted we may find + * out that a different decoder is needed to process the input. + * + * Returns the number of byte written if success, or + * -1 general error + * -2 if the transcoding fails (for *in is not valid utf8 string or + * the result of transformation can't fit into the encoding we want), or + */ +int +xmlCharEncFirstLineInput(xmlParserInputBufferPtr input, int len) +{ + int ret = -2; + size_t written; + size_t toconv; + int c_in; + int c_out; + xmlBufPtr in; + xmlBufPtr out; + + if ((input == NULL) || (input->encoder == NULL) || + (input->buffer == NULL) || (input->raw == NULL)) + return (-1); + out = input->buffer; + in = input->raw; + + toconv = xmlBufUse(in); + if (toconv == 0) + return (0); + written = xmlBufAvail(out) - 1; /* count '\0' */ + /* + * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38 + * 45 chars should be sufficient to reach the end of the encoding + * declaration without going too far inside the document content. + * on UTF-16 this means 90bytes, on UCS4 this means 180 + * The actual value depending on guessed encoding is passed as @len + * if provided + */ + if (len >= 0) { + if (toconv > (unsigned int) len) + toconv = len; + } else { + if (toconv > 180) + toconv = 180; + } + if (toconv * 2 >= written) { + xmlBufGrow(out, toconv * 2); + written = xmlBufAvail(out) - 1; + } + if (written > 360) + written = 360; + + c_in = toconv; + c_out = written; + if (input->encoder->input != NULL) { + ret = input->encoder->input(xmlBufEnd(out), &c_out, + xmlBufContent(in), &c_in); + xmlBufShrink(in, c_in); + xmlBufAddLen(out, c_out); + } +#ifdef LIBXML_ICONV_ENABLED + else if (input->encoder->iconv_in != NULL) { + ret = xmlIconvWrapper(input->encoder->iconv_in, xmlBufEnd(out), + &c_out, xmlBufContent(in), &c_in); + xmlBufShrink(in, c_in); + xmlBufAddLen(out, c_out); + if (ret == -1) + ret = -3; + } +#endif /* LIBXML_ICONV_ENABLED */ +#ifdef LIBXML_ICU_ENABLED + else if (input->encoder->uconv_in != NULL) { + ret = xmlUconvWrapper(input->encoder->uconv_in, 1, xmlBufEnd(out), + &c_out, xmlBufContent(in), &c_in); + xmlBufShrink(in, c_in); + xmlBufAddLen(out, c_out); + if (ret == -1) + ret = -3; + } +#endif /* LIBXML_ICU_ENABLED */ + switch (ret) { + case 0: +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "converted %d bytes to %d bytes of input\n", + c_in, c_out); +#endif + break; + case -1: +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "converted %d bytes to %d bytes of input, %d left\n", + c_in, c_out, (int)xmlBufUse(in)); +#endif + break; + case -3: +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "converted %d bytes to %d bytes of input, %d left\n", + c_in, c_out, (int)xmlBufUse(in)); +#endif + break; + case -2: { + char buf[50]; + const xmlChar *content = xmlBufContent(in); + + snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X", + content[0], content[1], + content[2], content[3]); + buf[49] = 0; + xmlEncodingErr(XML_I18N_CONV_FAILED, + "input conversion failed due to input error, bytes %s\n", + buf); + } + } + /* + * Ignore when input buffer is not on a boundary + */ + if (ret == -3) ret = 0; + if (ret == -1) ret = 0; + return(ret); +} + +/** + * xmlCharEncInput: + * @input: a parser input buffer + * + * Generic front-end for the encoding handler on parser input + * + * Returns the number of byte written if success, or + * -1 general error + * -2 if the transcoding fails (for *in is not valid utf8 string or + * the result of transformation can't fit into the encoding we want), or + */ +int +xmlCharEncInput(xmlParserInputBufferPtr input) +{ + int ret = -2; + size_t written; + size_t toconv; + int c_in; + int c_out; + xmlBufPtr in; + xmlBufPtr out; + + if ((input == NULL) || (input->encoder == NULL) || + (input->buffer == NULL) || (input->raw == NULL)) + return (-1); + out = input->buffer; + in = input->raw; + + toconv = xmlBufUse(in); + if (toconv == 0) + return (0); + if (toconv > 64 * 1024) + toconv = 64 * 1024; + written = xmlBufAvail(out); + if (written > 0) + written--; /* count '\0' */ + if (toconv * 2 >= written) { + xmlBufGrow(out, toconv * 2); + written = xmlBufAvail(out); + if (written > 0) + written--; /* count '\0' */ + } + if (written > 128 * 1024) + written = 128 * 1024; + + c_in = toconv; + c_out = written; + if (input->encoder->input != NULL) { + ret = input->encoder->input(xmlBufEnd(out), &c_out, + xmlBufContent(in), &c_in); + xmlBufShrink(in, c_in); + xmlBufAddLen(out, c_out); + } +#ifdef LIBXML_ICONV_ENABLED + else if (input->encoder->iconv_in != NULL) { + ret = xmlIconvWrapper(input->encoder->iconv_in, xmlBufEnd(out), + &c_out, xmlBufContent(in), &c_in); + xmlBufShrink(in, c_in); + xmlBufAddLen(out, c_out); + if (ret == -1) + ret = -3; + } +#endif /* LIBXML_ICONV_ENABLED */ +#ifdef LIBXML_ICU_ENABLED + else if (input->encoder->uconv_in != NULL) { + ret = xmlUconvWrapper(input->encoder->uconv_in, 1, xmlBufEnd(out), + &c_out, xmlBufContent(in), &c_in); + xmlBufShrink(in, c_in); + xmlBufAddLen(out, c_out); + if (ret == -1) + ret = -3; + } +#endif /* LIBXML_ICU_ENABLED */ + switch (ret) { + case 0: +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "converted %d bytes to %d bytes of input\n", + c_in, c_out); +#endif + break; + case -1: +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "converted %d bytes to %d bytes of input, %d left\n", + c_in, c_out, (int)xmlBufUse(in)); +#endif + break; + case -3: +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "converted %d bytes to %d bytes of input, %d left\n", + c_in, c_out, (int)xmlBufUse(in)); +#endif + break; + case -2: { + char buf[50]; + const xmlChar *content = xmlBufContent(in); + + snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X", + content[0], content[1], + content[2], content[3]); + buf[49] = 0; + xmlEncodingErr(XML_I18N_CONV_FAILED, + "input conversion failed due to input error, bytes %s\n", + buf); + } + } + /* + * Ignore when input buffer is not on a boundary + */ + if (ret == -3) + ret = 0; + return (c_out? c_out : ret); +} + +/** * xmlCharEncInFunc: * @handler: char encoding transformation data structure * @out: an xmlBuffer for the output. @@ -2136,6 +2382,235 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out, } /** + * xmlCharEncOutput: + * @input: a parser input buffer + * @init: is this an initialization call without data + * + * Generic front-end for the encoding handler on parser output + * a first call with @init == 1 has to be made first to initiate the + * output in case of non-stateless encoding needing to initiate their + * state or the output (like the BOM in UTF16). + * In case of UTF8 sequence conversion errors for the given encoder, + * the content will be automatically remapped to a CharRef sequence. + * + * Returns the number of byte written if success, or + * -1 general error + * -2 if the transcoding fails (for *in is not valid utf8 string or + * the result of transformation can't fit into the encoding we want), or + */ +int +xmlCharEncOutput(xmlOutputBufferPtr output, int init) +{ + int ret = -2; + size_t written; + size_t writtentot = 0; + size_t toconv; + int c_in; + int c_out; + xmlBufPtr in; + xmlBufPtr out; + int charref_len = 0; + + if ((output == NULL) || (output->encoder == NULL) || + (output->buffer == NULL) || (output->conv == NULL)) + return (-1); + out = output->conv; + in = output->buffer; + +retry: + + written = xmlBufAvail(out); + if (written > 0) + written--; /* count '\0' */ + + /* + * First specific handling of the initialization call + */ + if (init) { + c_in = 0; + c_out = written; + if (output->encoder->output != NULL) { + ret = output->encoder->output(xmlBufEnd(out), &c_out, + NULL, &c_in); + if (ret > 0) /* Gennady: check return value */ + xmlBufAddLen(out, c_out); + } +#ifdef LIBXML_ICONV_ENABLED + else if (output->encoder->iconv_out != NULL) { + ret = xmlIconvWrapper(output->encoder->iconv_out, xmlBufEnd(out), + &c_out, NULL, &c_in); + xmlBufAddLen(out, c_out); + } +#endif /* LIBXML_ICONV_ENABLED */ +#ifdef LIBXML_ICU_ENABLED + else if (output->encoder->uconv_out != NULL) { + ret = xmlUconvWrapper(output->encoder->uconv_out, 0, xmlBufEnd(out), + &c_out, NULL, &c_in); + xmlBufAddLen(out, c_out); + } +#endif /* LIBXML_ICU_ENABLED */ +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "initialized encoder\n"); +#endif + return(0); + } + + /* + * Conversion itself. + */ + toconv = xmlBufUse(in); + if (toconv == 0) + return (0); + if (toconv > 64 * 1024) + toconv = 64 * 1024; + if (toconv * 4 >= written) { + xmlBufGrow(out, toconv * 4); + written = xmlBufAvail(out) - 1; + } + if (written > 256 * 1024) + written = 256 * 1024; + + c_in = toconv; + c_out = written; + if (output->encoder->output != NULL) { + ret = output->encoder->output(xmlBufEnd(out), &c_out, + xmlBufContent(in), &c_in); + if (c_out > 0) { + xmlBufShrink(in, c_in); + xmlBufAddLen(out, c_out); + writtentot += c_out; + } + } +#ifdef LIBXML_ICONV_ENABLED + else if (output->encoder->iconv_out != NULL) { + ret = xmlIconvWrapper(output->encoder->iconv_out, xmlBufEnd(out), + &c_out, xmlBufContent(in), &c_in); + xmlBufShrink(in, c_in); + xmlBufAddLen(out, c_out); + writtentot += c_out; + if (ret == -1) { + if (c_out > 0) { + /* + * Can be a limitation of iconv + */ + charref_len = 0; + goto retry; + } + ret = -3; + } + } +#endif /* LIBXML_ICONV_ENABLED */ +#ifdef LIBXML_ICU_ENABLED + else if (output->encoder->uconv_out != NULL) { + ret = xmlUconvWrapper(output->encoder->uconv_out, 0, xmlBufEnd(out), + &c_out, xmlBufContent(in), &c_in); + xmlBufShrink(in, c_in); + xmlBufAddLen(out, c_out); + writtentot += c_out; + if (ret == -1) { + if (c_out > 0) { + /* + * Can be a limitation of uconv + */ + charref_len = 0; + goto retry; + } + ret = -3; + } + } +#endif /* LIBXML_ICU_ENABLED */ + else { + xmlEncodingErr(XML_I18N_NO_OUTPUT, + "xmlCharEncOutFunc: no output function !\n", NULL); + return(-1); + } + + if (ret >= 0) output += ret; + + /* + * Attempt to handle error cases + */ + switch (ret) { + case 0: +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "converted %d bytes to %d bytes of output\n", + c_in, c_out); +#endif + break; + case -1: +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "output conversion failed by lack of space\n"); +#endif + break; + case -3: +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n", + c_in, c_out, (int) xmlBufUse(in)); +#endif + break; + case -2: { + int len = (int) xmlBufUse(in); + xmlChar *content = xmlBufContent(in); + int cur; + + cur = xmlGetUTF8Char(content, &len); + if ((charref_len != 0) && (c_out < charref_len)) { + /* + * We attempted to insert a character reference and failed. + * Undo what was written and skip the remaining charref. + */ + xmlBufErase(out, c_out); + writtentot -= c_out; + xmlBufShrink(in, charref_len - c_out); + charref_len = 0; + + ret = -1; + break; + } else if (cur > 0) { + xmlChar charref[20]; + +#ifdef DEBUG_ENCODING + xmlGenericError(xmlGenericErrorContext, + "handling output conversion error\n"); + xmlGenericError(xmlGenericErrorContext, + "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", + content[0], content[1], + content[2], content[3]); +#endif + /* + * Removes the UTF8 sequence, and replace it by a charref + * and continue the transcoding phase, hoping the error + * did not mangle the encoder state. + */ + charref_len = snprintf((char *) &charref[0], sizeof(charref), + "&#%d;", cur); + xmlBufShrink(in, len); + xmlBufAddHead(in, charref, -1); + + goto retry; + } else { + char buf[50]; + + snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X", + content[0], content[1], + content[2], content[3]); + buf[49] = 0; + xmlEncodingErr(XML_I18N_CONV_FAILED, + "output conversion failed due to conv error, bytes %s\n", + buf); + if (xmlBufGetAllocationScheme(in) != XML_BUFFER_ALLOC_IMMUTABLE) + content[0] = ' '; + } + break; + } + } + return(ret); +} + +/** * xmlCharEncOutFunc: * @handler: char enconding transformation data structure * @out: an xmlBuffer for the output. |