aboutsummaryrefslogtreecommitdiffstats
path: root/encoding.c
diff options
context:
space:
mode:
authorDaniel Veillard <veillard@redhat.com>2012-07-13 19:51:15 +0800
committerDaniel Veillard <veillard@redhat.com>2012-07-23 14:24:26 +0800
commit18d0db25037978a6a274f55a2c058ad82331965a (patch)
treec5ea37f8c3dac5677bde55cc174223d741c6b618 /encoding.c
parentade10f2c57b4bd5c3812b96bce1144d8fa1d189e (diff)
downloadandroid_external_libxml2-18d0db25037978a6a274f55a2c058ad82331965a.tar.gz
android_external_libxml2-18d0db25037978a6a274f55a2c058ad82331965a.tar.bz2
android_external_libxml2-18d0db25037978a6a274f55a2c058ad82331965a.zip
Adding new encoding function to deal with the new structures
* encoding.c: adds xmlCharEncFirstLineInput, xmlCharEncInput and xmlCharEncOutput * enc.h: the functions are not made public but added to this new header
Diffstat (limited to 'encoding.c')
-rw-r--r--encoding.c483
1 files changed, 479 insertions, 4 deletions
diff --git a/encoding.c b/encoding.c
index d486dd6e..e7f563f0 100644
--- a/encoding.c
+++ b/encoding.c
@@ -24,6 +24,7 @@
#include "libxml.h"
#include <string.h>
+#include <limits.h>
#ifdef HAVE_CTYPE_H
#include <ctype.h>
@@ -44,6 +45,9 @@
#include <libxml/globals.h>
#include <libxml/xmlerror.h>
+#include "buf.h"
+#include "enc.h"
+
static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
@@ -1897,9 +1901,6 @@ xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
* The real API used by libxml for on-the-fly conversion *
* *
************************************************************************/
-int
-xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out,
- xmlBufferPtr in, int len);
/**
* xmlCharEncFirstLineInt:
@@ -1946,7 +1947,7 @@ xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out,
toconv = 180;
}
if (toconv * 2 >= written) {
- xmlBufferGrow(out, toconv);
+ xmlBufferGrow(out, toconv * 2);
written = out->size - out->use - 1;
}
@@ -2029,6 +2030,251 @@ xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
}
/**
+ * xmlCharEncFirstLineInput:
+ * @input: a parser input buffer
+ * @len: number of bytes to convert for the first line, or -1
+ *
+ * Front-end for the encoding handler input function, but handle only
+ * the very first line. Point is that this is based on autodetection
+ * of the encoding and once that first line is converted we may find
+ * out that a different decoder is needed to process the input.
+ *
+ * Bytes are taken from @input->raw, converted with @input->encoder and
+ * appended to @input->buffer; the consumed bytes are removed from
+ * @input->raw.
+ *
+ * Returns the non-negative value reported by the converter if success, or
+ *     -1 general error (NULL @input, encoder or buffers)
+ *     -2 if the transcoding fails (for *in is not valid utf8 string or
+ *        the result of transformation can't fit into the encoding we want)
+ * Converter results -1 and -3 are mapped to 0: an input buffer which does
+ * not end on a boundary is not reported as an error.
+ */
+int
+xmlCharEncFirstLineInput(xmlParserInputBufferPtr input, int len)
+{
+    int ret = -2;
+    size_t written;
+    size_t toconv;
+    int c_in;
+    int c_out;
+    xmlBufPtr in;
+    xmlBufPtr out;
+
+    if ((input == NULL) || (input->encoder == NULL) ||
+        (input->buffer == NULL) || (input->raw == NULL))
+        return (-1);
+    out = input->buffer;
+    in = input->raw;
+
+    toconv = xmlBufUse(in);
+    if (toconv == 0)
+        return (0);
+    /*
+     * Keep one byte for the terminating '\0'. @written is a size_t, so a
+     * plain "- 1" would wrap around to a huge value when no space is left;
+     * only decrement when there is something to decrement.
+     */
+    written = xmlBufAvail(out);
+    if (written > 0)
+        written--; /* count '\0' */
+    /*
+     * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
+     * 45 chars should be sufficient to reach the end of the encoding
+     * declaration without going too far inside the document content.
+     * on UTF-16 this means 90bytes, on UCS4 this means 180
+     * The actual value depending on guessed encoding is passed as @len
+     * if provided
+     */
+    if (len >= 0) {
+        if (toconv > (unsigned int) len)
+            toconv = len;
+    } else {
+        if (toconv > 180)
+            toconv = 180;
+    }
+    if (toconv * 2 >= written) {
+        xmlBufGrow(out, toconv * 2);
+        /* same guarded "minus one for '\0'" as above */
+        written = xmlBufAvail(out);
+        if (written > 0)
+            written--;
+    }
+    /* never produce more than twice the 180 bytes input cap */
+    if (written > 360)
+        written = 360;
+
+    c_in = toconv;
+    c_out = written;
+    if (input->encoder->input != NULL) {
+        ret = input->encoder->input(xmlBufEnd(out), &c_out,
+                                    xmlBufContent(in), &c_in);
+        xmlBufShrink(in, c_in);
+        xmlBufAddLen(out, c_out);
+    }
+#ifdef LIBXML_ICONV_ENABLED
+    else if (input->encoder->iconv_in != NULL) {
+        ret = xmlIconvWrapper(input->encoder->iconv_in, xmlBufEnd(out),
+                              &c_out, xmlBufContent(in), &c_in);
+        xmlBufShrink(in, c_in);
+        xmlBufAddLen(out, c_out);
+        if (ret == -1)
+            ret = -3;
+    }
+#endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+    else if (input->encoder->uconv_in != NULL) {
+        ret = xmlUconvWrapper(input->encoder->uconv_in, 1, xmlBufEnd(out),
+                              &c_out, xmlBufContent(in), &c_in);
+        xmlBufShrink(in, c_in);
+        xmlBufAddLen(out, c_out);
+        if (ret == -1)
+            ret = -3;
+    }
+#endif /* LIBXML_ICU_ENABLED */
+    switch (ret) {
+        case 0:
+#ifdef DEBUG_ENCODING
+            xmlGenericError(xmlGenericErrorContext,
+                            "converted %d bytes to %d bytes of input\n",
+                            c_in, c_out);
+#endif
+            break;
+        case -1:
+#ifdef DEBUG_ENCODING
+            xmlGenericError(xmlGenericErrorContext,
+                         "converted %d bytes to %d bytes of input, %d left\n",
+                            c_in, c_out, (int)xmlBufUse(in));
+#endif
+            break;
+        case -3:
+#ifdef DEBUG_ENCODING
+            xmlGenericError(xmlGenericErrorContext,
+                        "converted %d bytes to %d bytes of input, %d left\n",
+                            c_in, c_out, (int)xmlBufUse(in));
+#endif
+            break;
+        case -2: {
+            /* report the first bytes left in the raw buffer */
+            char buf[50];
+            const xmlChar *content = xmlBufContent(in);
+
+            snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X",
+                     content[0], content[1],
+                     content[2], content[3]);
+            buf[49] = 0;
+            xmlEncodingErr(XML_I18N_CONV_FAILED,
+                    "input conversion failed due to input error, bytes %s\n",
+                           buf);
+        }
+    }
+    /*
+     * Ignore when input buffer is not on a boundary
+     */
+    if (ret == -3) ret = 0;
+    if (ret == -1) ret = 0;
+    return(ret);
+}
+
+/**
+ * xmlCharEncInput:
+ * @input: a parser input buffer
+ *
+ * Generic front-end for the encoding handler on parser input
+ *
+ * Bytes are taken from @input->raw (at most 64KB per call), converted
+ * with @input->encoder and appended to @input->buffer; the consumed bytes
+ * are removed from @input->raw.
+ *
+ * Returns the number of byte written if success, or
+ *     -1 general error
+ *     -2 if the transcoding fails (for *in is not valid utf8 string or
+ *        the result of transformation can't fit into the encoding we want)
+ * When some output was produced its size is returned even if the
+ * converter reported a problem afterwards.
+ */
+int
+xmlCharEncInput(xmlParserInputBufferPtr input)
+{
+    int ret = -2;
+    size_t written;
+    size_t toconv;
+    int c_in;
+    int c_out;
+    xmlBufPtr in;
+    xmlBufPtr out;
+
+    if ((input == NULL) || (input->encoder == NULL) ||
+        (input->buffer == NULL) || (input->raw == NULL))
+        return (-1);
+    out = input->buffer;
+    in = input->raw;
+
+    toconv = xmlBufUse(in);
+    if (toconv == 0)
+        return (0);
+    if (toconv > 64 * 1024)
+        toconv = 64 * 1024;
+    written = xmlBufAvail(out);
+    if (written > 0)
+        written--; /* count '\0' */
+    if (toconv * 2 >= written) {
+        xmlBufGrow(out, toconv * 2);
+        written = xmlBufAvail(out);
+        if (written > 0)
+            written--; /* count '\0' */
+    }
+    if (written > 128 * 1024)
+        written = 128 * 1024;
+
+    c_in = toconv;
+    c_out = written;
+    if (input->encoder->input != NULL) {
+        ret = input->encoder->input(xmlBufEnd(out), &c_out,
+                                    xmlBufContent(in), &c_in);
+        xmlBufShrink(in, c_in);
+        xmlBufAddLen(out, c_out);
+    }
+#ifdef LIBXML_ICONV_ENABLED
+    else if (input->encoder->iconv_in != NULL) {
+        ret = xmlIconvWrapper(input->encoder->iconv_in, xmlBufEnd(out),
+                              &c_out, xmlBufContent(in), &c_in);
+        xmlBufShrink(in, c_in);
+        xmlBufAddLen(out, c_out);
+        if (ret == -1)
+            ret = -3;
+    }
+#endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+    else if (input->encoder->uconv_in != NULL) {
+        ret = xmlUconvWrapper(input->encoder->uconv_in, 1, xmlBufEnd(out),
+                              &c_out, xmlBufContent(in), &c_in);
+        xmlBufShrink(in, c_in);
+        xmlBufAddLen(out, c_out);
+        if (ret == -1)
+            ret = -3;
+    }
+#endif /* LIBXML_ICU_ENABLED */
+    else {
+        /*
+         * No converter is available: nothing was read nor written.
+         * Reset the counters, otherwise the free space estimate still
+         * held in c_out would be returned below as if that many bytes
+         * had been converted. ret keeps its initial -2 value.
+         */
+        c_in = 0;
+        c_out = 0;
+    }
+    switch (ret) {
+        case 0:
+#ifdef DEBUG_ENCODING
+            xmlGenericError(xmlGenericErrorContext,
+                            "converted %d bytes to %d bytes of input\n",
+                            c_in, c_out);
+#endif
+            break;
+        case -1:
+#ifdef DEBUG_ENCODING
+            xmlGenericError(xmlGenericErrorContext,
+                         "converted %d bytes to %d bytes of input, %d left\n",
+                            c_in, c_out, (int)xmlBufUse(in));
+#endif
+            break;
+        case -3:
+#ifdef DEBUG_ENCODING
+            xmlGenericError(xmlGenericErrorContext,
+                        "converted %d bytes to %d bytes of input, %d left\n",
+                            c_in, c_out, (int)xmlBufUse(in));
+#endif
+            break;
+        case -2: {
+            /* report the first bytes left in the raw buffer */
+            char buf[50];
+            const xmlChar *content = xmlBufContent(in);
+
+            snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X",
+                     content[0], content[1],
+                     content[2], content[3]);
+            buf[49] = 0;
+            xmlEncodingErr(XML_I18N_CONV_FAILED,
+                    "input conversion failed due to input error, bytes %s\n",
+                           buf);
+        }
+    }
+    /*
+     * Ignore when input buffer is not on a boundary
+     */
+    if (ret == -3)
+        ret = 0;
+    /* prefer the amount of output produced over the converter status */
+    return (c_out? c_out : ret);
+}
+
+/**
* xmlCharEncInFunc:
* @handler: char encoding transformation data structure
* @out: an xmlBuffer for the output.
@@ -2136,6 +2382,235 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out,
}
/**
+ * xmlCharEncOutput:
+ * @output: a parser output buffer
+ * @init: is this an initialization call without data
+ *
+ * Generic front-end for the encoding handler on parser output
+ * a first call with @init == 1 has to be made first to initiate the
+ * output in case of non-stateless encoding needing to initiate their
+ * state or the output (like the BOM in UTF16).
+ * In case of UTF8 sequence conversion errors for the given encoder,
+ * the content will be automatically remapped to a CharRef sequence.
+ *
+ * Data is read from @output->buffer (at most 64KB per pass) and the
+ * converted bytes are appended to @output->conv.
+ *
+ * Returns 0 for an initialization call or when there is nothing to
+ *     convert, otherwise the last status reported by the converter:
+ *     a non-negative value if success, or
+ *     -1 general error (NULL argument, no output function, lack of
+ *        space, or a character reference which could not be written)
+ *     -2 if the transcoding fails (for *in is not valid utf8 string or
+ *        the result of transformation can't fit into the encoding we want)
+ *     -3 if the iconv or ICU converter failed without producing output
+ */
+int
+xmlCharEncOutput(xmlOutputBufferPtr output, int init)
+{
+    int ret = -2;
+    size_t written;
+    size_t writtentot = 0;
+    size_t toconv;
+    int c_in;
+    int c_out;
+    xmlBufPtr in;
+    xmlBufPtr out;
+    int charref_len = 0;
+
+    if ((output == NULL) || (output->encoder == NULL) ||
+        (output->buffer == NULL) || (output->conv == NULL))
+        return (-1);
+    out = output->conv;
+    in = output->buffer;
+
+retry:
+
+    written = xmlBufAvail(out);
+    if (written > 0)
+        written--; /* count '\0' */
+
+    /*
+     * First specific handling of the initialization call
+     */
+    if (init) {
+        c_in = 0;
+        c_out = written;
+        if (output->encoder->output != NULL) {
+            ret = output->encoder->output(xmlBufEnd(out), &c_out,
+                                          NULL, &c_in);
+            if (ret > 0) /* Gennady: check return value */
+                xmlBufAddLen(out, c_out);
+        }
+#ifdef LIBXML_ICONV_ENABLED
+        else if (output->encoder->iconv_out != NULL) {
+            ret = xmlIconvWrapper(output->encoder->iconv_out, xmlBufEnd(out),
+                                  &c_out, NULL, &c_in);
+            xmlBufAddLen(out, c_out);
+        }
+#endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+        else if (output->encoder->uconv_out != NULL) {
+            ret = xmlUconvWrapper(output->encoder->uconv_out, 0, xmlBufEnd(out),
+                                  &c_out, NULL, &c_in);
+            xmlBufAddLen(out, c_out);
+        }
+#endif /* LIBXML_ICU_ENABLED */
+#ifdef DEBUG_ENCODING
+        xmlGenericError(xmlGenericErrorContext,
+                        "initialized encoder\n");
+#endif
+        return(0);
+    }
+
+    /*
+     * Conversion itself.
+     */
+    toconv = xmlBufUse(in);
+    if (toconv == 0)
+        return (0);
+    if (toconv > 64 * 1024)
+        toconv = 64 * 1024;
+    if (toconv * 4 >= written) {
+        xmlBufGrow(out, toconv * 4);
+        /*
+         * Keep one byte for the terminating '\0'. @written is a size_t,
+         * so a plain "- 1" would wrap around to a huge value if the
+         * buffer still has no room; only decrement when possible.
+         */
+        written = xmlBufAvail(out);
+        if (written > 0)
+            written--;
+    }
+    if (written > 256 * 1024)
+        written = 256 * 1024;
+
+    c_in = toconv;
+    c_out = written;
+    if (output->encoder->output != NULL) {
+        ret = output->encoder->output(xmlBufEnd(out), &c_out,
+                                      xmlBufContent(in), &c_in);
+        /* only consume the input when some output was produced */
+        if (c_out > 0) {
+            xmlBufShrink(in, c_in);
+            xmlBufAddLen(out, c_out);
+            writtentot += c_out;
+        }
+    }
+#ifdef LIBXML_ICONV_ENABLED
+    else if (output->encoder->iconv_out != NULL) {
+        ret = xmlIconvWrapper(output->encoder->iconv_out, xmlBufEnd(out),
+                              &c_out, xmlBufContent(in), &c_in);
+        xmlBufShrink(in, c_in);
+        xmlBufAddLen(out, c_out);
+        writtentot += c_out;
+        if (ret == -1) {
+            if (c_out > 0) {
+                /*
+                 * Can be a limitation of iconv
+                 */
+                charref_len = 0;
+                goto retry;
+            }
+            ret = -3;
+        }
+    }
+#endif /* LIBXML_ICONV_ENABLED */
+#ifdef LIBXML_ICU_ENABLED
+    else if (output->encoder->uconv_out != NULL) {
+        ret = xmlUconvWrapper(output->encoder->uconv_out, 0, xmlBufEnd(out),
+                              &c_out, xmlBufContent(in), &c_in);
+        xmlBufShrink(in, c_in);
+        xmlBufAddLen(out, c_out);
+        writtentot += c_out;
+        if (ret == -1) {
+            if (c_out > 0) {
+                /*
+                 * Can be a limitation of uconv
+                 */
+                charref_len = 0;
+                goto retry;
+            }
+            ret = -3;
+        }
+    }
+#endif /* LIBXML_ICU_ENABLED */
+    else {
+        xmlEncodingErr(XML_I18N_NO_OUTPUT,
+                       "xmlCharEncOutFunc: no output function !\n", NULL);
+        return(-1);
+    }
+
+    /*
+     * ret is a converter status, it must never be added to the @output
+     * pointer: @output is still dereferenced after a "goto retry".
+     */
+
+    /*
+     * Attempt to handle error cases
+     */
+    switch (ret) {
+        case 0:
+#ifdef DEBUG_ENCODING
+            xmlGenericError(xmlGenericErrorContext,
+                            "converted %d bytes to %d bytes of output\n",
+                            c_in, c_out);
+#endif
+            break;
+        case -1:
+#ifdef DEBUG_ENCODING
+            xmlGenericError(xmlGenericErrorContext,
+                            "output conversion failed by lack of space\n");
+#endif
+            break;
+        case -3:
+#ifdef DEBUG_ENCODING
+            xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n",
+                            c_in, c_out, (int) xmlBufUse(in));
+#endif
+            break;
+        case -2: {
+            int len = (int) xmlBufUse(in);
+            xmlChar *content = xmlBufContent(in);
+            int cur;
+
+            /* decode the character the converter stopped on */
+            cur = xmlGetUTF8Char(content, &len);
+            if ((charref_len != 0) && (c_out < charref_len)) {
+                /*
+                 * We attempted to insert a character reference and failed.
+                 * Undo what was written and skip the remaining charref.
+                 */
+                xmlBufErase(out, c_out);
+                writtentot -= c_out;
+                xmlBufShrink(in, charref_len - c_out);
+                charref_len = 0;
+
+                ret = -1;
+                break;
+            } else if (cur > 0) {
+                xmlChar charref[20];
+
+#ifdef DEBUG_ENCODING
+                xmlGenericError(xmlGenericErrorContext,
+                                "handling output conversion error\n");
+                xmlGenericError(xmlGenericErrorContext,
+                                "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
+                                content[0], content[1],
+                                content[2], content[3]);
+#endif
+                /*
+                 * Removes the UTF8 sequence, and replace it by a charref
+                 * and continue the transcoding phase, hoping the error
+                 * did not mangle the encoder state.
+                 */
+                charref_len = snprintf((char *) &charref[0], sizeof(charref),
+                                       "&#%d;", cur);
+                xmlBufShrink(in, len);
+                xmlBufAddHead(in, charref, -1);
+
+                goto retry;
+            } else {
+                /* not a valid UTF-8 sequence: report the leading bytes */
+                char buf[50];
+
+                snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X",
+                         content[0], content[1],
+                         content[2], content[3]);
+                buf[49] = 0;
+                xmlEncodingErr(XML_I18N_CONV_FAILED,
+                    "output conversion failed due to conv error, bytes %s\n",
+                               buf);
+                /* overwrite the offending byte unless the buffer is read-only */
+                if (xmlBufGetAllocationScheme(in) != XML_BUFFER_ALLOC_IMMUTABLE)
+                    content[0] = ' ';
+            }
+            break;
+        }
+    }
+    return(ret);
+}
+
+/**
* xmlCharEncOutFunc:
* @handler: char enconding transformation data structure
* @out: an xmlBuffer for the output.