diff options
-rw-r--r-- | ChangeLog | 7 | ||||
-rw-r--r-- | parser.c | 104 | ||||
-rw-r--r-- | result/errors/cdata.xml | 0 | ||||
-rw-r--r-- | result/errors/cdata.xml.err | 4 | ||||
-rw-r--r-- | result/errors/cdata.xml.str | 5 | ||||
-rw-r--r-- | test/errors/cdata.xml | 2 |
6 files changed, 117 insertions, 5 deletions
@@ -1,3 +1,10 @@
+Mon Jul 4 13:11:12 CEST 2005 Daniel Veillard <daniel@veillard.com>
+
+	* parser.c: fixed a bug failing to detect UTF-8 violations in
+	  CData in push mode.
+	* result/errors/cdata.xml* test/errors/cdata.xml: added the test
+	  to the regressions
+
 Mon Jul 4 11:26:57 CEST 2005 Daniel Veillard <daniel@veillard.com>
 
 	* debugXML.c: added enhancement for #309057 in xmllint shell
@@ -9153,6 +9153,73 @@ xmlParseGetLasts(xmlParserCtxtPtr ctxt, const xmlChar **lastlt,
     }
 }
 /**
+ * xmlCheckCdataPush:
+ * @utf:  pointer to the block of characters
+ * @len:  length of the block in bytes
+ *
+ * Check that the block of characters is okay as CData content [20]
+ *
+ * Returns the number of bytes to pass if okay (short of @len when the last
+ *         sequence is truncated), the negated index of the bad byte otherwise
+ */
+static int
+xmlCheckCdataPush(const xmlChar *utf, int len) {
+    int ix;
+    unsigned char c;
+    int codepoint;
+
+    if ((utf == NULL) || (len <= 0))
+        return(0);
+
+    for (ix = 0; ix < len;) {      /* bounded by len, no terminator needed */
+        c = utf[ix];
+        if ((c & 0x80) == 0x00) {       /* 1-byte code, starts with 0 */
+            if (c >= 0x20)
+                ix++;
+            else if ((c == 0xA) || (c == 0xD) || (c == 0x9))
+                ix++;
+            else
+                return(-ix);            /* NB: this is 0 when ix is 0 */
+        } else if ((c & 0xe0) == 0xc0) {/* 2-byte code, starts with 110 */
+            if (ix + 2 > len) return(ix);   /* sequence cut by end of block */
+            if ((utf[ix+1] & 0xc0 ) != 0x80)
+                return(-ix);
+            codepoint = (utf[ix] & 0x1f) << 6;  /* decode at ix, not at 0 */
+            codepoint |= utf[ix+1] & 0x3f;
+            if (!xmlIsCharQ(codepoint))
+                return(-ix);
+            ix += 2;
+        } else if ((c & 0xf0) == 0xe0) {/* 3-byte code, starts with 1110 */
+            if (ix + 3 > len) return(ix);   /* sequence cut by end of block */
+            if (((utf[ix+1] & 0xc0) != 0x80) ||
+                ((utf[ix+2] & 0xc0) != 0x80))
+                    return(-ix);
+            codepoint = (utf[ix] & 0xf) << 12;
+            codepoint |= (utf[ix+1] & 0x3f) << 6;
+            codepoint |= utf[ix+2] & 0x3f;
+            if (!xmlIsCharQ(codepoint))
+                return(-ix);
+            ix += 3;
+        } else if ((c & 0xf8) == 0xf0) {/* 4-byte code, starts with 11110 */
+            if (ix + 4 > len) return(ix);   /* sequence cut by end of block */
+            if (((utf[ix+1] & 0xc0) != 0x80) ||
+                ((utf[ix+2] & 0xc0) != 0x80) ||
+                ((utf[ix+3] & 0xc0) != 0x80))
+                    return(-ix);
+            codepoint = (utf[ix] & 0x7) << 18;
+            codepoint |= (utf[ix+1] & 0x3f) << 12;
+            codepoint |= (utf[ix+2] & 0x3f) << 6;
+            codepoint |= utf[ix+3] & 0x3f;
+            if (!xmlIsCharQ(codepoint))
+                return(-ix);
+            ix += 4;
+        } else                          /* unknown encoding */
+            return(-ix);
+      }
+      return(ix);
+}
+
+/**
  * xmlParseTryOrFinish:
  * @ctxt:  an XML parser context
  * @terminate:  last chunk indicator
@@ -9623,21 +9690,36 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
                 base = xmlParseLookupSequence(ctxt, ']', ']', '>');
                 if (base < 0) {
                     if (avail >= XML_PARSER_BIG_BUFFER_SIZE + 2) {
+                        int tmp;
+
+                        tmp = xmlCheckCdataPush(ctxt->input->cur,
+                                                XML_PARSER_BIG_BUFFER_SIZE);
+                        if (tmp < 0) {
+                            tmp = -tmp;
+                            ctxt->input->cur += tmp;
+                            goto encoding_error;
+                        }
                         if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
                             if (ctxt->sax->cdataBlock != NULL)
                                 ctxt->sax->cdataBlock(ctxt->userData,
-                                                      ctxt->input->cur,
-                                          XML_PARSER_BIG_BUFFER_SIZE);
+                                                      ctxt->input->cur, tmp);
                             else if (ctxt->sax->characters != NULL)
                                 ctxt->sax->characters(ctxt->userData,
-                                                      ctxt->input->cur,
-                                          XML_PARSER_BIG_BUFFER_SIZE);
+                                                      ctxt->input->cur, tmp);
                         }
-                        SKIPL(XML_PARSER_BIG_BUFFER_SIZE);
+                        SKIPL(tmp);
                         ctxt->checkIndex = 0;
                     }
                     goto done;
                 } else {
+                    int tmp;
+
+                    tmp = xmlCheckCdataPush(ctxt->input->cur, base);
+                    if ((tmp < 0) || (tmp != base)) {
+                        tmp = -tmp;
+                        ctxt->input->cur += tmp;
+                        goto encoding_error;
+                    }
                     if ((ctxt->sax != NULL) && (base > 0) &&
                         (!ctxt->disableSAX)) {
                         if (ctxt->sax->cdataBlock != NULL)
@@ -10038,6 +10120,18 @@ done:
     xmlGenericError(xmlGenericErrorContext, "PP: done %d\n", ret);
 #endif
     return(ret);
+encoding_error:
+    {
+        char buffer[150];
+
+        snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
+                        ctxt->input->cur[0], ctxt->input->cur[1],
+                        ctxt->input->cur[2], ctxt->input->cur[3]);
+        __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
+                     "Input is not proper UTF-8, indicate encoding !\n%s",
+                     BAD_CAST buffer, NULL);
+    }
+    return(0);
 }
 
 /**
diff --git a/result/errors/cdata.xml 
b/result/errors/cdata.xml new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/result/errors/cdata.xml diff --git a/result/errors/cdata.xml.err b/result/errors/cdata.xml.err new file mode 100644 index 00000000..f757963f --- /dev/null +++ b/result/errors/cdata.xml.err @@ -0,0 +1,4 @@ +./test/errors/cdata.xml:2: parser error : Input is not proper UTF-8, indicate encoding ! +Bytes: 0xE1 0x72 0x5D 0x5D +<A><![CDATA[Cár]]></A> + ^ diff --git a/result/errors/cdata.xml.str b/result/errors/cdata.xml.str new file mode 100644 index 00000000..e043441a --- /dev/null +++ b/result/errors/cdata.xml.str @@ -0,0 +1,5 @@ +./test/errors/cdata.xml:2: parser error : Input is not proper UTF-8, indicate encoding ! +Bytes: 0x5B 0x43 0xE1 0x72 +<A><![CDATA[Cár]]></A> + ^ +./test/errors/cdata.xml : failed to parse diff --git a/test/errors/cdata.xml b/test/errors/cdata.xml new file mode 100644 index 00000000..8f2bc09c --- /dev/null +++ b/test/errors/cdata.xml @@ -0,0 +1,2 @@ +<?xml version="1.0" encoding="utf-8"?> +<A><![CDATA[Cár]]></A> |