aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--ChangeLog7
-rw-r--r--parser.c104
-rw-r--r--result/errors/cdata.xml0
-rw-r--r--result/errors/cdata.xml.err4
-rw-r--r--result/errors/cdata.xml.str5
-rw-r--r--test/errors/cdata.xml2
6 files changed, 117 insertions, 5 deletions
diff --git a/ChangeLog b/ChangeLog
index 5e3208e0..ca85de07 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+Mon Jul 4 13:11:12 CEST 2005 Daniel Veillard <daniel@veillard.com>
+
+ * parser.c: fixed a bug failing to detect UTF-8 violations in
+ CData in push mode.
+ * result/errors/cdata.xml* test/errors/cdata.xml: added the test
+ to the regressions
+
Mon Jul 4 11:26:57 CEST 2005 Daniel Veillard <daniel@veillard.com>
* debugXML.c: added enhancement for #309057 in xmllint shell
diff --git a/parser.c b/parser.c
index ff863d9f..8a7540da 100644
--- a/parser.c
+++ b/parser.c
@@ -9153,6 +9153,73 @@ xmlParseGetLasts(xmlParserCtxtPtr ctxt, const xmlChar **lastlt,
}
}
/**
+ * xmlCheckCdataPush:
+ * @utf: pointer to the block of characters
+ * @len: length of the block in bytes
+ *
+ * Check that the block of characters is okay as CData content [20].
+ * A multi-byte sequence cut off by the end of the block is not an error:
+ * the returned count simply stops before it.
+ *
+ * Returns the number of bytes to pass if okay, the negated index where a
+ * UTF-8 error occurred otherwise (so an error at index 0 yields 0)
+ */
+static int
+xmlCheckCdataPush(const xmlChar *utf, int len) {
+    int ix;
+    unsigned char c;
+    int codepoint;
+
+    if ((utf == NULL) || (len <= 0))
+        return(0);
+
+    for (ix = 0; ix < len;) {      /* bounded by len, not a terminator */
+        c = utf[ix];
+        if ((c & 0x80) == 0x00) {   /* 1-byte code, starts with 0 */
+            if ((c >= 0x20) || (c == 0xA) || (c == 0xD) || (c == 0x9))
+                ix++;
+            else
+                return(-ix);
+        } else if ((c & 0xe0) == 0xc0) {/* 2-byte code, starts with 110 */
+            if (ix + 2 > len) return(ix);
+            if ((utf[ix+1] & 0xc0 ) != 0x80)
+                return(-ix);
+            codepoint = (utf[ix] & 0x1f) << 6;  /* decode at ix, not at 0 */
+            codepoint |= utf[ix+1] & 0x3f;
+            if (!xmlIsCharQ(codepoint))
+                return(-ix);
+            ix += 2;
+        } else if ((c & 0xf0) == 0xe0) {/* 3-byte code, starts with 1110 */
+            if (ix + 3 > len) return(ix);
+            if (((utf[ix+1] & 0xc0) != 0x80) ||
+                ((utf[ix+2] & 0xc0) != 0x80))
+                return(-ix);
+            codepoint = (utf[ix] & 0xf) << 12;
+            codepoint |= (utf[ix+1] & 0x3f) << 6;
+            codepoint |= utf[ix+2] & 0x3f;
+            if (!xmlIsCharQ(codepoint))
+                return(-ix);
+            ix += 3;
+        } else if ((c & 0xf8) == 0xf0) {/* 4-byte code, starts with 11110 */
+            if (ix + 4 > len) return(ix);
+            if (((utf[ix+1] & 0xc0) != 0x80) ||
+                ((utf[ix+2] & 0xc0) != 0x80) ||
+                ((utf[ix+3] & 0xc0) != 0x80))
+                return(-ix);
+            codepoint = (utf[ix] & 0x7) << 18;
+            codepoint |= (utf[ix+1] & 0x3f) << 12;
+            codepoint |= (utf[ix+2] & 0x3f) << 6;
+            codepoint |= utf[ix+3] & 0x3f;
+            if (!xmlIsCharQ(codepoint))
+                return(-ix);
+            ix += 4;
+        } else                      /* unknown encoding */
+            return(-ix);
+    }
+    return(ix);
+}
+
+/**
* xmlParseTryOrFinish:
* @ctxt: an XML parser context
* @terminate: last chunk indicator
@@ -9623,21 +9690,36 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
base = xmlParseLookupSequence(ctxt, ']', ']', '>');
if (base < 0) {
if (avail >= XML_PARSER_BIG_BUFFER_SIZE + 2) {
+ int tmp;
+
+ tmp = xmlCheckCdataPush(ctxt->input->cur,
+ XML_PARSER_BIG_BUFFER_SIZE);
+ if (tmp < 0) {
+ tmp = -tmp;
+ ctxt->input->cur += tmp;
+ goto encoding_error;
+ }
if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
if (ctxt->sax->cdataBlock != NULL)
ctxt->sax->cdataBlock(ctxt->userData,
- ctxt->input->cur,
- XML_PARSER_BIG_BUFFER_SIZE);
+ ctxt->input->cur, tmp);
else if (ctxt->sax->characters != NULL)
ctxt->sax->characters(ctxt->userData,
- ctxt->input->cur,
- XML_PARSER_BIG_BUFFER_SIZE);
+ ctxt->input->cur, tmp);
}
- SKIPL(XML_PARSER_BIG_BUFFER_SIZE);
+ SKIPL(tmp);
ctxt->checkIndex = 0;
}
goto done;
} else {
+ int tmp;
+
+ tmp = xmlCheckCdataPush(ctxt->input->cur, base);
+ if ((tmp < 0) || (tmp != base)) {
+ tmp = -tmp;
+ ctxt->input->cur += tmp;
+ goto encoding_error;
+ }
if ((ctxt->sax != NULL) && (base > 0) &&
(!ctxt->disableSAX)) {
if (ctxt->sax->cdataBlock != NULL)
@@ -10038,6 +10120,18 @@ done:
xmlGenericError(xmlGenericErrorContext, "PP: done %d\n", ret);
#endif
return(ret);
+encoding_error:
+ {
+ char buffer[150];
+
+ snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
+ ctxt->input->cur[0], ctxt->input->cur[1],
+ ctxt->input->cur[2], ctxt->input->cur[3]);
+ __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
+ "Input is not proper UTF-8, indicate encoding !\n%s",
+ BAD_CAST buffer, NULL);
+ }
+ return(0);
}
/**
diff --git a/result/errors/cdata.xml b/result/errors/cdata.xml
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/result/errors/cdata.xml
diff --git a/result/errors/cdata.xml.err b/result/errors/cdata.xml.err
new file mode 100644
index 00000000..f757963f
--- /dev/null
+++ b/result/errors/cdata.xml.err
@@ -0,0 +1,4 @@
+./test/errors/cdata.xml:2: parser error : Input is not proper UTF-8, indicate encoding !
+Bytes: 0xE1 0x72 0x5D 0x5D
+<A><![CDATA[Cár]]></A>
+ ^
diff --git a/result/errors/cdata.xml.str b/result/errors/cdata.xml.str
new file mode 100644
index 00000000..e043441a
--- /dev/null
+++ b/result/errors/cdata.xml.str
@@ -0,0 +1,5 @@
+./test/errors/cdata.xml:2: parser error : Input is not proper UTF-8, indicate encoding !
+Bytes: 0x5B 0x43 0xE1 0x72
+<A><![CDATA[Cár]]></A>
+ ^
+./test/errors/cdata.xml : failed to parse
diff --git a/test/errors/cdata.xml b/test/errors/cdata.xml
new file mode 100644
index 00000000..8f2bc09c
--- /dev/null
+++ b/test/errors/cdata.xml
@@ -0,0 +1,2 @@
+<?xml version="1.0" encoding="utf-8"?>
+<A><![CDATA[Cár]]></A>