aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDaniel Veillard <veillard@redhat.com>2015-06-30 11:36:28 +0800
committerDaniel Veillard <veillard@redhat.com>2015-06-30 11:36:28 +0800
commit140c251e8e5653572edcca91b9d675f871735cb4 (patch)
tree5e66db52be6659fc168497798109313025e360fa
parent58b84e1f822d9323b4bbadeb07ee147cccc96e7e (diff)
downloadandroid_external_libxml2-140c251e8e5653572edcca91b9d675f871735cb4.tar.gz
android_external_libxml2-140c251e8e5653572edcca91b9d675f871735cb4.tar.bz2
android_external_libxml2-140c251e8e5653572edcca91b9d675f871735cb4.zip
Recover unescaped less-than character in HTML recovery parsing
As pointed by Christian Schoenebeck <schoenebeck@crudebyte.com> on the list and based on some of his early patches, this preserve content when unescaped opening angle brackets are not escaped in textual content like: <p> a < b </p> <p> a <0 </p> <p> a <=0 </p> while still reporting the error.
-rw-r--r--HTMLparser.c33
1 files changed, 30 insertions, 3 deletions
diff --git a/HTMLparser.c b/HTMLparser.c
index d329d3b5..19c10c3f 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -2948,8 +2948,9 @@ htmlParseScript(htmlParserCtxtPtr ctxt) {
/**
- * htmlParseCharData:
+ * htmlParseCharDataInternal:
* @ctxt: an HTML parser context
+ * @readahead: optional read ahead character in ascii range
*
* parse a CharData section.
* if we are within a CDATA section ']]>' marks an end of section.
@@ -2958,12 +2959,15 @@ htmlParseScript(htmlParserCtxtPtr ctxt) {
*/
static void
-htmlParseCharData(htmlParserCtxtPtr ctxt) {
- xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
+htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
+ xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
int nbchar = 0;
int cur, l;
int chunk = 0;
+ if (readahead)
+ buf[nbchar++] = readahead;
+
SHRINK;
cur = CUR_CHAR(l);
while (((cur != '<') || (ctxt->token == '<')) &&
@@ -3043,6 +3047,21 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
}
/**
+ * htmlParseCharData:
+ * @ctxt: an HTML parser context
+ *
+ * parse a CharData section.
+ * if we are within a CDATA section ']]>' marks an end of section.
+ *
+ * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
+ */
+
+static void
+htmlParseCharData(htmlParserCtxtPtr ctxt) {
+ htmlParseCharDataInternal(ctxt, 0);
+}
+
+/**
* htmlParseExternalID:
* @ctxt: an HTML parser context
* @publicID: a xmlChar** receiving PubidLiteral
@@ -3690,6 +3709,14 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
"htmlParseStartTag: invalid element name\n",
NULL, NULL);
+ /* if recover preserve text on classic misconstructs */
+ if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
+ (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
+ htmlParseCharDataInternal(ctxt, '<');
+ return(-1);
+ }
+
+
/* Dump the bogus tag like browsers do */
while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
(ctxt->instate != XML_PARSER_EOF))