Recover unescaped less-than character in HTML recovery parsing

As pointed out by Christian Schoenebeck <schoenebeck@crudebyte.com> on the list, and based on some of his early patches, this preserves content when opening angle brackets are left unescaped in textual content like: <p> a < b </p> <p> a <0 </p> <p> a <=0 </p> while still reporting the error.
author: Daniel Veillard <veillard@redhat.com> 2015-06-30 11:36:28 +0800
committer: Daniel Veillard <veillard@redhat.com> 2015-06-30 11:36:28 +0800
commit: 140c251e8e5653572edcca91b9d675f871735cb4 (patch)
tree: 5e66db52be6659fc168497798109313025e360fa /HTMLparser.c
parent: 58b84e1f822d9323b4bbadeb07ee147cccc96e7e (diff)
download: android_external_libxml2-140c251e8e5653572edcca91b9d675f871735cb4.tar.gz
android_external_libxml2-140c251e8e5653572edcca91b9d675f871735cb4.tar.bz2
android_external_libxml2-140c251e8e5653572edcca91b9d675f871735cb4.zip
1 files changed, 30 insertions, 3 deletions
diff --git a/HTMLparser.c b/HTMLparser.c
index d329d3b5..19c10c3f 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -2948,8 +2948,9 @@ htmlParseScript(htmlParserCtxtPtr ctxt) {
 
 
 /**
- * htmlParseCharData:
+ * htmlParseCharDataInternal:
  * @ctxt:  an HTML parser context
+ * @readahead: optional read ahead character in ascii range
  *
  * parse a CharData section.
  * if we are within a CDATA section ']]>' marks an end of section.
@@ -2958,12 +2959,15 @@ htmlParseScript(htmlParserCtxtPtr ctxt) {
  */
 
 static void
-htmlParseCharData(htmlParserCtxtPtr ctxt) {
-    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
+htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
+    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
     int nbchar = 0;
     int cur, l;
     int chunk = 0;
 
+    if (readahead)
+        buf[nbchar++] = readahead;
+
     SHRINK;
     cur = CUR_CHAR(l);
     while (((cur != '<') || (ctxt->token == '<')) &&
@@ -3043,6 +3047,21 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
 }
 
 /**
+ * htmlParseCharData:
+ * @ctxt:  an HTML parser context
+ *
+ * parse a CharData section.
+ * if we are within a CDATA section ']]>' marks an end of section.
+ *
+ * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
+ */
+
+static void
+htmlParseCharData(htmlParserCtxtPtr ctxt) {
+    htmlParseCharDataInternal(ctxt, 0);
+}
+
+/**
  * htmlParseExternalID:
  * @ctxt:  an HTML parser context
  * @publicID:  a xmlChar** receiving PubidLiteral
@@ -3690,6 +3709,14 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
 	             "htmlParseStartTag: invalid element name\n",
 		     NULL, NULL);
+	/* if recover preserve text on classic misconstructs */
+	if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
+	    (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
+	    htmlParseCharDataInternal(ctxt, '<');
+	    return(-1);
+	}
+
+
 	/* Dump the bogus tag like browsers do */
 	while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
                (ctxt->instate != XML_PARSER_EOF))
author	Daniel Veillard <veillard@redhat.com>	2015-06-30 11:36:28 +0800
committer	Daniel Veillard <veillard@redhat.com>	2015-06-30 11:36:28 +0800
commit	140c251e8e5653572edcca91b9d675f871735cb4 (patch)
tree	5e66db52be6659fc168497798109313025e360fa /HTMLparser.c
parent	58b84e1f822d9323b4bbadeb07ee147cccc96e7e (diff)
download	android_external_libxml2-140c251e8e5653572edcca91b9d675f871735cb4.tar.gz android_external_libxml2-140c251e8e5653572edcca91b9d675f871735cb4.tar.bz2 android_external_libxml2-140c251e8e5653572edcca91b9d675f871735cb4.zip