diff options
author | Daniel Veillard <veillard@redhat.com> | 2015-06-30 11:36:28 +0800 |
---|---|---|
committer | Daniel Veillard <veillard@redhat.com> | 2015-06-30 11:36:28 +0800 |
commit | 140c251e8e5653572edcca91b9d675f871735cb4 (patch) | |
tree | 5e66db52be6659fc168497798109313025e360fa /HTMLparser.c | |
parent | 58b84e1f822d9323b4bbadeb07ee147cccc96e7e (diff) | |
download | android_external_libxml2-140c251e8e5653572edcca91b9d675f871735cb4.tar.gz android_external_libxml2-140c251e8e5653572edcca91b9d675f871735cb4.tar.bz2 android_external_libxml2-140c251e8e5653572edcca91b9d675f871735cb4.zip |
Recover unescaped less-than character in HTML recovery parsing
As pointed out by Christian Schoenebeck <schoenebeck@crudebyte.com>
on the list and based on some of his early patches, this preserves
content when opening angle brackets are left unescaped in
textual content like:
<p> a < b </p>
<p> a <0 </p>
<p> a <=0 </p>
while still reporting the error.
Diffstat (limited to 'HTMLparser.c')
-rw-r--r-- | HTMLparser.c | 33 |
1 files changed, 30 insertions, 3 deletions
diff --git a/HTMLparser.c b/HTMLparser.c index d329d3b5..19c10c3f 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -2948,8 +2948,9 @@ htmlParseScript(htmlParserCtxtPtr ctxt) { /** - * htmlParseCharData: + * htmlParseCharDataInternal: * @ctxt: an HTML parser context + * @readahead: optional read ahead character in ascii range * * parse a CharData section. * if we are within a CDATA section ']]>' marks an end of section. @@ -2958,12 +2959,15 @@ htmlParseScript(htmlParserCtxtPtr ctxt) { */ static void -htmlParseCharData(htmlParserCtxtPtr ctxt) { - xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; +htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) { + xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6]; int nbchar = 0; int cur, l; int chunk = 0; + if (readahead) + buf[nbchar++] = readahead; + SHRINK; cur = CUR_CHAR(l); while (((cur != '<') || (ctxt->token == '<')) && @@ -3043,6 +3047,21 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) { } /** + * htmlParseCharData: + * @ctxt: an HTML parser context + * + * parse a CharData section. + * if we are within a CDATA section ']]>' marks an end of section. + * + * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) + */ + +static void +htmlParseCharData(htmlParserCtxtPtr ctxt) { + htmlParseCharDataInternal(ctxt, 0); +} + +/** * htmlParseExternalID: * @ctxt: an HTML parser context * @publicID: a xmlChar** receiving PubidLiteral @@ -3690,6 +3709,14 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) { htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, "htmlParseStartTag: invalid element name\n", NULL, NULL); + /* if recover preserve text on classic misconstructs */ + if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') || + (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) { + htmlParseCharDataInternal(ctxt, '<'); + return(-1); + } + + /* Dump the bogus tag like browsers do */ while ((IS_CHAR_CH(CUR)) && (CUR != '>') && (ctxt->instate != XML_PARSER_EOF)) |