diff options
author | Denis Pauk <pauk.denis@gmail.com> | 2012-05-10 15:34:57 +0800 |
---|---|---|
committer | Daniel Veillard <veillard@redhat.com> | 2012-05-10 15:34:57 +0800 |
commit | 868d92da8915fc5dc5e329d93cc7882370a28475 (patch) | |
tree | 4b39353761e2137b6adce484378b4434c223ef74 | |
parent | 1eabc31401b7b8c3b5273993778f37eeef37a055 (diff) | |
download | android_external_libxml2-868d92da8915fc5dc5e329d93cc7882370a28475.tar.gz android_external_libxml2-868d92da8915fc5dc5e329d93cc7882370a28475.tar.bz2 android_external_libxml2-868d92da8915fc5dc5e329d93cc7882370a28475.zip |
Add HTML parser support for HTML5 meta charset encoding declaration
For https://bugzilla.gnome.org/show_bug.cgi?id=655218
http://www.w3.org/TR/2011/WD-html5-20110525/semantics.html#the-meta-element
"""
The charset attribute specifies the character encoding used by the document.
This is a character encoding declaration. If the attribute is present in an XML
document, its value must be an ASCII case-insensitive match for the string
"UTF-8" (and the document is therefore forced to use UTF-8 as its
encoding).
"""
However, while <meta http-equiv="Content-Type" content="text/html;
charset=utf8"> works, <meta charset="utf8"> does not.
While the libxml2 HTML parser is not tuned for HTML5, this is a simple
addition.
Also added a test case.
-rw-r--r-- | .gitignore | 3 | ||||
-rw-r--r-- | HTMLparser.c | 53 | ||||
-rw-r--r-- | result/HTML/html5_enc.html | 7 | ||||
-rw-r--r-- | result/HTML/html5_enc.html.err | 0 | ||||
-rw-r--r-- | result/HTML/html5_enc.html.sax | 30 | ||||
-rw-r--r-- | test/HTML/html5_enc.html | 8 |
6 files changed, 84 insertions, 17 deletions
@@ -1,9 +1,6 @@ *.o *.lo -*.xml *.log -*.rng -*.html *.patch .deps .libs diff --git a/HTMLparser.c b/HTMLparser.c index 5580b18e..2eb3fb49 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -727,7 +727,7 @@ static const char* const map_contents[] = { BLOCK, "area", NULL } ; static const char* const name_attr[] = { "name", NULL } ; static const char* const action_attr[] = { "action", NULL } ; static const char* const blockli_elt[] = { BLOCK, "li", NULL } ; -static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ; +static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ; static const char* const content_attr[] = { "content", NULL } ; static const char* const type_attr[] = { "type", NULL } ; static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ; @@ -3435,20 +3435,19 @@ htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) { } /** - * htmlCheckEncoding: + * htmlCheckEncodingDirect: * @ctxt: an HTML parser context * @attvalue: the attribute value * - * Checks an http-equiv attribute from a Meta tag to detect + * Checks an attribute value to detect * the encoding * If a new encoding is detected the parser is switched to decode * it and pass UTF8 */ static void -htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) { - const xmlChar *encoding; +htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) { - if ((ctxt == NULL) || (attvalue == NULL) || + if ((ctxt == NULL) || (encoding == NULL) || (ctxt->options & HTML_PARSE_IGNORE_ENC)) return; @@ -3456,14 +3455,6 @@ htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) { if (ctxt->input->encoding != NULL) return; - encoding = xmlStrcasestr(attvalue, BAD_CAST"charset="); - if (encoding != NULL) { - encoding += 8; - } else { - encoding = xmlStrcasestr(attvalue, BAD_CAST"charset ="); - if (encoding != NULL) - encoding += 9; - } if (encoding != NULL) { xmlCharEncoding enc; 
xmlCharEncodingHandlerPtr handler; @@ -3536,6 +3527,38 @@ htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) { } /** + * htmlCheckEncoding: + * @ctxt: an HTML parser context + * @attvalue: the attribute value + * + * Checks an http-equiv attribute from a Meta tag to detect + * the encoding + * If a new encoding is detected the parser is switched to decode + * it and pass UTF8 + */ +static void +htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) { + const xmlChar *encoding; + + if (!attvalue) + return; + + encoding = xmlStrcasestr(attvalue, BAD_CAST"charset"); + if (encoding != NULL) { + encoding += 7; + } + /* + * skip blank + */ + if (encoding && IS_BLANK_CH(*encoding)) + encoding = xmlStrcasestr(attvalue, BAD_CAST"="); + if (encoding && *encoding == '=') { + encoding ++; + htmlCheckEncodingDirect(ctxt, encoding); + } +} + +/** * htmlCheckMeta: * @ctxt: an HTML parser context * @atts: the attributes values @@ -3559,6 +3582,8 @@ htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) { if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv")) && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) http = 1; + else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset"))) + htmlCheckEncodingDirect(ctxt, value); else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content"))) content = value; att = atts[i++]; diff --git a/result/HTML/html5_enc.html b/result/HTML/html5_enc.html new file mode 100644 index 00000000..596d54d7 --- /dev/null +++ b/result/HTML/html5_enc.html @@ -0,0 +1,7 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd"> +<html> +<head><meta charset="iso-8859-1"></head> +<body> + <p>très</p> +</body> +</html> diff --git a/result/HTML/html5_enc.html.err b/result/HTML/html5_enc.html.err new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/result/HTML/html5_enc.html.err diff --git a/result/HTML/html5_enc.html.sax 
b/result/HTML/html5_enc.html.sax new file mode 100644 index 00000000..292be575 --- /dev/null +++ b/result/HTML/html5_enc.html.sax @@ -0,0 +1,30 @@ +SAX.setDocumentLocator() +SAX.startDocument() +SAX.startElement(html) +SAX.ignorableWhitespace( +, 1) +SAX.startElement(head) +SAX.ignorableWhitespace( +, 1) +SAX.startElement(meta, charset='iso-8859-1') +SAX.endElement(meta) +SAX.ignorableWhitespace( +, 1) +SAX.endElement(head) +SAX.ignorableWhitespace( +, 1) +SAX.startElement(body) +SAX.characters( + , 3) +SAX.startElement(p) +SAX.characters(très, 5) +SAX.endElement(p) +SAX.characters( +, 1) +SAX.endElement(body) +SAX.ignorableWhitespace( +, 1) +SAX.endElement(html) +SAX.ignorableWhitespace( +, 1) +SAX.endDocument() diff --git a/test/HTML/html5_enc.html b/test/HTML/html5_enc.html new file mode 100644 index 00000000..3ebf491f --- /dev/null +++ b/test/HTML/html5_enc.html @@ -0,0 +1,8 @@ +<html> +<head> +<meta charset="iso-8859-1"/> +</head> +<body> + <p>très</p> +</body> +</html> |