diff options
Diffstat (limited to 'HTMLparser.c')
| -rw-r--r-- | HTMLparser.c | 682 |
1 files changed, 320 insertions, 362 deletions
diff --git a/HTMLparser.c b/HTMLparser.c index 9e60e27e..b9812985 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -69,7 +69,7 @@ static void htmlParseComment(htmlParserCtxtPtr ctxt); /** * htmlErrMemory: * @ctxt: an HTML parser context - * @extra: extra informations + * @extra: extra information * * Handle a redefinition of attribute error */ @@ -317,7 +317,7 @@ htmlNodeInfoPop(htmlParserCtxtPtr ctxt) #define SKIP_BLANKS htmlSkipBlankChars(ctxt) -/* Inported from XML */ +/* Imported from XML */ /* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */ #define CUR ((int) (*ctxt->input->cur)) @@ -414,6 +414,10 @@ htmlFindEncoding(xmlParserCtxtPtr ctxt) { static int htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { + const unsigned char *cur; + unsigned char c; + unsigned int val; + if (ctxt->instate == XML_PARSER_EOF) return(0); @@ -421,99 +425,29 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { *len = 0; return(ctxt->token); } - if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { - /* - * We are supposed to handle UTF8, check it's valid - * From rfc2044: encoding of the Unicode values on UTF-8: - * - * UCS-4 range (hex.) UTF-8 octet sequence (binary) - * 0000 0000-0000 007F 0xxxxxxx - * 0000 0080-0000 07FF 110xxxxx 10xxxxxx - * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx - * - * Check for the 0x110000 limit too - */ - const unsigned char *cur = ctxt->input->cur; - unsigned char c; - unsigned int val; - - c = *cur; - if (c & 0x80) { - if (cur[1] == 0) { - xmlParserInputGrow(ctxt->input, INPUT_CHUNK); - cur = ctxt->input->cur; - } - if ((cur[1] & 0xc0) != 0x80) - goto encoding_error; - if ((c & 0xe0) == 0xe0) { + if (ctxt->charset != XML_CHAR_ENCODING_UTF8) { + xmlChar * guess; + xmlCharEncodingHandlerPtr handler; - if (cur[2] == 0) { - xmlParserInputGrow(ctxt->input, INPUT_CHUNK); - cur = ctxt->input->cur; - } - if ((cur[2] & 0xc0) != 0x80) - goto encoding_error; - if ((c & 0xf0) == 0xf0) { - if (cur[3] == 0) { - xmlParserInputGrow(ctxt->input, INPUT_CHUNK); - cur = ctxt->input->cur; - } - if (((c & 0xf8) != 0xf0) || - ((cur[3] & 0xc0) != 0x80)) - goto encoding_error; - /* 4-byte code */ - *len = 4; - val = (cur[0] & 0x7) << 18; - val |= (cur[1] & 0x3f) << 12; - val |= (cur[2] & 0x3f) << 6; - val |= cur[3] & 0x3f; - } else { - /* 3-byte code */ - *len = 3; - val = (cur[0] & 0xf) << 12; - val |= (cur[1] & 0x3f) << 6; - val |= cur[2] & 0x3f; - } - } else { - /* 2-byte code */ - *len = 2; - val = (cur[0] & 0x1f) << 6; - val |= cur[1] & 0x3f; - } - if (!IS_CHAR(val)) { - htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, - "Char 0x%X out of allowed range\n", val); - } - return(val); - } else { + /* + * Assume it's a fixed length encoding (1) with + * a compatible encoding for the ASCII set, since + * HTML constructs only use < 128 chars + */ + if ((int) *ctxt->input->cur < 0x80) { + *len = 1; if ((*ctxt->input->cur == 0) && (ctxt->input->cur < ctxt->input->end)) { - htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, - "Char 0x%X out of allowed range\n", 0); - *len = 1; + htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, + "Char 0x%X out of allowed range\n", 0); return(' '); } - /* 1-byte code */ - *len = 1; - return((int) *ctxt->input->cur); - } - } - /* - * Assume it's a fixed length encoding (1) with - * a compatible encoding for the ASCII set, since - * XML constructs only use < 128 chars - */ - *len = 1; - if ((int) *ctxt->input->cur < 0x80) - return((int) *ctxt->input->cur); - - /* - * Humm this is bad, do an automatic flow conversion - */ - { - xmlChar * guess; - xmlCharEncodingHandlerPtr handler; + return((int) *ctxt->input->cur); + } + /* + * Humm this is bad, do an automatic flow conversion + */ guess = htmlFindEncoding(ctxt); if (guess == NULL) { xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); @@ -532,12 +466,91 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { ctxt->charset = XML_CHAR_ENCODING_UTF8; } - return(xmlCurrentChar(ctxt, len)); + /* + * We are supposed to handle UTF8, check it's valid + * From rfc2044: encoding of the Unicode values on UTF-8: + * + * UCS-4 range (hex.) UTF-8 octet sequence (binary) + * 0000 0000-0000 007F 0xxxxxxx + * 0000 0080-0000 07FF 110xxxxx 10xxxxxx + * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx + * + * Check for the 0x110000 limit too + */ + cur = ctxt->input->cur; + c = *cur; + if (c & 0x80) { + if ((c & 0x40) == 0) + goto encoding_error; + if (cur[1] == 0) { + xmlParserInputGrow(ctxt->input, INPUT_CHUNK); + cur = ctxt->input->cur; + } + if ((cur[1] & 0xc0) != 0x80) + goto encoding_error; + if ((c & 0xe0) == 0xe0) { + + if (cur[2] == 0) { + xmlParserInputGrow(ctxt->input, INPUT_CHUNK); + cur = ctxt->input->cur; + } + if ((cur[2] & 0xc0) != 0x80) + goto encoding_error; + if ((c & 0xf0) == 0xf0) { + if (cur[3] == 0) { + xmlParserInputGrow(ctxt->input, INPUT_CHUNK); + cur = ctxt->input->cur; + } + if (((c & 0xf8) != 0xf0) || + ((cur[3] & 0xc0) != 0x80)) + goto encoding_error; + /* 4-byte code */ + *len = 4; + val = (cur[0] & 0x7) << 18; + val |= (cur[1] & 0x3f) << 12; + val |= (cur[2] & 0x3f) << 6; + val |= cur[3] & 0x3f; + if (val < 0x10000) + goto encoding_error; + } else { + /* 3-byte code */ + *len = 3; + val = (cur[0] & 0xf) << 12; + val |= (cur[1] & 0x3f) << 6; + val |= cur[2] & 0x3f; + if (val < 0x800) + goto encoding_error; + } + } else { + /* 2-byte code */ + *len = 2; + val = (cur[0] & 0x1f) << 6; + val |= cur[1] & 0x3f; + if (val < 0x80) + goto encoding_error; + } + if (!IS_CHAR(val)) { + htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, + "Char 0x%X out of allowed range\n", val); + } + return(val); + } else { + if ((*ctxt->input->cur == 0) && + (ctxt->input->cur < ctxt->input->end)) { + htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, + "Char 0x%X out of allowed range\n", 0); + *len = 1; + return(' '); + } + /* 1-byte code */ + *len = 1; + return((int) *ctxt->input->cur); + } encoding_error: /* * If we detect an UTF8 error that probably mean that the - * input encoding didn't get properly advertized in the + * input encoding didn't get properly advertised in the * declaration header. Report the error and switch the encoding * to ISO-Latin-1 (if you don't like this policy, just declare the * encoding !) @@ -602,8 +615,8 @@ htmlSkipBlankChars(xmlParserCtxtPtr ctxt) { ************************************************************************/ /* - * Start Tag: 1 means the start tag can be ommited - * End Tag: 1 means the end tag can be ommited + * Start Tag: 1 means the start tag can be omitted + * End Tag: 1 means the end tag can be omitted * 2 means it's forbidden (empty elements) * 3 means the tag is stylistic and should be closed easily * Depr: this element is deprecated @@ -1342,7 +1355,7 @@ htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) if (xmlStrEqual(newtag, ctxt->nameTab[i])) break; /* - * A missplaced endtag can only close elements with lower + * A misplaced endtag can only close elements with lower * or equal priority, so if we find an element with higher * priority before we find an element with * matching name, we just ignore this endtag @@ -2176,6 +2189,7 @@ htmlEncodeEntities(unsigned char* out, int *outlen, * * ************************************************************************/ +#ifdef LIBXML_PUSH_ENABLED /** * htmlNewInputStream: * @ctxt: an HTML parser context @@ -2207,6 +2221,7 @@ htmlNewInputStream(htmlParserCtxtPtr ctxt) { input->length = 0; return(input); } +#endif /************************************************************************ @@ -2216,9 +2231,9 @@ htmlNewInputStream(htmlParserCtxtPtr ctxt) { ************************************************************************/ /* * all tags allowing pc data from the html 4.01 loose dtd - * NOTE: it might be more apropriate to integrate this information + * NOTE: it might be more appropriate to integrate this information * into the html40ElementTable array but I don't want to risk any - * binary incomptibility + * binary incompatibility */ static const char *allowPCData[] = { "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big", @@ -2787,47 +2802,39 @@ htmlParseAttValue(htmlParserCtxtPtr ctxt) { static xmlChar * htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) { size_t len = 0, startPosition = 0; + int err = 0; + int quote; xmlChar *ret = NULL; - if (CUR == '"') { - NEXT; + if ((CUR != '"') && (CUR != '\'')) { + htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, + "SystemLiteral \" or ' expected\n", NULL, NULL); + return(NULL); + } + quote = CUR; + NEXT; - if (CUR_PTR < BASE_PTR) - return(ret); - startPosition = CUR_PTR - BASE_PTR; + if (CUR_PTR < BASE_PTR) + return(ret); + startPosition = CUR_PTR - BASE_PTR; - while ((IS_CHAR_CH(CUR)) && (CUR != '"')) { - NEXT; - len++; - } - if (!IS_CHAR_CH(CUR)) { - htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, - "Unfinished SystemLiteral\n", NULL, NULL); - } else { - ret = xmlStrndup((BASE_PTR+startPosition), len); - NEXT; + while ((CUR != 0) && (CUR != quote)) { + /* TODO: Handle UTF-8 */ + if (!IS_CHAR_CH(CUR)) { + htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, + "Invalid char in SystemLiteral 0x%X\n", CUR); + err = 1; } - } else if (CUR == '\'') { NEXT; - - if (CUR_PTR < BASE_PTR) - return(ret); - startPosition = CUR_PTR - BASE_PTR; - - while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) { - NEXT; - len++; - } - if (!IS_CHAR_CH(CUR)) { - htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, - "Unfinished SystemLiteral\n", NULL, NULL); - } else { - ret = xmlStrndup((BASE_PTR+startPosition), len); - NEXT; - } + len++; + } + if (CUR != quote) { + htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, + "Unfinished SystemLiteral\n", NULL, NULL); } else { - htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, - " or ' expected\n", NULL, NULL); + NEXT; + if (err == 0) + ret = xmlStrndup((BASE_PTR+startPosition), len); } return(ret); @@ -2847,51 +2854,42 @@ htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) { static xmlChar * htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) { size_t len = 0, startPosition = 0; + int err = 0; + int quote; xmlChar *ret = NULL; + + if ((CUR != '"') && (CUR != '\'')) { + htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, + "PubidLiteral \" or ' expected\n", NULL, NULL); + return(NULL); + } + quote = CUR; + NEXT; + /* * Name ::= (Letter | '_') (NameChar)* */ - if (CUR == '"') { - NEXT; - - if (CUR_PTR < BASE_PTR) - return(ret); - startPosition = CUR_PTR - BASE_PTR; - - while (IS_PUBIDCHAR_CH(CUR)) { - len++; - NEXT; + if (CUR_PTR < BASE_PTR) + return(ret); + startPosition = CUR_PTR - BASE_PTR; + + while ((CUR != 0) && (CUR != quote)) { + if (!IS_PUBIDCHAR_CH(CUR)) { + htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, + "Invalid char in PubidLiteral 0x%X\n", CUR); + err = 1; } - - if (CUR != '"') { - htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, - "Unfinished PubidLiteral\n", NULL, NULL); - } else { - ret = xmlStrndup((BASE_PTR + startPosition), len); - NEXT; - } - } else if (CUR == '\'') { + len++; NEXT; + } - if (CUR_PTR < BASE_PTR) - return(ret); - startPosition = CUR_PTR - BASE_PTR; - - while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')){ - len++; - NEXT; - } - - if (CUR != '\'') { - htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, - "Unfinished PubidLiteral\n", NULL, NULL); - } else { - ret = xmlStrndup((BASE_PTR + startPosition), len); - NEXT; - } + if (CUR != '"') { + htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED, + "Unfinished PubidLiteral\n", NULL, NULL); } else { - htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED, - "PubidLiteral \" or ' expected\n", NULL, NULL); + NEXT; + if (err == 0) + ret = xmlStrndup((BASE_PTR + startPosition), len); } return(ret); @@ -2926,7 +2924,7 @@ htmlParseScript(htmlParserCtxtPtr ctxt) { SHRINK; cur = CUR_CHAR(l); - while (IS_CHAR_CH(cur)) { + while (cur != 0) { if ((cur == '<') && (NXT(1) == '/')) { /* * One should break here, the specification is clear: @@ -2957,8 +2955,14 @@ htmlParseScript(htmlParserCtxtPtr ctxt) { } } } - COPY_BUF(l,buf,nbchar,cur); + if (IS_CHAR(cur)) { + COPY_BUF(l,buf,nbchar,cur); + } else { + htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, + "Invalid char in CDATA 0x%X\n", cur); + } if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { + buf[nbchar] = 0; if (ctxt->sax->cdataBlock!= NULL) { /* * Insert as CDATA, which is the same as HTML_PRESERVE_NODE @@ -2974,15 +2978,8 @@ htmlParseScript(htmlParserCtxtPtr ctxt) { cur = CUR_CHAR(l); } - if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) { - htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, - "Invalid char in CDATA 0x%X\n", cur); - if (ctxt->input->cur < ctxt->input->end) { - NEXT; - } - } - if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) { + buf[nbchar] = 0; if (ctxt->sax->cdataBlock!= NULL) { /* * Insert as CDATA, which is the same as HTML_PRESERVE_NODE @@ -3028,6 +3025,8 @@ htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) { COPY_BUF(l,buf,nbchar,cur); } if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { + buf[nbchar] = 0; + /* * Ok the segment is to be consumed as chars. */ @@ -3226,7 +3225,7 @@ htmlParsePI(htmlParserCtxtPtr ctxt) { } SKIP_BLANKS; cur = CUR_CHAR(l); - while (IS_CHAR(cur) && (cur != '>')) { + while ((cur != 0) && (cur != '>')) { if (len + 5 >= size) { xmlChar *tmp; @@ -3245,7 +3244,13 @@ htmlParsePI(htmlParserCtxtPtr ctxt) { GROW; count = 0; } - COPY_BUF(l,buf,len,cur); + if (IS_CHAR(cur)) { + COPY_BUF(l,buf,len,cur); + } else { + htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, + "Invalid char in processing instruction " + "0x%X\n", cur); + } NEXTL(l); cur = CUR_CHAR(l); if (cur == 0) { @@ -3315,15 +3320,15 @@ htmlParseComment(htmlParserCtxtPtr ctxt) { len = 0; buf[len] = 0; q = CUR_CHAR(ql); - if (!IS_CHAR(q)) + if (q == 0) goto unfinished; NEXTL(ql); r = CUR_CHAR(rl); - if (!IS_CHAR(r)) + if (r == 0) goto unfinished; NEXTL(rl); cur = CUR_CHAR(l); - while (IS_CHAR(cur) && + while ((cur != 0) && ((cur != '>') || (r != '-') || (q != '-'))) { if (len + 5 >= size) { @@ -3339,7 +3344,12 @@ htmlParseComment(htmlParserCtxtPtr ctxt) { } buf = tmp; } - COPY_BUF(ql,buf,len,q); + if (IS_CHAR(q)) { + COPY_BUF(ql,buf,len,q); + } else { + htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, + "Invalid char in comment 0x%X\n", q); + } q = r; ql = rl; r = cur; @@ -3353,7 +3363,7 @@ htmlParseComment(htmlParserCtxtPtr ctxt) { } } buf[len] = 0; - if (IS_CHAR(cur)) { + if (cur == '>') { NEXT; if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) && (!ctxt->disableSAX)) @@ -3394,13 +3404,16 @@ htmlParseCharRef(htmlParserCtxtPtr ctxt) { ((NXT(2) == 'x') || NXT(2) == 'X')) { SKIP(3); while (CUR != ';') { - if ((CUR >= '0') && (CUR <= '9')) - val = val * 16 + (CUR - '0'); - else if ((CUR >= 'a') && (CUR <= 'f')) - val = val * 16 + (CUR - 'a') + 10; - else if ((CUR >= 'A') && (CUR <= 'F')) - val = val * 16 + (CUR - 'A') + 10; - else { + if ((CUR >= '0') && (CUR <= '9')) { + if (val < 0x110000) + val = val * 16 + (CUR - '0'); + } else if ((CUR >= 'a') && (CUR <= 'f')) { + if (val < 0x110000) + val = val * 16 + (CUR - 'a') + 10; + } else if ((CUR >= 'A') && (CUR <= 'F')) { + if (val < 0x110000) + val = val * 16 + (CUR - 'A') + 10; + } else { htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF, "htmlParseCharRef: missing semicolon\n", NULL, NULL); @@ -3413,9 +3426,10 @@ htmlParseCharRef(htmlParserCtxtPtr ctxt) { } else if ((CUR == '&') && (NXT(1) == '#')) { SKIP(2); while (CUR != ';') { - if ((CUR >= '0') && (CUR <= '9')) - val = val * 10 + (CUR - '0'); - else { + if ((CUR >= '0') && (CUR <= '9')) { + if (val < 0x110000) + val = val * 10 + (CUR - '0'); + } else { htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF, "htmlParseCharRef: missing semicolon\n", NULL, NULL); @@ -3434,6 +3448,9 @@ htmlParseCharRef(htmlParserCtxtPtr ctxt) { */ if (IS_CHAR(val)) { return(val); + } else if (val >= 0x110000) { + htmlParseErr(ctxt, XML_ERR_INVALID_CHAR, + "htmlParseCharRef: value too large\n", NULL, NULL); } else { htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, "htmlParseCharRef: invalid xmlChar value %d\n", @@ -3493,9 +3510,12 @@ htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) { if (CUR != '>') { htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED, "DOCTYPE improperly terminated\n", NULL, NULL); - /* We shouldn't try to resynchronize ... */ + /* Ignore bogus content */ + while ((CUR != 0) && (CUR != '>')) + NEXT; } - NEXT; + if (CUR == '>') + NEXT; /* * Create or update the document accordingly to the DOCTYPE @@ -3773,7 +3793,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) { /* Dump the bogus tag like browsers do */ - while ((IS_CHAR_CH(CUR)) && (CUR != '>') && + while ((CUR != 0) && (CUR != '>') && (ctxt->instate != XML_PARSER_EOF)) NEXT; return -1; @@ -3829,7 +3849,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) { * (S Attribute)* S? */ SKIP_BLANKS; - while ((IS_CHAR_CH(CUR)) && + while ((CUR != 0) && (CUR != '>') && ((CUR != '/') || (NXT(1) != '>'))) { long cons = ctxt->nbChars; @@ -3892,7 +3912,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) { xmlFree(attvalue); /* Dump the bogus attribute string up to the next blank or * the end of the tag. */ - while ((IS_CHAR_CH(CUR)) && + while ((CUR != 0) && !(IS_BLANK_CH(CUR)) && (CUR != '>') && ((CUR != '/') || (NXT(1) != '>'))) NEXT; @@ -3973,19 +3993,14 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt) * We should definitely be at the ending "S? '>'" part */ SKIP_BLANKS; - if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) { + if (CUR != '>') { htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, "End tag : expected '>'\n", NULL, NULL); - if (ctxt->recovery) { - /* - * We're not at the ending > !! - * Error, unless in recover mode where we search forwards - * until we find a > - */ - while (CUR != '\0' && CUR != '>') NEXT; - NEXT; - } - } else + /* Skip to next '>' */ + while ((CUR != 0) && (CUR != '>')) + NEXT; + } + if (CUR == '>') NEXT; /* @@ -4175,7 +4190,7 @@ htmlParseContent(htmlParserCtxtPtr ctxt) { "htmlParseStartTag: invalid element name\n", NULL, NULL); /* Dump the bogus tag like browsers do */ - while ((IS_CHAR_CH(CUR)) && (CUR != '>')) + while ((CUR != 0) && (CUR != '>')) NEXT; if (currentNode != NULL) @@ -4390,7 +4405,7 @@ htmlParseElement(htmlParserCtxtPtr ctxt) { */ currentNode = xmlStrdup(ctxt->name); depth = ctxt->nameNr; - while (IS_CHAR_CH(CUR)) { + while (CUR != 0) { oldptr = ctxt->input->cur; htmlParseContent(ctxt); if (oldptr==ctxt->input->cur) break; @@ -4407,7 +4422,7 @@ htmlParseElement(htmlParserCtxtPtr ctxt) { node_info.node = ctxt->node; xmlParserAddNodeInfo(ctxt, &node_info); } - if (!IS_CHAR_CH(CUR)) { + if (CUR == 0) { htmlAutoCloseOnEnd(ctxt); } @@ -4428,7 +4443,7 @@ htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) { xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo); htmlNodeInfoPop(ctxt); } - if (!IS_CHAR_CH(CUR)) { + if (CUR == 0) { htmlAutoCloseOnEnd(ctxt); } } @@ -4577,7 +4592,7 @@ htmlParseContentInternal(htmlParserCtxtPtr ctxt) { "htmlParseStartTag: invalid element name\n", NULL, NULL); /* Dump the bogus tag like browsers do */ - while ((IS_CHAR_CH(CUR)) && (CUR != '>')) + while ((CUR == 0) && (CUR != '>')) NEXT; htmlParserFinishElementParsing(ctxt); @@ -5113,7 +5128,7 @@ htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) { * @first: the first char to lookup * @next: the next char to lookup or zero * @third: the next char to lookup or zero - * @comment: flag to force checking inside comments + * @ignoreattrval: skip over attribute values * * Try to find if a sequence (first, next, third) or just (first next) or * (first) is available in the input stream. @@ -5127,13 +5142,11 @@ htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) { */ static int htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first, - xmlChar next, xmlChar third, int iscomment, - int ignoreattrval) + xmlChar next, xmlChar third, int ignoreattrval) { int base, len; htmlParserInputPtr in; const xmlChar *buf; - int incomment = 0; int invalue = 0; char valdellim = 0x0; @@ -5145,8 +5158,11 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first, if (base < 0) return (-1); - if (ctxt->checkIndex > base) + if (ctxt->checkIndex > base) { base = ctxt->checkIndex; + /* Abuse hasPErefs member to restore current state. */ + invalue = ctxt->hasPErefs & 1 ? 1 : 0; + } if (in->buf == NULL) { buf = in->base; @@ -5162,14 +5178,6 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first, else if (next) len--; for (; base < len; base++) { - if ((!incomment) && (base + 4 < len) && (!iscomment)) { - if ((buf[base] == '<') && (buf[base + 1] == '!') && - (buf[base + 2] == '-') && (buf[base + 3] == '-')) { - incomment = 1; - /* do not increment past <! - some people use <!--> */ - base += 2; - } - } if (ignoreattrval) { if (buf[base] == '"' || buf[base] == '\'') { if (invalue) { @@ -5186,16 +5194,6 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first, continue; } } - if (incomment) { - if (base + 3 > len) - return (-1); - if ((buf[base] == '-') && (buf[base + 1] == '-') && - (buf[base + 2] == '>')) { - incomment = 0; - base += 2; - } - continue; - } if (buf[base] == first) { if (third != 0) { if ((buf[base + 1] != next) || (buf[base + 2] != third)) @@ -5222,8 +5220,12 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first, return (base - (in->cur - in->base)); } } - if ((!incomment) && (!invalue)) - ctxt->checkIndex = base; + ctxt->checkIndex = base; + /* Abuse hasPErefs member to track current state. */ + if (invalue) + ctxt->hasPErefs |= 1; + else + ctxt->hasPErefs &= ~1; #ifdef DEBUG_PUSH if (next == 0) xmlGenericError(xmlGenericErrorContext, @@ -5240,80 +5242,6 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first, } /** - * htmlParseLookupChars: - * @ctxt: an HTML parser context - * @stop: Array of chars, which stop the lookup. - * @stopLen: Length of stop-Array - * - * Try to find if any char of the stop-Array is available in the input - * stream. - * This function has a side effect of (possibly) incrementing ctxt->checkIndex - * to avoid rescanning sequences of bytes, it DOES change the state of the - * parser, do not use liberally. - * - * Returns the index to the current parsing point if a stopChar - * is available, -1 otherwise. - */ -static int -htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop, - int stopLen) -{ - int base, len; - htmlParserInputPtr in; - const xmlChar *buf; - int incomment = 0; - int i; - - in = ctxt->input; - if (in == NULL) - return (-1); - - base = in->cur - in->base; - if (base < 0) - return (-1); - - if (ctxt->checkIndex > base) - base = ctxt->checkIndex; - - if (in->buf == NULL) { - buf = in->base; - len = in->length; - } else { - buf = xmlBufContent(in->buf->buffer); - len = xmlBufUse(in->buf->buffer); - } - - for (; base < len; base++) { - if (!incomment && (base + 4 < len)) { - if ((buf[base] == '<') && (buf[base + 1] == '!') && - (buf[base + 2] == '-') && (buf[base + 3] == '-')) { - incomment = 1; - /* do not increment past <! - some people use <!--> */ - base += 2; - } - } - if (incomment) { - if (base + 3 > len) - return (-1); - if ((buf[base] == '-') && (buf[base + 1] == '-') && - (buf[base + 2] == '>')) { - incomment = 0; - base += 2; - } - continue; - } - for (i = 0; i < stopLen; ++i) { - if (buf[base] == stop[i]) { - ctxt->checkIndex = 0; - return (base - (in->cur - in->base)); - } - } - } - ctxt->checkIndex = base; - return (-1); -} - -/** * htmlParseTryOrFinish: * @ctxt: an HTML parser context * @terminate: last chunk indicator @@ -5326,7 +5254,7 @@ static int htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { int ret = 0; htmlParserInputPtr in; - int avail = 0; + ptrdiff_t avail = 0; xmlChar cur, next; htmlParserNodeInfo node_info; @@ -5391,7 +5319,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { if (in->buf == NULL) avail = in->length - (in->cur - in->base); else - avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); + avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) - + (in->cur - in->base); if ((avail == 0) && (terminate)) { htmlAutoCloseOnEnd(ctxt); if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) { @@ -5405,6 +5334,12 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { } if (avail < 1) goto done; + /* + * This is done to make progress and avoid an infinite loop + * if a parsing attempt was aborted by hitting a NUL byte. After + * changing htmlCurrentChar, this probably isn't necessary anymore. + * We should consider removing this check. + */ cur = in->cur[0]; if (cur == 0) { SKIP(1); @@ -5427,7 +5362,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { if (in->buf == NULL) avail = in->length - (in->cur - in->base); else - avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); + avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) - + (in->cur - in->base); } if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) ctxt->sax->setDocumentLocator(ctxt->userData, @@ -5444,7 +5380,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { (UPP(6) == 'Y') && (UPP(7) == 'P') && (UPP(8) == 'E')) { if ((!terminate) && - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) + (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0)) goto done; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, @@ -5469,14 +5405,15 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { if (in->buf == NULL) avail = in->length - (in->cur - in->base); else - avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); + avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) - + (in->cur - in->base); /* * no chars in buffer */ if (avail < 1) goto done; /* - * not enouth chars in buffer + * not enough chars in buffer */ if (avail < 2) { if (!terminate) @@ -5490,7 +5427,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { if ((cur == '<') && (next == '!') && (in->cur[2] == '-') && (in->cur[3] == '-')) { if ((!terminate) && - (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) + (htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0)) goto done; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, @@ -5500,7 +5437,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { ctxt->instate = XML_PARSER_MISC; } else if ((cur == '<') && (next == '?')) { if ((!terminate) && - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) + (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) goto done; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, @@ -5514,7 +5451,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { (UPP(6) == 'Y') && (UPP(7) == 'P') && (UPP(8) == 'E')) { if ((!terminate) && - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) + (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0)) goto done; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, @@ -5530,7 +5467,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { (avail < 9)) { goto done; } else { - ctxt->instate = XML_PARSER_START_TAG; + ctxt->instate = XML_PARSER_CONTENT; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, "HPP: entering START_TAG\n"); @@ -5542,7 +5479,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { if (in->buf == NULL) avail = in->length - (in->cur - in->base); else - avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); + avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) - + (in->cur - in->base); if (avail < 2) goto done; cur = in->cur[0]; @@ -5550,7 +5488,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { if ((cur == '<') && (next == '!') && (in->cur[2] == '-') && (in->cur[3] == '-')) { if ((!terminate) && - (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) + (htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0)) goto done; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, @@ -5560,7 +5498,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { ctxt->instate = XML_PARSER_PROLOG; } else if ((cur == '<') && (next == '?')) { if ((!terminate) && - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) + (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) goto done; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, @@ -5572,7 +5510,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { (avail < 4)) { goto done; } else { - ctxt->instate = XML_PARSER_START_TAG; + ctxt->instate = XML_PARSER_CONTENT; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, "HPP: entering START_TAG\n"); @@ -5583,7 +5521,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { if (in->buf == NULL) avail = in->length - (in->cur - in->base); else - avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); + avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) - + (in->cur - in->base); if (avail < 1) goto done; cur = in->cur[0]; @@ -5597,7 +5536,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { if ((cur == '<') && (next == '!') && (in->cur[2] == '-') && (in->cur[3] == '-')) { if ((!terminate) && - (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0)) + (htmlParseLookupSequence(ctxt, '-', '-', '>', 0) < 0)) goto done; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, @@ -5607,7 +5546,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { ctxt->instate = XML_PARSER_EPILOG; } else if ((cur == '<') && (next == '?')) { if ((!terminate) && - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) + (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) goto done; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, @@ -5642,7 +5581,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { if (avail < 1) goto done; /* - * not enouth chars in buffer + * not enough chars in buffer */ if (avail < 2) { if (!terminate) @@ -5671,7 +5610,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { break; } if ((!terminate) && - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) + (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0)) goto done; /* Capture start position */ @@ -5762,13 +5701,13 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { break; } case XML_PARSER_CONTENT: { + xmlChar chr[2] = { 0, 0 }; long cons; + /* * Handle preparsed entities and charRef */ if (ctxt->token != 0) { - xmlChar chr[2] = { 0 , 0 } ; - chr[0] = (xmlChar) ctxt->token; htmlCheckParagraph(ctxt); if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) @@ -5780,21 +5719,22 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { cur = in->cur[0]; if ((cur != '<') && (cur != '&')) { if (ctxt->sax != NULL) { + chr[0] = cur; if (IS_BLANK_CH(cur)) { if (ctxt->keepBlanks) { if (ctxt->sax->characters != NULL) ctxt->sax->characters( - ctxt->userData, &in->cur[0], 1); + ctxt->userData, chr, 1); } else { if (ctxt->sax->ignorableWhitespace != NULL) ctxt->sax->ignorableWhitespace( - ctxt->userData, &in->cur[0], 1); + ctxt->userData, chr, 1); } } else { htmlCheckParagraph(ctxt); if (ctxt->sax->characters != NULL) ctxt->sax->characters( - ctxt->userData, &in->cur[0], 1); + ctxt->userData, chr, 1); } } ctxt->token = 0; @@ -5817,7 +5757,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { int idx; xmlChar val; - idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0); + idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0); if (idx < 0) goto done; val = in->cur[idx + 2]; @@ -5844,7 +5784,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { (UPP(6) == 'Y') && (UPP(7) == 'P') && (UPP(8) == 'E')) { if ((!terminate) && - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) + (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0)) goto done; htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, "Misplaced DOCTYPE declaration\n", @@ -5854,7 +5794,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { (in->cur[2] == '-') && (in->cur[3] == '-')) { if ((!terminate) && (htmlParseLookupSequence( - ctxt, '-', '-', '>', 1, 1) < 0)) + ctxt, '-', '-', '>', 0) < 0)) goto done; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, @@ -5864,7 +5804,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { ctxt->instate = XML_PARSER_CONTENT; } else if ((cur == '<') && (next == '?')) { if ((!terminate) && - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) + (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) goto done; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, @@ -5883,24 +5823,35 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { #endif break; } else if (cur == '<') { - ctxt->instate = XML_PARSER_START_TAG; - ctxt->checkIndex = 0; + if ((!terminate) && (next == 0)) + goto done; + /* + * Only switch to START_TAG if the next character + * starts a valid name. Otherwise, htmlParseStartTag + * might return without consuming all characters + * up to the final '>'. + */ + if ((IS_ASCII_LETTER(next)) || + (next == '_') || (next == ':') || (next == '.')) { + ctxt->instate = XML_PARSER_START_TAG; + ctxt->checkIndex = 0; #ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: entering START_TAG\n"); + xmlGenericError(xmlGenericErrorContext, + "HPP: entering START_TAG\n"); #endif + } else { + htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED, + "htmlParseTryOrFinish: " + "invalid element name\n", + NULL, NULL); + htmlCheckParagraph(ctxt); + if ((ctxt->sax != NULL) && + (ctxt->sax->characters != NULL)) + ctxt->sax->characters(ctxt->userData, + in->cur, 1); + NEXT; + } break; - } else if (cur == '&') { - if ((!terminate) && - (htmlParseLookupChars(ctxt, - BAD_CAST "; >/", 4) < 0)) - goto done; -#ifdef DEBUG_PUSH - xmlGenericError(xmlGenericErrorContext, - "HPP: Parsing Reference\n"); -#endif - /* TODO: check generation of subtrees if noent !!! */ - htmlParseReference(ctxt); } else { /* * check that the text sequence is complete @@ -5909,14 +5860,21 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { * data detection. */ if ((!terminate) && - (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0)) + (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0)) goto done; ctxt->checkIndex = 0; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, "HPP: Parsing char data\n"); #endif - htmlParseCharData(ctxt); + while ((cur != '<') && (cur != 0)) { + if (cur == '&') { + htmlParseReference(ctxt); + } else { + htmlParseCharData(ctxt); + } + cur = in->cur[0]; + } } } if (cons == ctxt->nbChars) { @@ -5935,7 +5893,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { if (avail < 2) goto done; if ((!terminate) && - (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0)) + (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) goto done; htmlParseEndTag(ctxt); if (ctxt->nameNr == 0) { @@ -6117,12 +6075,12 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size, int res; res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk); + xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur); if (res < 0) { ctxt->errNo = XML_PARSER_EOF; ctxt->disableSAX = 1; return (XML_PARSER_EOF); } - xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur); #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); #endif @@ -6141,12 +6099,12 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size, size_t current = ctxt->input->cur - ctxt->input->base; nbchars = xmlCharEncInput(in, terminate); + xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current); if (nbchars < 0) { htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, "encoder error\n", NULL, NULL); return(XML_ERR_INVALID_ENCODING); } - xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current); } } } |
