diff options
author | Daniel Veillard <veillard@src.gnome.org> | 2006-02-12 19:14:15 +0000 |
---|---|---|
committer | Daniel Veillard <veillard@src.gnome.org> | 2006-02-12 19:14:15 +0000 |
commit | fc011b7fb895ed3b43f6c2e5b7a6fbfc078da349 (patch) | |
tree | e6469e0cf96be8931a89abc238457f834ebf8169 /xmlregexp.c | |
parent | 73dd71ecdde271105c42e76357cbb063545d8817 (diff) | |
download | android_external_libxml2-fc011b7fb895ed3b43f6c2e5b7a6fbfc078da349.tar.gz android_external_libxml2-fc011b7fb895ed3b43f6c2e5b7a6fbfc078da349.tar.bz2 android_external_libxml2-fc011b7fb895ed3b43f6c2e5b7a6fbfc078da349.zip |
bug fixes for #327167 as well as some cleanups and more thorough tests on
* xmlregexp.c: bug fixes for #327167 as well as some cleanups
and more thorough tests on atoms comparisons.
Daniel
Diffstat (limited to 'xmlregexp.c')
-rw-r--r-- | xmlregexp.c | 283 |
1 files changed, 267 insertions, 16 deletions
diff --git a/xmlregexp.c b/xmlregexp.c index de581f0b..21091f39 100644 --- a/xmlregexp.c +++ b/xmlregexp.c @@ -71,6 +71,9 @@ * * ************************************************************************/ +/* + * Note: the order of the enums below is significant, do not shuffle + */ typedef enum { XML_REGEXP_EPSILON = 1, XML_REGEXP_CHARVAL, @@ -2054,34 +2057,281 @@ xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) { } /** + * xmlFACompareAtomTypes: + * @type1: an atom type + * @type2: an atom type + * + * Compares two atoms type to check whether they intersect in some ways, + * this is used by xmlFACompareAtoms only + * + * Returns 1 if they may intersect and 0 otherwise + */ +static int +xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) { + if ((type1 == XML_REGEXP_EPSILON) || + (type1 == XML_REGEXP_CHARVAL) || + (type1 == XML_REGEXP_RANGES) || + (type1 == XML_REGEXP_SUBREG) || + (type1 == XML_REGEXP_STRING) || + (type1 == XML_REGEXP_ANYCHAR)) + return(1); + if ((type2 == XML_REGEXP_EPSILON) || + (type2 == XML_REGEXP_CHARVAL) || + (type2 == XML_REGEXP_RANGES) || + (type2 == XML_REGEXP_SUBREG) || + (type2 == XML_REGEXP_STRING) || + (type2 == XML_REGEXP_ANYCHAR)) + return(1); + + if (type1 == type2) return(1); + + /* simplify subsequent compares by making sure type1 < type2 */ + if (type1 > type2) { + xmlRegAtomType tmp = type1; + type1 = type2; + type2 = tmp; + } + switch (type1) { + case XML_REGEXP_ANYSPACE: /* \s */ + /* can't be a letter, number, mark, pontuation, symbol */ + if ((type2 == XML_REGEXP_NOTSPACE) || + ((type2 >= XML_REGEXP_LETTER) && + (type2 <= XML_REGEXP_LETTER_OTHERS)) || + ((type2 >= XML_REGEXP_NUMBER) && + (type2 <= XML_REGEXP_NUMBER_OTHERS)) || + ((type2 >= XML_REGEXP_MARK) && + (type2 <= XML_REGEXP_MARK_ENCLOSING)) || + ((type2 >= XML_REGEXP_PUNCT) && + (type2 <= XML_REGEXP_PUNCT_OTHERS)) || + ((type2 >= XML_REGEXP_SYMBOL) && + (type2 <= XML_REGEXP_SYMBOL_OTHERS)) + ) return(0); + break; + case XML_REGEXP_NOTSPACE: /* \S */ + break; + case XML_REGEXP_INITNAME: /* \l */ + /* can't be a number, mark, separator, pontuation, symbol or other */ + if ((type2 == XML_REGEXP_NOTINITNAME) || + ((type2 >= XML_REGEXP_NUMBER) && + (type2 <= XML_REGEXP_NUMBER_OTHERS)) || + ((type2 >= XML_REGEXP_MARK) && + (type2 <= XML_REGEXP_MARK_ENCLOSING)) || + ((type2 >= XML_REGEXP_SEPAR) && + (type2 <= XML_REGEXP_SEPAR_PARA)) || + ((type2 >= XML_REGEXP_PUNCT) && + (type2 <= XML_REGEXP_PUNCT_OTHERS)) || + ((type2 >= XML_REGEXP_SYMBOL) && + (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || + ((type2 >= XML_REGEXP_OTHER) && + (type2 <= XML_REGEXP_OTHER_NA)) + ) return(0); + break; + case XML_REGEXP_NOTINITNAME: /* \L */ + break; + case XML_REGEXP_NAMECHAR: /* \c */ + /* can't be a mark, separator, pontuation, symbol or other */ + if ((type2 == XML_REGEXP_NOTNAMECHAR) || + ((type2 >= XML_REGEXP_MARK) && + (type2 <= XML_REGEXP_MARK_ENCLOSING)) || + ((type2 >= XML_REGEXP_PUNCT) && + (type2 <= XML_REGEXP_PUNCT_OTHERS)) || + ((type2 >= XML_REGEXP_SEPAR) && + (type2 <= XML_REGEXP_SEPAR_PARA)) || + ((type2 >= XML_REGEXP_SYMBOL) && + (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || + ((type2 >= XML_REGEXP_OTHER) && + (type2 <= XML_REGEXP_OTHER_NA)) + ) return(0); + break; + case XML_REGEXP_NOTNAMECHAR: /* \C */ + break; + case XML_REGEXP_DECIMAL: /* \d */ + /* can't be a letter, mark, separator, pontuation, symbol or other */ + if ((type2 == XML_REGEXP_NOTDECIMAL) || + (type2 == XML_REGEXP_REALCHAR) || + ((type2 >= XML_REGEXP_LETTER) && + (type2 <= XML_REGEXP_LETTER_OTHERS)) || + ((type2 >= XML_REGEXP_MARK) && + (type2 <= XML_REGEXP_MARK_ENCLOSING)) || + ((type2 >= XML_REGEXP_PUNCT) && + (type2 <= XML_REGEXP_PUNCT_OTHERS)) || + ((type2 >= XML_REGEXP_SEPAR) && + (type2 <= XML_REGEXP_SEPAR_PARA)) || + ((type2 >= XML_REGEXP_SYMBOL) && + (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || + ((type2 >= XML_REGEXP_OTHER) && + (type2 <= XML_REGEXP_OTHER_NA)) + )return(0); + break; + case XML_REGEXP_NOTDECIMAL: /* \D */ + break; + case XML_REGEXP_REALCHAR: /* \w */ + /* can't be a mark, separator, pontuation, symbol or other */ + if ((type2 == XML_REGEXP_NOTDECIMAL) || + ((type2 >= XML_REGEXP_MARK) && + (type2 <= XML_REGEXP_MARK_ENCLOSING)) || + ((type2 >= XML_REGEXP_PUNCT) && + (type2 <= XML_REGEXP_PUNCT_OTHERS)) || + ((type2 >= XML_REGEXP_SEPAR) && + (type2 <= XML_REGEXP_SEPAR_PARA)) || + ((type2 >= XML_REGEXP_SYMBOL) && + (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || + ((type2 >= XML_REGEXP_OTHER) && + (type2 <= XML_REGEXP_OTHER_NA)) + )return(0); + break; + case XML_REGEXP_NOTREALCHAR: /* \W */ + break; + /* + * at that point we know both type 1 and type2 are from + * character categories are ordered and are different, + * it becomes simple because this is a partition + */ + case XML_REGEXP_LETTER: + if (type2 <= XML_REGEXP_LETTER_OTHERS) + return(1); + return(0); + case XML_REGEXP_LETTER_UPPERCASE: + case XML_REGEXP_LETTER_LOWERCASE: + case XML_REGEXP_LETTER_TITLECASE: + case XML_REGEXP_LETTER_MODIFIER: + case XML_REGEXP_LETTER_OTHERS: + return(0); + case XML_REGEXP_MARK: + if (type2 <= XML_REGEXP_MARK_ENCLOSING) + return(1); + return(0); + case XML_REGEXP_MARK_NONSPACING: + case XML_REGEXP_MARK_SPACECOMBINING: + case XML_REGEXP_MARK_ENCLOSING: + return(0); + case XML_REGEXP_NUMBER: + if (type2 <= XML_REGEXP_NUMBER_OTHERS) + return(1); + return(0); + case XML_REGEXP_NUMBER_DECIMAL: + case XML_REGEXP_NUMBER_LETTER: + case XML_REGEXP_NUMBER_OTHERS: + return(0); + case XML_REGEXP_PUNCT: + if (type2 <= XML_REGEXP_PUNCT_OTHERS) + return(1); + return(0); + case XML_REGEXP_PUNCT_CONNECTOR: + case XML_REGEXP_PUNCT_DASH: + case XML_REGEXP_PUNCT_OPEN: + case XML_REGEXP_PUNCT_CLOSE: + case XML_REGEXP_PUNCT_INITQUOTE: + case XML_REGEXP_PUNCT_FINQUOTE: + case XML_REGEXP_PUNCT_OTHERS: + return(0); + case XML_REGEXP_SEPAR: + if (type2 <= XML_REGEXP_SEPAR_PARA) + return(1); + return(0); + case XML_REGEXP_SEPAR_SPACE: + case XML_REGEXP_SEPAR_LINE: + case XML_REGEXP_SEPAR_PARA: + return(0); + case XML_REGEXP_SYMBOL: + if (type2 <= XML_REGEXP_SYMBOL_OTHERS) + return(1); + return(0); + case XML_REGEXP_SYMBOL_MATH: + case XML_REGEXP_SYMBOL_CURRENCY: + case XML_REGEXP_SYMBOL_MODIFIER: + case XML_REGEXP_SYMBOL_OTHERS: + return(0); + case XML_REGEXP_OTHER: + if (type2 <= XML_REGEXP_OTHER_NA) + return(1); + return(0); + case XML_REGEXP_OTHER_CONTROL: + case XML_REGEXP_OTHER_FORMAT: + case XML_REGEXP_OTHER_PRIVATE: + case XML_REGEXP_OTHER_NA: + return(0); + default: + break; + } + return(1); +} + +/** + * xmlFAEqualAtoms: + * @atom1: an atom + * @atom2: an atom + * + * Compares two atoms to check whether they are the same exactly + * this is used to remove equivalent transitions + * + * Returns 1 if same and 0 otherwise + */ +static int +xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) { + int ret = 0; + + if (atom1 == atom2) + return(1); + if ((atom1 == NULL) || (atom2 == NULL)) + return(0); + + if (atom1->type != atom2->type) + return(0); + switch (atom1->type) { + case XML_REGEXP_EPSILON: + ret = 0; + break; + case XML_REGEXP_STRING: + ret = xmlStrEqual((xmlChar *)atom1->valuep, + (xmlChar *)atom2->valuep); + break; + case XML_REGEXP_CHARVAL: + ret = (atom1->codepoint == atom2->codepoint); + break; + case XML_REGEXP_RANGES: + /* too hard to do in the general case */ + ret = 0; + default: + break; + } + return(ret); +} + +/** * xmlFACompareAtoms: * @atom1: an atom * @atom2: an atom * * Compares two atoms to check whether they intersect in some ways, - * this is used by xmlFAComputesDeterminism only + * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only * * Returns 1 if yes and 0 otherwise */ static int xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) { - int ret; + int ret = 1; if (atom1 == atom2) return(1); if ((atom1 == NULL) || (atom2 == NULL)) return(0); - if ((atom1->type == XML_REGEXP_RANGES) && - (atom2->type == XML_REGEXP_CHARVAL)) { - } else if ((atom1->type == XML_REGEXP_CHARVAL) && - (atom2->type == XML_REGEXP_RANGES)) { + if ((atom1->type == XML_REGEXP_ANYCHAR) || + (atom2->type == XML_REGEXP_ANYCHAR)) + return(1); + + if (atom1->type > atom2->type) { xmlRegAtomPtr tmp; tmp = atom1; atom1 = atom2; atom2 = tmp; - } else if (atom1->type != atom2->type) { - return(0); + } + if (atom1->type != atom2->type) { + ret = xmlFACompareAtomTypes(atom1->type, atom2->type); + /* if they can't intersect at the type level break now */ + if (ret == 0) + return(0); } switch (atom1->type) { case XML_REGEXP_STRING: @@ -2091,15 +2341,16 @@ xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) { case XML_REGEXP_EPSILON: goto not_determinist; case XML_REGEXP_CHARVAL: - ret = (atom1->codepoint == atom2->codepoint); - break; - case XML_REGEXP_RANGES: if (atom2->type == XML_REGEXP_CHARVAL) { - ret = xmlRegCheckCharacter(atom1, atom2->codepoint); - if (ret < 0) - return(-1); - break; + ret = (atom1->codepoint == atom2->codepoint); } else { + ret = xmlRegCheckCharacter(atom2, atom1->codepoint); + if (ret < 0) + ret = 1; + } + break; + case XML_REGEXP_RANGES: + if (atom2->type == XML_REGEXP_RANGES) { int i, j, res; xmlRegRangePtr r1, r2; @@ -2233,7 +2484,7 @@ xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { continue; if (t2->atom != NULL) { if (t1->to == t2->to) { - if (xmlFACompareAtoms(t1->atom, t2->atom)) + if (xmlFAEqualAtoms(t1->atom, t2->atom)) t2->to = -1; /* eliminated */ } } |