aboutsummaryrefslogtreecommitdiffstats
path: root/xmlregexp.c
diff options
context:
space:
mode:
authorDaniel Veillard <veillard@src.gnome.org>2006-02-12 19:14:15 +0000
committerDaniel Veillard <veillard@src.gnome.org>2006-02-12 19:14:15 +0000
commitfc011b7fb895ed3b43f6c2e5b7a6fbfc078da349 (patch)
treee6469e0cf96be8931a89abc238457f834ebf8169 /xmlregexp.c
parent73dd71ecdde271105c42e76357cbb063545d8817 (diff)
downloadandroid_external_libxml2-fc011b7fb895ed3b43f6c2e5b7a6fbfc078da349.tar.gz
android_external_libxml2-fc011b7fb895ed3b43f6c2e5b7a6fbfc078da349.tar.bz2
android_external_libxml2-fc011b7fb895ed3b43f6c2e5b7a6fbfc078da349.zip
bug fixes for #327167 as well as some cleanups and more thorough tests on
* xmlregexp.c: bug fixes for #327167 as well as some cleanups and more thorough tests on atoms comparisons. Daniel
Diffstat (limited to 'xmlregexp.c')
-rw-r--r--xmlregexp.c283
1 files changed, 267 insertions, 16 deletions
diff --git a/xmlregexp.c b/xmlregexp.c
index de581f0b..21091f39 100644
--- a/xmlregexp.c
+++ b/xmlregexp.c
@@ -71,6 +71,9 @@
* *
************************************************************************/
+/*
+ * Note: the order of the enums below is significant, do not shuffle
+ */
typedef enum {
XML_REGEXP_EPSILON = 1,
XML_REGEXP_CHARVAL,
@@ -2054,34 +2057,281 @@ xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) {
}
/**
+ * xmlFACompareAtomTypes:
+ * @type1: an atom type
+ * @type2: an atom type
+ *
+ * Compares two atoms type to check whether they intersect in some ways,
+ * this is used by xmlFACompareAtoms only
+ *
+ * Returns 1 if they may intersect and 0 otherwise
+ */
+static int
+xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) {
+ if ((type1 == XML_REGEXP_EPSILON) ||
+ (type1 == XML_REGEXP_CHARVAL) ||
+ (type1 == XML_REGEXP_RANGES) ||
+ (type1 == XML_REGEXP_SUBREG) ||
+ (type1 == XML_REGEXP_STRING) ||
+ (type1 == XML_REGEXP_ANYCHAR))
+ return(1);
+ if ((type2 == XML_REGEXP_EPSILON) ||
+ (type2 == XML_REGEXP_CHARVAL) ||
+ (type2 == XML_REGEXP_RANGES) ||
+ (type2 == XML_REGEXP_SUBREG) ||
+ (type2 == XML_REGEXP_STRING) ||
+ (type2 == XML_REGEXP_ANYCHAR))
+ return(1);
+
+ if (type1 == type2) return(1);
+
+ /* simplify subsequent compares by making sure type1 < type2 */
+ if (type1 > type2) {
+ xmlRegAtomType tmp = type1;
+ type1 = type2;
+ type2 = tmp;
+ }
+ switch (type1) {
+ case XML_REGEXP_ANYSPACE: /* \s */
+ /* can't be a letter, number, mark, pontuation, symbol */
+ if ((type2 == XML_REGEXP_NOTSPACE) ||
+ ((type2 >= XML_REGEXP_LETTER) &&
+ (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
+ ((type2 >= XML_REGEXP_NUMBER) &&
+ (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
+ ((type2 >= XML_REGEXP_MARK) &&
+ (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
+ ((type2 >= XML_REGEXP_PUNCT) &&
+ (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
+ ((type2 >= XML_REGEXP_SYMBOL) &&
+ (type2 <= XML_REGEXP_SYMBOL_OTHERS))
+ ) return(0);
+ break;
+ case XML_REGEXP_NOTSPACE: /* \S */
+ break;
+ case XML_REGEXP_INITNAME: /* \l */
+ /* can't be a number, mark, separator, pontuation, symbol or other */
+ if ((type2 == XML_REGEXP_NOTINITNAME) ||
+ ((type2 >= XML_REGEXP_NUMBER) &&
+ (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
+ ((type2 >= XML_REGEXP_MARK) &&
+ (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
+ ((type2 >= XML_REGEXP_SEPAR) &&
+ (type2 <= XML_REGEXP_SEPAR_PARA)) ||
+ ((type2 >= XML_REGEXP_PUNCT) &&
+ (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
+ ((type2 >= XML_REGEXP_SYMBOL) &&
+ (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
+ ((type2 >= XML_REGEXP_OTHER) &&
+ (type2 <= XML_REGEXP_OTHER_NA))
+ ) return(0);
+ break;
+ case XML_REGEXP_NOTINITNAME: /* \L */
+ break;
+ case XML_REGEXP_NAMECHAR: /* \c */
+ /* can't be a mark, separator, pontuation, symbol or other */
+ if ((type2 == XML_REGEXP_NOTNAMECHAR) ||
+ ((type2 >= XML_REGEXP_MARK) &&
+ (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
+ ((type2 >= XML_REGEXP_PUNCT) &&
+ (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
+ ((type2 >= XML_REGEXP_SEPAR) &&
+ (type2 <= XML_REGEXP_SEPAR_PARA)) ||
+ ((type2 >= XML_REGEXP_SYMBOL) &&
+ (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
+ ((type2 >= XML_REGEXP_OTHER) &&
+ (type2 <= XML_REGEXP_OTHER_NA))
+ ) return(0);
+ break;
+ case XML_REGEXP_NOTNAMECHAR: /* \C */
+ break;
+ case XML_REGEXP_DECIMAL: /* \d */
+ /* can't be a letter, mark, separator, pontuation, symbol or other */
+ if ((type2 == XML_REGEXP_NOTDECIMAL) ||
+ (type2 == XML_REGEXP_REALCHAR) ||
+ ((type2 >= XML_REGEXP_LETTER) &&
+ (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
+ ((type2 >= XML_REGEXP_MARK) &&
+ (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
+ ((type2 >= XML_REGEXP_PUNCT) &&
+ (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
+ ((type2 >= XML_REGEXP_SEPAR) &&
+ (type2 <= XML_REGEXP_SEPAR_PARA)) ||
+ ((type2 >= XML_REGEXP_SYMBOL) &&
+ (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
+ ((type2 >= XML_REGEXP_OTHER) &&
+ (type2 <= XML_REGEXP_OTHER_NA))
+ )return(0);
+ break;
+ case XML_REGEXP_NOTDECIMAL: /* \D */
+ break;
+ case XML_REGEXP_REALCHAR: /* \w */
+ /* can't be a mark, separator, pontuation, symbol or other */
+ if ((type2 == XML_REGEXP_NOTDECIMAL) ||
+ ((type2 >= XML_REGEXP_MARK) &&
+ (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
+ ((type2 >= XML_REGEXP_PUNCT) &&
+ (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
+ ((type2 >= XML_REGEXP_SEPAR) &&
+ (type2 <= XML_REGEXP_SEPAR_PARA)) ||
+ ((type2 >= XML_REGEXP_SYMBOL) &&
+ (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
+ ((type2 >= XML_REGEXP_OTHER) &&
+ (type2 <= XML_REGEXP_OTHER_NA))
+ )return(0);
+ break;
+ case XML_REGEXP_NOTREALCHAR: /* \W */
+ break;
+ /*
+ * at that point we know both type 1 and type2 are from
+ * character categories are ordered and are different,
+ * it becomes simple because this is a partition
+ */
+ case XML_REGEXP_LETTER:
+ if (type2 <= XML_REGEXP_LETTER_OTHERS)
+ return(1);
+ return(0);
+ case XML_REGEXP_LETTER_UPPERCASE:
+ case XML_REGEXP_LETTER_LOWERCASE:
+ case XML_REGEXP_LETTER_TITLECASE:
+ case XML_REGEXP_LETTER_MODIFIER:
+ case XML_REGEXP_LETTER_OTHERS:
+ return(0);
+ case XML_REGEXP_MARK:
+ if (type2 <= XML_REGEXP_MARK_ENCLOSING)
+ return(1);
+ return(0);
+ case XML_REGEXP_MARK_NONSPACING:
+ case XML_REGEXP_MARK_SPACECOMBINING:
+ case XML_REGEXP_MARK_ENCLOSING:
+ return(0);
+ case XML_REGEXP_NUMBER:
+ if (type2 <= XML_REGEXP_NUMBER_OTHERS)
+ return(1);
+ return(0);
+ case XML_REGEXP_NUMBER_DECIMAL:
+ case XML_REGEXP_NUMBER_LETTER:
+ case XML_REGEXP_NUMBER_OTHERS:
+ return(0);
+ case XML_REGEXP_PUNCT:
+ if (type2 <= XML_REGEXP_PUNCT_OTHERS)
+ return(1);
+ return(0);
+ case XML_REGEXP_PUNCT_CONNECTOR:
+ case XML_REGEXP_PUNCT_DASH:
+ case XML_REGEXP_PUNCT_OPEN:
+ case XML_REGEXP_PUNCT_CLOSE:
+ case XML_REGEXP_PUNCT_INITQUOTE:
+ case XML_REGEXP_PUNCT_FINQUOTE:
+ case XML_REGEXP_PUNCT_OTHERS:
+ return(0);
+ case XML_REGEXP_SEPAR:
+ if (type2 <= XML_REGEXP_SEPAR_PARA)
+ return(1);
+ return(0);
+ case XML_REGEXP_SEPAR_SPACE:
+ case XML_REGEXP_SEPAR_LINE:
+ case XML_REGEXP_SEPAR_PARA:
+ return(0);
+ case XML_REGEXP_SYMBOL:
+ if (type2 <= XML_REGEXP_SYMBOL_OTHERS)
+ return(1);
+ return(0);
+ case XML_REGEXP_SYMBOL_MATH:
+ case XML_REGEXP_SYMBOL_CURRENCY:
+ case XML_REGEXP_SYMBOL_MODIFIER:
+ case XML_REGEXP_SYMBOL_OTHERS:
+ return(0);
+ case XML_REGEXP_OTHER:
+ if (type2 <= XML_REGEXP_OTHER_NA)
+ return(1);
+ return(0);
+ case XML_REGEXP_OTHER_CONTROL:
+ case XML_REGEXP_OTHER_FORMAT:
+ case XML_REGEXP_OTHER_PRIVATE:
+ case XML_REGEXP_OTHER_NA:
+ return(0);
+ default:
+ break;
+ }
+ return(1);
+}
+
+/**
+ * xmlFAEqualAtoms:
+ * @atom1: an atom
+ * @atom2: an atom
+ *
+ * Compares two atoms to check whether they are the same exactly
+ * this is used to remove equivalent transitions
+ *
+ * Returns 1 if same and 0 otherwise
+ */
+static int
+xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) {
+ int ret = 0;
+
+ if (atom1 == atom2)
+ return(1);
+ if ((atom1 == NULL) || (atom2 == NULL))
+ return(0);
+
+ if (atom1->type != atom2->type)
+ return(0);
+ switch (atom1->type) {
+ case XML_REGEXP_EPSILON:
+ ret = 0;
+ break;
+ case XML_REGEXP_STRING:
+ ret = xmlStrEqual((xmlChar *)atom1->valuep,
+ (xmlChar *)atom2->valuep);
+ break;
+ case XML_REGEXP_CHARVAL:
+ ret = (atom1->codepoint == atom2->codepoint);
+ break;
+ case XML_REGEXP_RANGES:
+ /* too hard to do in the general case */
+ ret = 0;
+ default:
+ break;
+ }
+ return(ret);
+}
+
+/**
* xmlFACompareAtoms:
* @atom1: an atom
* @atom2: an atom
*
* Compares two atoms to check whether they intersect in some ways,
- * this is used by xmlFAComputesDeterminism only
+ * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only
*
* Returns 1 if yes and 0 otherwise
*/
static int
xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) {
- int ret;
+ int ret = 1;
if (atom1 == atom2)
return(1);
if ((atom1 == NULL) || (atom2 == NULL))
return(0);
- if ((atom1->type == XML_REGEXP_RANGES) &&
- (atom2->type == XML_REGEXP_CHARVAL)) {
- } else if ((atom1->type == XML_REGEXP_CHARVAL) &&
- (atom2->type == XML_REGEXP_RANGES)) {
+ if ((atom1->type == XML_REGEXP_ANYCHAR) ||
+ (atom2->type == XML_REGEXP_ANYCHAR))
+ return(1);
+
+ if (atom1->type > atom2->type) {
xmlRegAtomPtr tmp;
tmp = atom1;
atom1 = atom2;
atom2 = tmp;
- } else if (atom1->type != atom2->type) {
- return(0);
+ }
+ if (atom1->type != atom2->type) {
+ ret = xmlFACompareAtomTypes(atom1->type, atom2->type);
+ /* if they can't intersect at the type level break now */
+ if (ret == 0)
+ return(0);
}
switch (atom1->type) {
case XML_REGEXP_STRING:
@@ -2091,15 +2341,16 @@ xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) {
case XML_REGEXP_EPSILON:
goto not_determinist;
case XML_REGEXP_CHARVAL:
- ret = (atom1->codepoint == atom2->codepoint);
- break;
- case XML_REGEXP_RANGES:
if (atom2->type == XML_REGEXP_CHARVAL) {
- ret = xmlRegCheckCharacter(atom1, atom2->codepoint);
- if (ret < 0)
- return(-1);
- break;
+ ret = (atom1->codepoint == atom2->codepoint);
} else {
+ ret = xmlRegCheckCharacter(atom2, atom1->codepoint);
+ if (ret < 0)
+ ret = 1;
+ }
+ break;
+ case XML_REGEXP_RANGES:
+ if (atom2->type == XML_REGEXP_RANGES) {
int i, j, res;
xmlRegRangePtr r1, r2;
@@ -2233,7 +2484,7 @@ xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
continue;
if (t2->atom != NULL) {
if (t1->to == t2->to) {
- if (xmlFACompareAtoms(t1->atom, t2->atom))
+ if (xmlFAEqualAtoms(t1->atom, t2->atom))
t2->to = -1; /* eliminated */
}
}