aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDaniel Veillard <veillard@src.gnome.org>2002-09-19 19:56:43 +0000
committerDaniel Veillard <veillard@src.gnome.org>2002-09-19 19:56:43 +0000
commit23e73571f8f6918e4ea7be3506ee5bd24ee86c52 (patch)
tree82576c18687ffe918aeb12801b667651b34e10d8
parent5acfd6b58a46fb71c8683df62b6710497f27df27 (diff)
downloadandroid_external_libxml2-23e73571f8f6918e4ea7be3506ee5bd24ee86c52.tar.gz
android_external_libxml2-23e73571f8f6918e4ea7be3506ee5bd24ee86c52.tar.bz2
android_external_libxml2-23e73571f8f6918e4ea7be3506ee5bd24ee86c52.zip
made configuring with regexps/automata/unicode the default but without
* Makefile.am configure.in include/libxml/xmlversion.h.in: made configuring with regexps/automata/unicode the default but without schemas ATM * testRegexp.c valid.c xmlregexp.c include/libxml/xmlregexp.h: fixed the regexp based DTD validation performance and memory problem by switching to a compact form for determinist regexps and detecting the determinism property in the process. Seems as fast as the old DTD validation specific engine :-) despite the regexp built and compaction process. Daniel
-rw-r--r--ChangeLog12
-rw-r--r--Makefile.am2
-rw-r--r--configure.in15
-rw-r--r--include/libxml/xmlregexp.h1
-rw-r--r--include/libxml/xmlversion.h.in6
-rw-r--r--testRegexp.c3
-rw-r--r--valid.c11
-rw-r--r--xmlregexp.c348
8 files changed, 387 insertions, 11 deletions
diff --git a/ChangeLog b/ChangeLog
index 01ca5b81..954f8318 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,15 @@
+Thu Sep 19 21:46:53 CEST 2002 Daniel Veillard <daniel@veillard.com>
+
+ * Makefile.am configure.in include/libxml/xmlversion.h.in:
+ made configuring with regexps/automata/unicode the default
+ but without schemas ATM
+ * testRegexp.c valid.c xmlregexp.c include/libxml/xmlregexp.h:
+ fixed the regexp based DTD validation performance and memory
+ problem by switching to a compact form for determinist regexps
+ and detecting the determinism property in the process. Seems
+ as fast as the old DTD validation specific engine :-) despite
+ the regexp built and compaction process.
+
Wed Sep 18 18:27:26 CEST 2002 Daniel Veillard <daniel@veillard.com>
* valid.c: determinism is debugged, new DTD checking code now works
diff --git a/Makefile.am b/Makefile.am
index f9d94262..7269a9e0 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -106,7 +106,7 @@ check-local: tests
testall : tests SVGtests SAXtests
-tests: XMLtests XMLenttests HTMLtests Validtests URItests XPathtests XPtrtests XIncludetests C14Ntests Scripttests Catatests @TEST_SCHEMAS@ @TEST_THREADS@
+tests: XMLtests XMLenttests HTMLtests Validtests URItests XPathtests XPtrtests XIncludetests C14Ntests Scripttests Catatests @TEST_REGEXPS@ @TEST_SCHEMAS@ @TEST_THREADS@
@(cd python ; $(MAKE) tests)
valgrind:
diff --git a/configure.in b/configure.in
index 55e395dc..26268f78 100644
--- a/configure.in
+++ b/configure.in
@@ -525,7 +525,8 @@ AC_ARG_WITH(schemas, [ --with-schemas Add experimental Schemas sup
if test "$with_schemas" = "yes" ; then
echo Enabling Schemas support
WITH_SCHEMAS=1
- TEST_SCHEMAS="Regexptests Automatatests Schemastests"
+ TEST_SCHEMAS="Schemastests"
+ with_regexps=yes
else
WITH_SCHEMAS=0
TEST_SCHEMAS=
@@ -533,6 +534,18 @@ fi
AC_SUBST(WITH_SCHEMAS)
AC_SUBST(TEST_SCHEMAS)
+AC_ARG_WITH(regexps, [ --with-regexps Add Regular Expressions support (on)])
+if test "$with_regexps" = "no" ; then
+ echo Disabling Regexps support
+ WITH_REGEXPS=0
+ TEST_REGEXPS=
+else
+ WITH_REGEXPS=1
+ TEST_REGEXPS="Regexptests Automatatests"
+fi
+AC_SUBST(WITH_REGEXPS)
+AC_SUBST(TEST_REGEXPS)
+
AC_ARG_WITH(debug, [ --with-debug Add the debugging module (on)])
if test "$with_debug" = "no" ; then
echo Disabling DEBUG support
diff --git a/include/libxml/xmlregexp.h b/include/libxml/xmlregexp.h
index b66d4d95..48d1aa28 100644
--- a/include/libxml/xmlregexp.h
+++ b/include/libxml/xmlregexp.h
@@ -55,6 +55,7 @@ int xmlRegexpExec (xmlRegexpPtr comp,
const xmlChar *value);
void xmlRegexpPrint (FILE *output,
xmlRegexpPtr regexp);
+int xmlRegexpIsDeterminist(xmlRegexpPtr comp);
/*
* Callback function when doing a transition in the automata
diff --git a/include/libxml/xmlversion.h.in b/include/libxml/xmlversion.h.in
index 777e9f73..94def180 100644
--- a/include/libxml/xmlversion.h.in
+++ b/include/libxml/xmlversion.h.in
@@ -194,7 +194,7 @@ extern void xmlCheckVersion(int version);
*
* Whether the Unicode related interfaces are compiled in
*/
-#if @WITH_SCHEMAS@
+#if @WITH_REGEXPS@
#define LIBXML_UNICODE_ENABLED
#endif
@@ -203,7 +203,7 @@ extern void xmlCheckVersion(int version);
*
* Whether the regular expressions interfaces are compiled in
*/
-#if @WITH_SCHEMAS@
+#if @WITH_REGEXPS@
#define LIBXML_REGEXP_ENABLED
#endif
@@ -212,7 +212,7 @@ extern void xmlCheckVersion(int version);
*
* Whether the automata interfaces are compiled in
*/
-#if @WITH_SCHEMAS@
+#if @WITH_REGEXPS@
#define LIBXML_AUTOMATA_ENABLED
#endif
diff --git a/testRegexp.c b/testRegexp.c
index a1d0d270..c0a5cbca 100644
--- a/testRegexp.c
+++ b/testRegexp.c
@@ -140,11 +140,12 @@ int main(int argc, char **argv) {
}
}
}
+ xmlMemoryDump();
if (comp != NULL)
xmlRegFreeRegexp(comp);
}
xmlCleanupParser();
- xmlMemoryDump();
+ /* xmlMemoryDump(); */
return(0);
}
diff --git a/valid.c b/valid.c
index 5a9a6147..fbdc7665 100644
--- a/valid.c
+++ b/valid.c
@@ -553,7 +553,7 @@ xmlValidBuildContentModel(xmlValidCtxtPtr ctxt, xmlElementPtr elem) {
xmlValidBuildAContentModel(elem->content, ctxt, elem->name);
xmlAutomataSetFinalState(ctxt->am, ctxt->state);
elem->contModel = xmlAutomataCompile(ctxt->am);
- if (!xmlAutomataIsDeterminist(ctxt->am)) {
+ if (!xmlRegexpIsDeterminist(elem->contModel)) {
char expr[5000];
expr[0] = 0;
xmlSnprintfElementContent(expr, 5000, elem->content, 1);
@@ -3849,6 +3849,7 @@ xmlValidateSkipIgnorable(xmlNodePtr child) {
return(child);
}
+#ifndef LIBXML_REGEXP_ENABLED
/**
* xmlValidateElementType:
* @ctxt: the validation context
@@ -4217,6 +4218,7 @@ analyze:
}
return(determinist);
}
+#endif
/**
* xmlSnprintfElements:
@@ -4319,7 +4321,10 @@ static int
xmlValidateElementContent(xmlValidCtxtPtr ctxt, xmlNodePtr child,
xmlElementPtr elemDecl, int warn, xmlNodePtr parent) {
int ret = 1;
- xmlNodePtr repl = NULL, last = NULL, cur, tmp;
+#ifndef LIBXML_REGEXP_ENABLED
+ xmlNodePtr last = NULL;
+#endif
+ xmlNodePtr repl = NULL, cur, tmp;
xmlElementContentPtr cont;
const xmlChar *name;
@@ -4561,7 +4566,9 @@ fail:
if (ret == -3)
ret = 1;
+#ifndef LIBXML_REGEXP_ENABLED
done:
+#endif
/*
* Deallocate the copy if done, and free up the validation stack
*/
diff --git a/xmlregexp.c b/xmlregexp.c
index 8ff73852..c4cf6812 100644
--- a/xmlregexp.c
+++ b/xmlregexp.c
@@ -30,6 +30,7 @@
/* #define DEBUG_REGEXP_GRAPH */
/* #define DEBUG_REGEXP_EXEC */
/* #define DEBUG_PUSH */
+/* #define DEBUG_COMPACTION */
#define ERROR(str) ctxt->error = 1; \
xmlGenericError(xmlGenericErrorContext, "Regexp: %s: %s\n", str, ctxt->cur)
@@ -193,6 +194,7 @@ struct _xmlRegTrans {
struct _xmlAutomataState {
xmlRegStateType type;
xmlRegMarkedType mark;
+ xmlRegMarkedType reached;
int no;
int maxTrans;
@@ -240,6 +242,13 @@ struct _xmlRegexp {
int nbCounters;
xmlRegCounter *counters;
int determinist;
+ /*
+ * That's the compact form for determinists automatas
+ */
+ int nbstates;
+ int *compact;
+ int nbstrings;
+ xmlChar **stringMap;
};
typedef struct _xmlRegExecRollback xmlRegExecRollback;
@@ -299,6 +308,8 @@ struct _xmlRegExecCtxt {
#define REGEXP_ALL_LAX_COUNTER 0x123457
static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
+static void xmlRegFreeState(xmlRegStatePtr state);
+static void xmlRegFreeAtom(xmlRegAtomPtr atom);
/************************************************************************
* *
@@ -306,6 +317,7 @@ static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
* *
************************************************************************/
+static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
/**
* xmlRegEpxFromParse:
* @ctxt: the parser context used to build it
@@ -337,6 +349,166 @@ xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
ret->counters = ctxt->counters;
ctxt->counters = NULL;
ret->determinist = ctxt->determinist;
+
+ if ((ret->determinist != 0) &&
+ (ret->nbCounters == 0) &&
+ (ret->atoms[0] != NULL) &&
+ (ret->atoms[0]->type == XML_REGEXP_STRING)) {
+ int i, j, nbstates = 0, nbatoms = 0;
+ int *stateRemap;
+ int *stringRemap;
+ int *transitions;
+ xmlChar **stringMap;
+ xmlChar *value;
+
+ /*
+ * Switch to a compact representation
+ * 1/ counting the effective number of states left
+ * 2/ conting the unique number of atoms, and check that
+ * they are all of the string type
+ * 3/ build a table state x atom for the transitions
+ */
+
+ stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
+ for (i = 0;i < ret->nbStates;i++) {
+ if (ret->states[i] != NULL) {
+ stateRemap[i] = nbstates;
+ nbstates++;
+ } else {
+ stateRemap[i] = -1;
+ }
+ }
+#ifdef DEBUG_COMPACTION
+ printf("Final: %d states\n", nbstates);
+#endif
+ stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
+ stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
+ for (i = 0;i < ret->nbAtoms;i++) {
+ if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
+ (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
+ value = ret->atoms[i]->valuep;
+ for (j = 0;j < nbatoms;j++) {
+ if (xmlStrEqual(stringMap[j], value)) {
+ stringRemap[i] = j;
+ break;
+ }
+ }
+ if (j >= nbatoms) {
+ stringRemap[i] = nbatoms;
+ stringMap[nbatoms] = xmlStrdup(value);
+ nbatoms++;
+ }
+ } else {
+ xmlFree(stateRemap);
+ xmlFree(stringRemap);
+ for (i = 0;i < nbatoms;i++)
+ xmlFree(stringMap[i]);
+ xmlFree(stringMap);
+ goto fail_compact;
+ }
+ }
+#ifdef DEBUG_COMPACTION
+ printf("Final: %d atoms\n", nbatoms);
+#endif
+
+ /*
+ * Allocate the transition table. The first entry for each
+ * state correspond to the state type.
+ */
+ transitions = (int *) xmlMalloc(nbstates * (nbatoms + 1) * sizeof(int));
+ memset(transitions, 0, nbstates * (nbatoms + 1) * sizeof(int));
+
+ for (i = 0;i < ret->nbStates;i++) {
+ int stateno, atomno, targetno, prev;
+ xmlRegStatePtr state;
+ xmlRegTransPtr trans;
+
+ stateno = stateRemap[i];
+ if (stateno == -1)
+ continue;
+ state = ret->states[i];
+
+ transitions[stateno * (nbatoms + 1)] = state->type;
+
+ for (j = 0;j < state->nbTrans;j++) {
+ trans = &(state->trans[j]);
+ if ((trans->to == -1) || (trans->atom == NULL))
+ continue;
+ atomno = stringRemap[trans->atom->no];
+ targetno = stateRemap[trans->to];
+ /*
+ * if the same atome can generate transition to 2 different
+ * states then it means the automata is not determinist and
+ * the compact form can't be used !
+ */
+ prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
+ if (prev != 0) {
+ if (prev != targetno + 1) {
+ printf("not determinist\n");
+ ret->determinist = 0;
+#ifdef DEBUG_COMPACTION
+ printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
+ i, j, trans->atom->no, trans->to, atomno, targetno);
+ printf(" previous to is %d\n", prev);
+#endif
+ ret->determinist = 0;
+ xmlFree(transitions);
+ xmlFree(stateRemap);
+ xmlFree(stringRemap);
+ for (i = 0;i < nbatoms;i++)
+ xmlFree(stringMap[i]);
+ xmlFree(stringMap);
+ goto fail_compact;
+ }
+ } else {
+#if 0
+ printf("State %d trans %d: atom %d to %d : %d to %d\n",
+ i, j, trans->atom->no, trans->to, atomno, targetno);
+#endif
+ transitions[stateno * (nbatoms + 1) + atomno + 1] =
+ targetno + 1;; /* to avoid 0 */
+ }
+ }
+ }
+ ret->determinist = 1;
+#ifdef DEBUG_COMPACTION
+ /*
+ * Debug
+ */
+ for (i = 0;i < nbstates;i++) {
+ for (j = 0;j < nbatoms + 1;j++) {
+ printf("%02d ", transitions[i * (nbatoms + 1) + j]);
+ }
+ printf("\n");
+ }
+ printf("\n");
+#endif
+ /*
+ * Cleanup of the old data
+ */
+ if (ret->states != NULL) {
+ for (i = 0;i < ret->nbStates;i++)
+ xmlRegFreeState(ret->states[i]);
+ xmlFree(ret->states);
+ }
+ ret->states = NULL;
+ ret->nbStates = 0;
+ if (ret->atoms != NULL) {
+ for (i = 0;i < ret->nbAtoms;i++)
+ xmlRegFreeAtom(ret->atoms[i]);
+ xmlFree(ret->atoms);
+ }
+ ret->atoms = NULL;
+ ret->nbAtoms = 0;
+
+ ret->compact = transitions;
+ ret->stringMap = stringMap;
+ ret->nbstrings = nbatoms;
+ ret->nbstates = nbstates;
+ xmlFree(stateRemap);
+ xmlFree(stringRemap);
+ }
+fail_compact:
return(ret);
}
@@ -741,6 +913,7 @@ xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
}
}
+#ifdef DEBUG_REGEXP_GRAPH
static void
xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
int i;
@@ -780,6 +953,7 @@ xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
ctxt->counters[i].max);
}
}
+#endif
/************************************************************************
* *
@@ -1228,7 +1402,6 @@ xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
ctxt->states[newto], counter, -1);
}
-
}
}
to->mark = XML_REGEXP_MARK_NORMAL;
@@ -1296,6 +1469,63 @@ xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
}
}
}
+
+ /*
+ * Use this pass to detect unreachable states too
+ */
+ for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
+ state = ctxt->states[statenr];
+ if (state != NULL)
+ state->reached = 0;
+ }
+ state = ctxt->states[0];
+ if (state != NULL)
+ state->reached = 1;
+ while (state != NULL) {
+ xmlRegStatePtr target = NULL;
+ state->reached = 2;
+ /*
+ * Mark all state reachable from the current reachable state
+ */
+ for (transnr = 0;transnr < state->nbTrans;transnr++) {
+ if ((state->trans[transnr].to >= 0) &&
+ ((state->trans[transnr].atom != NULL) ||
+ (state->trans[transnr].count >= 0))) {
+ int newto = state->trans[transnr].to;
+
+ if (ctxt->states[newto] == NULL)
+ continue;
+ if (ctxt->states[newto]->reached == 0) {
+ ctxt->states[newto]->reached = 1;
+ target = ctxt->states[newto];
+ }
+ }
+ }
+ /*
+ * find the next accessible state not explored
+ */
+ if (target == NULL) {
+ for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
+ state = ctxt->states[statenr];
+ if ((state != NULL) && (state->reached == 1)) {
+ target = state;
+ break;
+ }
+ }
+ }
+ state = target;
+ }
+ for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
+ state = ctxt->states[statenr];
+ if ((state != NULL) && (state->reached == 0)) {
+#ifdef DEBUG_REGEXP_GRAPH
+ printf("Removed unreachable state %d\n", statenr);
+#endif
+ xmlRegFreeState(state);
+ ctxt->states[statenr] = NULL;
+ }
+ }
+
}
/**
@@ -2041,7 +2271,8 @@ xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
exec->rollbacks = NULL;
exec->status = 0;
exec->comp = comp;
- exec->state = comp->states[0];
+ if (comp->compact == NULL)
+ exec->state = comp->states[0];
exec->transno = 0;
exec->transcount = 0;
exec->callback = callback;
@@ -2131,6 +2362,73 @@ xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
/**
+ * xmlRegCompactPushString:
+ * @exec: a regexp execution context
+ * @comp: the precompiled exec with a compact table
+ * @value: a string token input
+ * @data: data associated to the token to reuse in callbacks
+ *
+ * Push one input token in the execution context
+ *
+ * Returns: 1 if the regexp reached a final state, 0 if non-final, and
+ * a negative value in case of error.
+ */
+static int
+xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
+ xmlRegexpPtr comp,
+ const xmlChar *value,
+ void *data) {
+ int state = exec->index;
+ int i, target;
+
+ if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
+ return(-1);
+
+ if (value == NULL) {
+ /*
+ * are we at a final state ?
+ */
+ if (comp->compact[state * (comp->nbstrings + 1)] ==
+ XML_REGEXP_FINAL_STATE)
+ return(1);
+ return(0);
+ }
+
+#ifdef DEBUG_PUSH
+ printf("value pushed: %s\n", value);
+#endif
+
+ /*
+ * Examine all outside transition from current state
+ */
+ for (i = 0;i < comp->nbstrings;i++) {
+ target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
+ if ((target > 0) && (target <= comp->nbstates)) {
+ target--; /* to avoid 0 */
+ if (xmlStrEqual(comp->stringMap[i], value)) {
+ exec->index = target;
+#ifdef DEBUG_PUSH
+ printf("entering state %d\n", target);
+#endif
+ if (comp->compact[target * (comp->nbstrings + 1)] ==
+ XML_REGEXP_FINAL_STATE)
+ return(1);
+ return(0);
+ }
+ }
+ }
+ /*
+ * Failed to find an exit transition out from current state for the
+ * current token
+ */
+#ifdef DEBUG_PUSH
+ printf("failed to find a transition for %s on state %d\n", value, state);
+#endif
+ exec->status = -1;
+ return(-1);
+}
+
+/**
* xmlRegExecPushString:
* @exec: a regexp execution context
* @value: a string token input
@@ -2151,9 +2449,14 @@ xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
if (exec == NULL)
return(-1);
+ if (exec->comp == NULL)
+ return(-1);
if (exec->status != 0)
return(exec->status);
+ if (exec->comp->compact != NULL)
+ return(xmlRegCompactPushString(exec, exec->comp, value, data));
+
if (value == NULL) {
if (exec->state->type == XML_REGEXP_FINAL_STATE)
return(1);
@@ -3510,6 +3813,37 @@ xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
}
/**
+ * xmlRegexpIsDeterminist:
+ * @comp: the compiled regular expression
+ *
+ * Check if the regular expression is determinist
+ *
+ * Returns 1 if it yes, 0 if not and a negativa value in case of error
+ */
+int
+xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
+ xmlAutomataPtr am;
+ int ret;
+
+ if (comp == NULL)
+ return(-1);
+ if (comp->determinist != -1)
+ return(comp->determinist);
+
+ am = xmlNewAutomata();
+ am->nbAtoms = comp->nbAtoms;
+ am->atoms = comp->atoms;
+ am->nbStates = comp->nbStates;
+ am->states = comp->states;
+ am->determinist = -1;
+ ret = xmlFAComputesDeterminism(am);
+ am->atoms = NULL;
+ am->states = NULL;
+ xmlFreeAutomata(am);
+ return(ret);
+}
+
+/**
* xmlRegFreeRegexp:
* @regexp: the regexp
*
@@ -3535,6 +3869,14 @@ xmlRegFreeRegexp(xmlRegexpPtr regexp) {
}
if (regexp->counters != NULL)
xmlFree(regexp->counters);
+ if (regexp->compact != NULL)
+ xmlFree(regexp->compact);
+ if (regexp->stringMap != NULL) {
+ for (i = 0; i < regexp->nbstrings;i++)
+ xmlFree(regexp->stringMap[i]);
+ xmlFree(regexp->stringMap);
+ }
+
xmlFree(regexp);
}
@@ -3910,7 +4252,7 @@ xmlAutomataCompile(xmlAutomataPtr am) {
xmlRegexpPtr ret;
xmlFAEliminateEpsilonTransitions(am);
- xmlFAComputesDeterminism(am);
+ /* xmlFAComputesDeterminism(am); */
ret = xmlRegEpxFromParse(am);
return(ret);