From 23e73571f8f6918e4ea7be3506ee5bd24ee86c52 Mon Sep 17 00:00:00 2001 From: Daniel Veillard Date: Thu, 19 Sep 2002 19:56:43 +0000 Subject: made configuring with regexps/automata/unicode the default but without * Makefile.am configure.in include/libxml/xmlversion.h.in: made configuring with regexps/automata/unicode the default but without schemas ATM * testRegexp.c valid.c xmlregexp.c include/libxml/xmlregexp.h: fixed the regexp based DTD validation performance and memory problem by switching to a compact form for determinist regexps and detecting the determinism property in the process. Seems as fast as the old DTD validation specific engine :-) despite the regexp built and compaction process. Daniel --- ChangeLog | 12 ++ Makefile.am | 2 +- configure.in | 15 +- include/libxml/xmlregexp.h | 1 + include/libxml/xmlversion.h.in | 6 +- testRegexp.c | 3 +- valid.c | 11 +- xmlregexp.c | 348 ++++++++++++++++++++++++++++++++++++++++- 8 files changed, 387 insertions(+), 11 deletions(-) diff --git a/ChangeLog b/ChangeLog index 01ca5b81..954f8318 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,15 @@ +Thu Sep 19 21:46:53 CEST 2002 Daniel Veillard + + * Makefile.am configure.in include/libxml/xmlversion.h.in: + made configuring with regexps/automata/unicode the default + but without schemas ATM + * testRegexp.c valid.c xmlregexp.c include/libxml/xmlregexp.h: + fixed the regexp based DTD validation performance and memory + problem by switching to a compact form for determinist regexps + and detecting the determinism property in the process. Seems + as fast as the old DTD validation specific engine :-) despite + the regexp built and compaction process. + Wed Sep 18 18:27:26 CEST 2002 Daniel Veillard * valid.c: determinism is debugged, new DTD checking code now works diff --git a/Makefile.am b/Makefile.am index f9d94262..7269a9e0 100644 --- a/Makefile.am +++ b/Makefile.am @@ -106,7 +106,7 @@ check-local: tests testall : tests SVGtests SAXtests -tests: XMLtests XMLenttests HTMLtests Validtests URItests XPathtests XPtrtests XIncludetests C14Ntests Scripttests Catatests @TEST_SCHEMAS@ @TEST_THREADS@ +tests: XMLtests XMLenttests HTMLtests Validtests URItests XPathtests XPtrtests XIncludetests C14Ntests Scripttests Catatests @TEST_REGEXPS@ @TEST_SCHEMAS@ @TEST_THREADS@ @(cd python ; $(MAKE) tests) valgrind: diff --git a/configure.in b/configure.in index 55e395dc..26268f78 100644 --- a/configure.in +++ b/configure.in @@ -525,7 +525,8 @@ AC_ARG_WITH(schemas, [ --with-schemas Add experimental Schemas sup if test "$with_schemas" = "yes" ; then echo Enabling Schemas support WITH_SCHEMAS=1 - TEST_SCHEMAS="Regexptests Automatatests Schemastests" + TEST_SCHEMAS="Schemastests" + with_regexps=yes else WITH_SCHEMAS=0 TEST_SCHEMAS= @@ -533,6 +534,18 @@ fi AC_SUBST(WITH_SCHEMAS) AC_SUBST(TEST_SCHEMAS) +AC_ARG_WITH(regexps, [ --with-regexps Add Regular Expressions support (on)]) +if test "$with_regexps" = "no" ; then + echo Disabling Regexps support + WITH_REGEXPS=0 + TEST_REGEXPS= +else + WITH_REGEXPS=1 + TEST_REGEXPS="Regexptests Automatatests" +fi +AC_SUBST(WITH_REGEXPS) +AC_SUBST(TEST_REGEXPS) + AC_ARG_WITH(debug, [ --with-debug Add the debugging module (on)]) if test "$with_debug" = "no" ; then echo Disabling DEBUG support diff --git a/include/libxml/xmlregexp.h b/include/libxml/xmlregexp.h index b66d4d95..48d1aa28 100644 --- a/include/libxml/xmlregexp.h +++ b/include/libxml/xmlregexp.h @@ -55,6 +55,7 @@ int xmlRegexpExec (xmlRegexpPtr comp, const xmlChar *value); void xmlRegexpPrint (FILE *output, xmlRegexpPtr regexp); +int xmlRegexpIsDeterminist(xmlRegexpPtr comp); /* * Callback function when doing a transition in the automata diff --git a/include/libxml/xmlversion.h.in b/include/libxml/xmlversion.h.in index 777e9f73..94def180 100644 --- a/include/libxml/xmlversion.h.in +++ b/include/libxml/xmlversion.h.in @@ -194,7 +194,7 @@ extern void xmlCheckVersion(int version); * * Whether the Unicode related interfaces are compiled in */ -#if @WITH_SCHEMAS@ +#if @WITH_REGEXPS@ #define LIBXML_UNICODE_ENABLED #endif @@ -203,7 +203,7 @@ extern void xmlCheckVersion(int version); * * Whether the regular expressions interfaces are compiled in */ -#if @WITH_SCHEMAS@ +#if @WITH_REGEXPS@ #define LIBXML_REGEXP_ENABLED #endif @@ -212,7 +212,7 @@ extern void xmlCheckVersion(int version); * * Whether the automata interfaces are compiled in */ -#if @WITH_SCHEMAS@ +#if @WITH_REGEXPS@ #define LIBXML_AUTOMATA_ENABLED #endif diff --git a/testRegexp.c b/testRegexp.c index a1d0d270..c0a5cbca 100644 --- a/testRegexp.c +++ b/testRegexp.c @@ -140,11 +140,12 @@ int main(int argc, char **argv) { } } } + xmlMemoryDump(); if (comp != NULL) xmlRegFreeRegexp(comp); } xmlCleanupParser(); - xmlMemoryDump(); + /* xmlMemoryDump(); */ return(0); } diff --git a/valid.c b/valid.c index 5a9a6147..fbdc7665 100644 --- a/valid.c +++ b/valid.c @@ -553,7 +553,7 @@ xmlValidBuildContentModel(xmlValidCtxtPtr ctxt, xmlElementPtr elem) { xmlValidBuildAContentModel(elem->content, ctxt, elem->name); xmlAutomataSetFinalState(ctxt->am, ctxt->state); elem->contModel = xmlAutomataCompile(ctxt->am); - if (!xmlAutomataIsDeterminist(ctxt->am)) { + if (!xmlRegexpIsDeterminist(elem->contModel)) { char expr[5000]; expr[0] = 0; xmlSnprintfElementContent(expr, 5000, elem->content, 1); @@ -3849,6 +3849,7 @@ xmlValidateSkipIgnorable(xmlNodePtr child) { return(child); } +#ifndef LIBXML_REGEXP_ENABLED /** * xmlValidateElementType: * @ctxt: the validation context @@ -4217,6 +4218,7 @@ analyze: } return(determinist); } +#endif /** * xmlSnprintfElements: @@ -4319,7 +4321,10 @@ static int xmlValidateElementContent(xmlValidCtxtPtr ctxt, xmlNodePtr child, xmlElementPtr elemDecl, int warn, xmlNodePtr parent) { int ret = 1; - xmlNodePtr repl = NULL, last = NULL, cur, tmp; +#ifndef LIBXML_REGEXP_ENABLED + xmlNodePtr last = NULL; +#endif + xmlNodePtr repl = NULL, cur, tmp; xmlElementContentPtr cont; const xmlChar *name; @@ -4561,7 +4566,9 @@ fail: if (ret == -3) ret = 1; +#ifndef LIBXML_REGEXP_ENABLED done: +#endif /* * Deallocate the copy if done, and free up the validation stack */ diff --git a/xmlregexp.c b/xmlregexp.c index 8ff73852..c4cf6812 100644 --- a/xmlregexp.c +++ b/xmlregexp.c @@ -30,6 +30,7 @@ /* #define DEBUG_REGEXP_GRAPH */ /* #define DEBUG_REGEXP_EXEC */ /* #define DEBUG_PUSH */ +/* #define DEBUG_COMPACTION */ #define ERROR(str) ctxt->error = 1; \ xmlGenericError(xmlGenericErrorContext, "Regexp: %s: %s\n", str, ctxt->cur) @@ -193,6 +194,7 @@ struct _xmlRegTrans { struct _xmlAutomataState { xmlRegStateType type; xmlRegMarkedType mark; + xmlRegMarkedType reached; int no; int maxTrans; @@ -240,6 +242,13 @@ struct _xmlRegexp { int nbCounters; xmlRegCounter *counters; int determinist; + /* + * That's the compact form for determinists automatas + */ + int nbstates; + int *compact; + int nbstrings; + xmlChar **stringMap; }; typedef struct _xmlRegExecRollback xmlRegExecRollback; @@ -299,6 +308,8 @@ struct _xmlRegExecCtxt { #define REGEXP_ALL_LAX_COUNTER 0x123457 static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top); +static void xmlRegFreeState(xmlRegStatePtr state); +static void xmlRegFreeAtom(xmlRegAtomPtr atom); /************************************************************************ * * @@ -306,6 +317,7 @@ static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top); * * ************************************************************************/ +static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt); /** * xmlRegEpxFromParse: * @ctxt: the parser context used to build it @@ -337,6 +349,166 @@ xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) { ret->counters = ctxt->counters; ctxt->counters = NULL; ret->determinist = ctxt->determinist; + + if ((ret->determinist != 0) && + (ret->nbCounters == 0) && + (ret->atoms[0] != NULL) && + (ret->atoms[0]->type == XML_REGEXP_STRING)) { + int i, j, nbstates = 0, nbatoms = 0; + int *stateRemap; + int *stringRemap; + int *transitions; + xmlChar **stringMap; + xmlChar *value; + + /* + * Switch to a compact representation + * 1/ counting the effective number of states left + * 2/ conting the unique number of atoms, and check that + * they are all of the string type + * 3/ build a table state x atom for the transitions + */ + + stateRemap = xmlMalloc(ret->nbStates * sizeof(int)); + for (i = 0;i < ret->nbStates;i++) { + if (ret->states[i] != NULL) { + stateRemap[i] = nbstates; + nbstates++; + } else { + stateRemap[i] = -1; + } + } +#ifdef DEBUG_COMPACTION + printf("Final: %d states\n", nbstates); +#endif + stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *)); + stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int)); + for (i = 0;i < ret->nbAtoms;i++) { + if ((ret->atoms[i]->type == XML_REGEXP_STRING) && + (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) { + value = ret->atoms[i]->valuep; + for (j = 0;j < nbatoms;j++) { + if (xmlStrEqual(stringMap[j], value)) { + stringRemap[i] = j; + break; + } + } + if (j >= nbatoms) { + stringRemap[i] = nbatoms; + stringMap[nbatoms] = xmlStrdup(value); + nbatoms++; + } + } else { + xmlFree(stateRemap); + xmlFree(stringRemap); + for (i = 0;i < nbatoms;i++) + xmlFree(stringMap[i]); + xmlFree(stringMap); + goto fail_compact; + } + } +#ifdef DEBUG_COMPACTION + printf("Final: %d atoms\n", nbatoms); +#endif + + /* + * Allocate the transition table. The first entry for each + * state correspond to the state type. + */ + transitions = (int *) xmlMalloc(nbstates * (nbatoms + 1) * sizeof(int)); + memset(transitions, 0, nbstates * (nbatoms + 1) * sizeof(int)); + + for (i = 0;i < ret->nbStates;i++) { + int stateno, atomno, targetno, prev; + xmlRegStatePtr state; + xmlRegTransPtr trans; + + stateno = stateRemap[i]; + if (stateno == -1) + continue; + state = ret->states[i]; + + transitions[stateno * (nbatoms + 1)] = state->type; + + for (j = 0;j < state->nbTrans;j++) { + trans = &(state->trans[j]); + if ((trans->to == -1) || (trans->atom == NULL)) + continue; + atomno = stringRemap[trans->atom->no]; + targetno = stateRemap[trans->to]; + /* + * if the same atome can generate transition to 2 different + * states then it means the automata is not determinist and + * the compact form can't be used ! + */ + prev = transitions[stateno * (nbatoms + 1) + atomno + 1]; + if (prev != 0) { + if (prev != targetno + 1) { + printf("not determinist\n"); + ret->determinist = 0; +#ifdef DEBUG_COMPACTION + printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n", + i, j, trans->atom->no, trans->to, atomno, targetno); + printf(" previous to is %d\n", prev); +#endif + ret->determinist = 0; + xmlFree(transitions); + xmlFree(stateRemap); + xmlFree(stringRemap); + for (i = 0;i < nbatoms;i++) + xmlFree(stringMap[i]); + xmlFree(stringMap); + goto fail_compact; + } + } else { +#if 0 + printf("State %d trans %d: atom %d to %d : %d to %d\n", + i, j, trans->atom->no, trans->to, atomno, targetno); +#endif + transitions[stateno * (nbatoms + 1) + atomno + 1] = + targetno + 1;; /* to avoid 0 */ + } + } + } + ret->determinist = 1; +#ifdef DEBUG_COMPACTION + /* + * Debug + */ + for (i = 0;i < nbstates;i++) { + for (j = 0;j < nbatoms + 1;j++) { + printf("%02d ", transitions[i * (nbatoms + 1) + j]); + } + printf("\n"); + } + printf("\n"); +#endif + /* + * Cleanup of the old data + */ + if (ret->states != NULL) { + for (i = 0;i < ret->nbStates;i++) + xmlRegFreeState(ret->states[i]); + xmlFree(ret->states); + } + ret->states = NULL; + ret->nbStates = 0; + if (ret->atoms != NULL) { + for (i = 0;i < ret->nbAtoms;i++) + xmlRegFreeAtom(ret->atoms[i]); + xmlFree(ret->atoms); + } + ret->atoms = NULL; + ret->nbAtoms = 0; + + ret->compact = transitions; + ret->stringMap = stringMap; + ret->nbstrings = nbatoms; + ret->nbstates = nbstates; + xmlFree(stateRemap); + xmlFree(stringRemap); + } +fail_compact: return(ret); } @@ -741,6 +913,7 @@ xmlRegPrintState(FILE *output, xmlRegStatePtr state) { } } +#ifdef DEBUG_REGEXP_GRAPH static void xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) { int i; @@ -780,6 +953,7 @@ xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) { ctxt->counters[i].max); } } +#endif /************************************************************************ * * @@ -1228,7 +1402,6 @@ xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr, xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom, ctxt->states[newto], counter, -1); } - } } to->mark = XML_REGEXP_MARK_NORMAL; @@ -1296,6 +1469,63 @@ xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) { } } } + + /* + * Use this pass to detect unreachable states too + */ + for (statenr = 0;statenr < ctxt->nbStates;statenr++) { + state = ctxt->states[statenr]; + if (state != NULL) + state->reached = 0; + } + state = ctxt->states[0]; + if (state != NULL) + state->reached = 1; + while (state != NULL) { + xmlRegStatePtr target = NULL; + state->reached = 2; + /* + * Mark all state reachable from the current reachable state + */ + for (transnr = 0;transnr < state->nbTrans;transnr++) { + if ((state->trans[transnr].to >= 0) && + ((state->trans[transnr].atom != NULL) || + (state->trans[transnr].count >= 0))) { + int newto = state->trans[transnr].to; + + if (ctxt->states[newto] == NULL) + continue; + if (ctxt->states[newto]->reached == 0) { + ctxt->states[newto]->reached = 1; + target = ctxt->states[newto]; + } + } + } + /* + * find the next accessible state not explored + */ + if (target == NULL) { + for (statenr = 1;statenr < ctxt->nbStates;statenr++) { + state = ctxt->states[statenr]; + if ((state != NULL) && (state->reached == 1)) { + target = state; + break; + } + } + } + state = target; + } + for (statenr = 0;statenr < ctxt->nbStates;statenr++) { + state = ctxt->states[statenr]; + if ((state != NULL) && (state->reached == 0)) { +#ifdef DEBUG_REGEXP_GRAPH + printf("Removed unreachable state %d\n", statenr); +#endif + xmlRegFreeState(state); + ctxt->states[statenr] = NULL; + } + } + } /** @@ -2041,7 +2271,8 @@ xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) { exec->rollbacks = NULL; exec->status = 0; exec->comp = comp; - exec->state = comp->states[0]; + if (comp->compact == NULL) + exec->state = comp->states[0]; exec->transno = 0; exec->transcount = 0; exec->callback = callback; @@ -2130,6 +2361,73 @@ xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value, } +/** + * xmlRegCompactPushString: + * @exec: a regexp execution context + * @comp: the precompiled exec with a compact table + * @value: a string token input + * @data: data associated to the token to reuse in callbacks + * + * Push one input token in the execution context + * + * Returns: 1 if the regexp reached a final state, 0 if non-final, and + * a negative value in case of error. + */ +static int +xmlRegCompactPushString(xmlRegExecCtxtPtr exec, + xmlRegexpPtr comp, + const xmlChar *value, + void *data) { + int state = exec->index; + int i, target; + + if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL)) + return(-1); + + if (value == NULL) { + /* + * are we at a final state ? + */ + if (comp->compact[state * (comp->nbstrings + 1)] == + XML_REGEXP_FINAL_STATE) + return(1); + return(0); + } + +#ifdef DEBUG_PUSH + printf("value pushed: %s\n", value); +#endif + + /* + * Examine all outside transition from current state + */ + for (i = 0;i < comp->nbstrings;i++) { + target = comp->compact[state * (comp->nbstrings + 1) + i + 1]; + if ((target > 0) && (target <= comp->nbstates)) { + target--; /* to avoid 0 */ + if (xmlStrEqual(comp->stringMap[i], value)) { + exec->index = target; +#ifdef DEBUG_PUSH + printf("entering state %d\n", target); +#endif + if (comp->compact[target * (comp->nbstrings + 1)] == + XML_REGEXP_FINAL_STATE) + return(1); + return(0); + } + } + } + /* + * Failed to find an exit transition out from current state for the + * current token + */ +#ifdef DEBUG_PUSH + printf("failed to find a transition for %s on state %d\n", value, state); +#endif + exec->status = -1; + return(-1); +} + /** * xmlRegExecPushString: * @exec: a regexp execution context @@ -2151,9 +2449,14 @@ xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value, if (exec == NULL) return(-1); + if (exec->comp == NULL) + return(-1); if (exec->status != 0) return(exec->status); + if (exec->comp->compact != NULL) + return(xmlRegCompactPushString(exec, exec->comp, value, data)); + if (value == NULL) { if (exec->state->type == XML_REGEXP_FINAL_STATE) return(1); @@ -3509,6 +3812,37 @@ xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) { return(xmlFARegExec(comp, content)); } +/** + * xmlRegexpIsDeterminist: + * @comp: the compiled regular expression + * + * Check if the regular expression is determinist + * + * Returns 1 if it yes, 0 if not and a negativa value in case of error + */ +int +xmlRegexpIsDeterminist(xmlRegexpPtr comp) { + xmlAutomataPtr am; + int ret; + + if (comp == NULL) + return(-1); + if (comp->determinist != -1) + return(comp->determinist); + + am = xmlNewAutomata(); + am->nbAtoms = comp->nbAtoms; + am->atoms = comp->atoms; + am->nbStates = comp->nbStates; + am->states = comp->states; + am->determinist = -1; + ret = xmlFAComputesDeterminism(am); + am->atoms = NULL; + am->states = NULL; + xmlFreeAutomata(am); + return(ret); +} + /** * xmlRegFreeRegexp: * @regexp: the regexp @@ -3535,6 +3869,14 @@ xmlRegFreeRegexp(xmlRegexpPtr regexp) { } if (regexp->counters != NULL) xmlFree(regexp->counters); + if (regexp->compact != NULL) + xmlFree(regexp->compact); + if (regexp->stringMap != NULL) { + for (i = 0; i < regexp->nbstrings;i++) + xmlFree(regexp->stringMap[i]); + xmlFree(regexp->stringMap); + } + xmlFree(regexp); } @@ -3910,7 +4252,7 @@ xmlAutomataCompile(xmlAutomataPtr am) { xmlRegexpPtr ret; xmlFAEliminateEpsilonTransitions(am); - xmlFAComputesDeterminism(am); + /* xmlFAComputesDeterminism(am); */ ret = xmlRegEpxFromParse(am); return(ret); -- cgit v1.2.3