• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * regexp.c: generic and extensible Regular Expression engine
3  *
4  * Basically designed with the purpose of compiling regexps for
5  * the variety of validation/schemas mechanisms now available in
6  * XML related specifications these include:
7  *    - XML-1.0 DTD validation
8  *    - XML Schemas structure part 1
9  *    - XML Schemas Datatypes part 2 especially Appendix F
10  *    - RELAX-NG/TREX i.e. the counter proposal
11  *
12  * See Copyright for the status of this software.
13  *
14  * Daniel Veillard <veillard@redhat.com>
15  */
16 
17 #define IN_LIBXML
18 #include "libxml.h"
19 
20 #ifdef LIBXML_REGEXP_ENABLED
21 
22 /* #define DEBUG_ERR */
23 
24 #include <stdio.h>
25 #include <string.h>
26 #include <limits.h>
27 
28 #include <libxml/tree.h>
29 #include <libxml/parserInternals.h>
30 #include <libxml/xmlregexp.h>
31 #include <libxml/xmlautomata.h>
32 #include <libxml/xmlunicode.h>
33 
34 #include "private/error.h"
35 #include "private/regexp.h"
36 
37 #ifndef SIZE_MAX
38 #define SIZE_MAX ((size_t) -1)
39 #endif
40 
41 /* #define DEBUG_REGEXP_GRAPH */
42 /* #define DEBUG_REGEXP_EXEC */
43 /* #define DEBUG_PUSH */
44 /* #define DEBUG_COMPACTION */
45 
46 #define MAX_PUSH 10000000
47 
48 #ifdef ERROR
49 #undef ERROR
50 #endif
/*
 * ERROR:
 * @str:  the error message
 *
 * Flag a compilation failure on the `ctxt` variable in scope and report it.
 * The expansion is a single call on purpose: with two statements, an
 * unbraced `if (cond) ERROR("...");` would only guard the first one.
 * xmlRegexpErrCompile() stores XML_REGEXP_COMPILE_ERROR in ctxt->error
 * itself, so no separate assignment is needed here.
 * The trailing semicolon is kept for compatibility with existing call sites.
 */
#define ERROR(str)							\
    xmlRegexpErrCompile(ctxt, str);
54 #define NEXT ctxt->cur++
55 #define CUR (*(ctxt->cur))
56 #define NXT(index) (ctxt->cur[index])
57 
58 #define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
59 #define NEXTL(l) ctxt->cur += l;
60 #define XML_REG_STRING_SEPARATOR '|'
61 /*
62  * Need PREV to check on a '-' within a Character Group. May only be used
63  * when it's guaranteed that cur is not at the beginning of ctxt->string!
64  */
65 #define PREV (ctxt->cur[-1])
66 
67 /**
68  * TODO:
69  *
70  * macro to flag unimplemented blocks
71  */
72 #define TODO								\
73     xmlGenericError(xmlGenericErrorContext,				\
74 	    "Unimplemented block at %s:%d\n",				\
75             __FILE__, __LINE__);
76 
77 /************************************************************************
78  *									*
79  *			Datatypes and structures			*
80  *									*
81  ************************************************************************/
82 
83 /*
84  * Note: the order of the enums below is significant, do not shuffle
85  */
86 typedef enum {
87     XML_REGEXP_EPSILON = 1,
88     XML_REGEXP_CHARVAL,
89     XML_REGEXP_RANGES,
90     XML_REGEXP_SUBREG,  /* used for () sub regexps */
91     XML_REGEXP_STRING,
92     XML_REGEXP_ANYCHAR, /* . */
93     XML_REGEXP_ANYSPACE, /* \s */
94     XML_REGEXP_NOTSPACE, /* \S */
95     XML_REGEXP_INITNAME, /* \l */
96     XML_REGEXP_NOTINITNAME, /* \L */
97     XML_REGEXP_NAMECHAR, /* \c */
98     XML_REGEXP_NOTNAMECHAR, /* \C */
99     XML_REGEXP_DECIMAL, /* \d */
100     XML_REGEXP_NOTDECIMAL, /* \D */
101     XML_REGEXP_REALCHAR, /* \w */
102     XML_REGEXP_NOTREALCHAR, /* \W */
103     XML_REGEXP_LETTER = 100,
104     XML_REGEXP_LETTER_UPPERCASE,
105     XML_REGEXP_LETTER_LOWERCASE,
106     XML_REGEXP_LETTER_TITLECASE,
107     XML_REGEXP_LETTER_MODIFIER,
108     XML_REGEXP_LETTER_OTHERS,
109     XML_REGEXP_MARK,
110     XML_REGEXP_MARK_NONSPACING,
111     XML_REGEXP_MARK_SPACECOMBINING,
112     XML_REGEXP_MARK_ENCLOSING,
113     XML_REGEXP_NUMBER,
114     XML_REGEXP_NUMBER_DECIMAL,
115     XML_REGEXP_NUMBER_LETTER,
116     XML_REGEXP_NUMBER_OTHERS,
117     XML_REGEXP_PUNCT,
118     XML_REGEXP_PUNCT_CONNECTOR,
119     XML_REGEXP_PUNCT_DASH,
120     XML_REGEXP_PUNCT_OPEN,
121     XML_REGEXP_PUNCT_CLOSE,
122     XML_REGEXP_PUNCT_INITQUOTE,
123     XML_REGEXP_PUNCT_FINQUOTE,
124     XML_REGEXP_PUNCT_OTHERS,
125     XML_REGEXP_SEPAR,
126     XML_REGEXP_SEPAR_SPACE,
127     XML_REGEXP_SEPAR_LINE,
128     XML_REGEXP_SEPAR_PARA,
129     XML_REGEXP_SYMBOL,
130     XML_REGEXP_SYMBOL_MATH,
131     XML_REGEXP_SYMBOL_CURRENCY,
132     XML_REGEXP_SYMBOL_MODIFIER,
133     XML_REGEXP_SYMBOL_OTHERS,
134     XML_REGEXP_OTHER,
135     XML_REGEXP_OTHER_CONTROL,
136     XML_REGEXP_OTHER_FORMAT,
137     XML_REGEXP_OTHER_PRIVATE,
138     XML_REGEXP_OTHER_NA,
139     XML_REGEXP_BLOCK_NAME
140 } xmlRegAtomType;
141 
142 typedef enum {
143     XML_REGEXP_QUANT_EPSILON = 1,
144     XML_REGEXP_QUANT_ONCE,
145     XML_REGEXP_QUANT_OPT,
146     XML_REGEXP_QUANT_MULT,
147     XML_REGEXP_QUANT_PLUS,
148     XML_REGEXP_QUANT_ONCEONLY,
149     XML_REGEXP_QUANT_ALL,
150     XML_REGEXP_QUANT_RANGE
151 } xmlRegQuantType;
152 
153 typedef enum {
154     XML_REGEXP_START_STATE = 1,
155     XML_REGEXP_FINAL_STATE,
156     XML_REGEXP_TRANS_STATE,
157     XML_REGEXP_SINK_STATE,
158     XML_REGEXP_UNREACH_STATE
159 } xmlRegStateType;
160 
161 typedef enum {
162     XML_REGEXP_MARK_NORMAL = 0,
163     XML_REGEXP_MARK_START,
164     XML_REGEXP_MARK_VISITED
165 } xmlRegMarkedType;
166 
167 typedef struct _xmlRegRange xmlRegRange;
168 typedef xmlRegRange *xmlRegRangePtr;
169 
/*
 * One range entry of an atom (see the ranges array of struct _xmlRegAtom).
 * Instances are created by xmlRegNewRange(), duplicated by
 * xmlRegCopyRange() and released by xmlRegFreeRange().
 */
170 struct _xmlRegRange {
171     int neg;		/* 0 normal, 1 not, 2 exclude */
172     xmlRegAtomType type;	/* the type of range */
173     int start;		/* the start codepoint */
174     int end;		/* the end codepoint */
175     xmlChar *blockName;	/* owned string, released by xmlRegFreeRange() when not NULL */
176 };
177 
178 typedef struct _xmlRegAtom xmlRegAtom;
179 typedef xmlRegAtom *xmlRegAtomPtr;
180 
181 typedef struct _xmlAutomataState xmlRegState;
182 typedef xmlRegState *xmlRegStatePtr;
183 
184 struct _xmlRegAtom {
185     int no;
186     xmlRegAtomType type;
187     xmlRegQuantType quant;
188     int min;
189     int max;
190 
191     void *valuep;
192     void *valuep2;
193     int neg;
194     int codepoint;
195     xmlRegStatePtr start;
196     xmlRegStatePtr start0;
197     xmlRegStatePtr stop;
198     int maxRanges;
199     int nbRanges;
200     xmlRegRangePtr *ranges;
201     void *data;
202 };
203 
204 typedef struct _xmlRegCounter xmlRegCounter;
205 typedef xmlRegCounter *xmlRegCounterPtr;
206 
207 struct _xmlRegCounter {
208     int min;
209     int max;
210 };
211 
212 typedef struct _xmlRegTrans xmlRegTrans;
213 typedef xmlRegTrans *xmlRegTransPtr;
214 
215 struct _xmlRegTrans {
216     xmlRegAtomPtr atom;
217     int to;
218     int counter;
219     int count;
220     int nd;
221 };
222 
223 struct _xmlAutomataState {
224     xmlRegStateType type;
225     xmlRegMarkedType mark;
226     xmlRegMarkedType markd;
227     xmlRegMarkedType reached;
228     int no;
229     int maxTrans;
230     int nbTrans;
231     xmlRegTrans *trans;
232     /*  knowing states pointing to us can speed things up */
233     int maxTransTo;
234     int nbTransTo;
235     int *transTo;
236 };
237 
238 typedef struct _xmlAutomata xmlRegParserCtxt;
239 typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;
240 
241 #define AM_AUTOMATA_RNG 1
242 
243 struct _xmlAutomata {
244     xmlChar *string;
245     xmlChar *cur;
246 
247     int error;
248     int neg;
249 
250     xmlRegStatePtr start;
251     xmlRegStatePtr end;
252     xmlRegStatePtr state;
253 
254     xmlRegAtomPtr atom;
255 
256     int maxAtoms;
257     int nbAtoms;
258     xmlRegAtomPtr *atoms;
259 
260     int maxStates;
261     int nbStates;
262     xmlRegStatePtr *states;
263 
264     int maxCounters;
265     int nbCounters;
266     xmlRegCounter *counters;
267 
268     int determinist;
269     int negs;
270     int flags;
271 
272     int depth;
273 };
274 
/*
 * A compiled regexp, as produced by xmlRegEpxFromParse().
 *
 * string, states, atoms and counters are taken over from the parser
 * context. When xmlRegEpxFromParse() manages to build the compact form,
 * it frees states and atoms and resets them to NULL/0; the table fields
 * at the end of the structure are then used instead.
 */
275 struct _xmlRegexp {
276     xmlChar *string;	/* taken from the parser context */
277     int nbStates;
278     xmlRegStatePtr *states;
279     int nbAtoms;
280     xmlRegAtomPtr *atoms;
281     int nbCounters;
282     xmlRegCounter *counters;
283     int determinist;	/* -1 in a fresh parser context triggers xmlRegexpIsDeterminist() */
284     int flags;
285     /*
286      * That's the compact form for determinists automatas
287      */
288     int nbstates;	/* number of rows of the compact table */
289     int *compact;	/* (nbstrings + 1) ints per row: the state type, then for
			 * each string the target row + 1, 0 meaning no transition */
290     void **transdata;	/* optional, nbstrings entries per row: the atom data
			 * attached to the matching transition */
291     int nbstrings;	/* number of entries in stringMap */
292     xmlChar **stringMap;	/* the distinct atom strings, indexed by column */
293 };
294 
295 typedef struct _xmlRegExecRollback xmlRegExecRollback;
296 typedef xmlRegExecRollback *xmlRegExecRollbackPtr;
297 
298 struct _xmlRegExecRollback {
299     xmlRegStatePtr state;/* the current state */
300     int index;		/* the index in the input stack */
301     int nextbranch;	/* the next transition to explore in that state */
302     int *counts;	/* save the automata state if it has some */
303 };
304 
305 typedef struct _xmlRegInputToken xmlRegInputToken;
306 typedef xmlRegInputToken *xmlRegInputTokenPtr;
307 
308 struct _xmlRegInputToken {
309     xmlChar *value;
310     void *data;
311 };
312 
313 struct _xmlRegExecCtxt {
314     int status;		/* execution status != 0 indicate an error */
315     int determinist;	/* did we find an indeterministic behaviour */
316     xmlRegexpPtr comp;	/* the compiled regexp */
317     xmlRegExecCallbacks callback;
318     void *data;
319 
320     xmlRegStatePtr state;/* the current state */
321     int transno;	/* the current transition on that state */
322     int transcount;	/* the number of chars in char counted transitions */
323 
324     /*
325      * A stack of rollback states
326      */
327     int maxRollbacks;
328     int nbRollbacks;
329     xmlRegExecRollback *rollbacks;
330 
331     /*
332      * The state of the automata if any
333      */
334     int *counts;
335 
336     /*
337      * The input stack
338      */
339     int inputStackMax;
340     int inputStackNr;
341     int index;
342     int *charStack;
343     const xmlChar *inputString; /* when operating on characters */
344     xmlRegInputTokenPtr inputStack;/* when operating on strings */
345 
346     /*
347      * error handling
348      */
349     int errStateNo;		/* the error state number */
350     xmlRegStatePtr errState;    /* the error state */
351     xmlChar *errString;		/* the string raising the error */
352     int *errCounts;		/* counters at the error state */
353     int nbPush;
354 };
355 
356 #define REGEXP_ALL_COUNTER	0x123456
357 #define REGEXP_ALL_LAX_COUNTER	0x123457
358 
359 static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
360 static void xmlRegFreeState(xmlRegStatePtr state);
361 static void xmlRegFreeAtom(xmlRegAtomPtr atom);
362 static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr);
363 static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint);
364 static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint,
365                   int neg, int start, int end, const xmlChar *blockName);
366 
367 /************************************************************************
368  *									*
369  *		Regexp memory error handler				*
370  *									*
371  ************************************************************************/
372 /**
373  * xmlRegexpErrMemory:
374  * @extra:  extra information
375  *
376  * Handle an out of memory condition
377  */
378 static void
xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt,const char * extra)379 xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra)
380 {
381     const char *regexp = NULL;
382     if (ctxt != NULL) {
383         regexp = (const char *) ctxt->string;
384 	ctxt->error = XML_ERR_NO_MEMORY;
385     }
386     __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
387 		    XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
388 		    regexp, NULL, 0, 0,
389 		    "Memory allocation failed : %s\n", extra);
390 }
391 
392 /**
393  * xmlRegexpErrCompile:
394  * @extra:  extra information
395  *
396  * Handle a compilation failure
397  */
398 static void
xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt,const char * extra)399 xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra)
400 {
401     const char *regexp = NULL;
402     int idx = 0;
403 
404     if (ctxt != NULL) {
405         regexp = (const char *) ctxt->string;
406 	idx = ctxt->cur - ctxt->string;
407 	ctxt->error = XML_REGEXP_COMPILE_ERROR;
408     }
409     __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
410 		    XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra,
411 		    regexp, NULL, idx, 0,
412 		    "failed to compile: %s\n", extra);
413 }
414 
415 /************************************************************************
416  *									*
417  *			Allocation/Deallocation				*
418  *									*
419  ************************************************************************/
420 
421 static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
422 
/**
 * xmlRegCalloc2:
 * @dim1:  size of first dimension
 * @dim2:  size of second dimension
 * @elemSize:  size of element
 *
 * Allocate a zero-filled two-dimensional array of dim1 x dim2 elements.
 *
 * Returns the new array, or NULL if @dim2 or @elemSize is zero, if the
 * total size does not fit in a size_t, or if the allocation fails.
 */
static void*
xmlRegCalloc2(size_t dim1, size_t dim2, size_t elemSize) {
    size_t nbytes;
    void *block;

    /* Both values are used as divisors below, so zero is rejected first */
    if (dim2 == 0)
        return (NULL);
    if (elemSize == 0)
        return (NULL);

    /* dim1 * dim2 * elemSize must be representable as a size_t */
    if (dim1 > SIZE_MAX / dim2 / elemSize)
        return (NULL);

    nbytes = dim1 * dim2 * elemSize;
    block = xmlMalloc(nbytes);
    if (block == NULL)
        return (NULL);

    memset(block, 0, nbytes);
    return (block);
}
448 
449 /**
450  * xmlRegEpxFromParse:
451  * @ctxt:  the parser context used to build it
452  *
453  * Allocate a new regexp and fill it with the result from the parser
454  *
455  * Returns the new regexp or NULL in case of error
456  */
457 static xmlRegexpPtr
xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt)458 xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
459     xmlRegexpPtr ret;
460 
461     ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
462     if (ret == NULL) {
463 	xmlRegexpErrMemory(ctxt, "compiling regexp");
464 	return(NULL);
465     }
466     memset(ret, 0, sizeof(xmlRegexp));
467     ret->string = ctxt->string;
468     ret->nbStates = ctxt->nbStates;
469     ret->states = ctxt->states;
470     ret->nbAtoms = ctxt->nbAtoms;
471     ret->atoms = ctxt->atoms;
472     ret->nbCounters = ctxt->nbCounters;
473     ret->counters = ctxt->counters;
474     ret->determinist = ctxt->determinist;
475     ret->flags = ctxt->flags;
476     if (ret->determinist == -1) {
477         xmlRegexpIsDeterminist(ret);
478     }
479 
480     if ((ret->determinist != 0) &&
481 	(ret->nbCounters == 0) &&
482 	(ctxt->negs == 0) &&
483 	(ret->atoms != NULL) &&
484 	(ret->atoms[0] != NULL) &&
485 	(ret->atoms[0]->type == XML_REGEXP_STRING)) {
486 	int i, j, nbstates = 0, nbatoms = 0;
487 	int *stateRemap;
488 	int *stringRemap;
489 	int *transitions;
490 	void **transdata;
491 	xmlChar **stringMap;
492         xmlChar *value;
493 
494 	/*
495 	 * Switch to a compact representation
496 	 * 1/ counting the effective number of states left
497 	 * 2/ counting the unique number of atoms, and check that
498 	 *    they are all of the string type
499 	 * 3/ build a table state x atom for the transitions
500 	 */
501 
502 	stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
503 	if (stateRemap == NULL) {
504 	    xmlRegexpErrMemory(ctxt, "compiling regexp");
505 	    xmlFree(ret);
506 	    return(NULL);
507 	}
508 	for (i = 0;i < ret->nbStates;i++) {
509 	    if (ret->states[i] != NULL) {
510 		stateRemap[i] = nbstates;
511 		nbstates++;
512 	    } else {
513 		stateRemap[i] = -1;
514 	    }
515 	}
516 #ifdef DEBUG_COMPACTION
517 	printf("Final: %d states\n", nbstates);
518 #endif
519 	stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
520 	if (stringMap == NULL) {
521 	    xmlRegexpErrMemory(ctxt, "compiling regexp");
522 	    xmlFree(stateRemap);
523 	    xmlFree(ret);
524 	    return(NULL);
525 	}
526 	stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
527 	if (stringRemap == NULL) {
528 	    xmlRegexpErrMemory(ctxt, "compiling regexp");
529 	    xmlFree(stringMap);
530 	    xmlFree(stateRemap);
531 	    xmlFree(ret);
532 	    return(NULL);
533 	}
534 	for (i = 0;i < ret->nbAtoms;i++) {
535 	    if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
536 		(ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
537 		value = ret->atoms[i]->valuep;
538                 for (j = 0;j < nbatoms;j++) {
539 		    if (xmlStrEqual(stringMap[j], value)) {
540 			stringRemap[i] = j;
541 			break;
542 		    }
543 		}
544 		if (j >= nbatoms) {
545 		    stringRemap[i] = nbatoms;
546 		    stringMap[nbatoms] = xmlStrdup(value);
547 		    if (stringMap[nbatoms] == NULL) {
548 			for (i = 0;i < nbatoms;i++)
549 			    xmlFree(stringMap[i]);
550 			xmlFree(stringRemap);
551 			xmlFree(stringMap);
552 			xmlFree(stateRemap);
553 			xmlFree(ret);
554 			return(NULL);
555 		    }
556 		    nbatoms++;
557 		}
558 	    } else {
559 		xmlFree(stateRemap);
560 		xmlFree(stringRemap);
561 		for (i = 0;i < nbatoms;i++)
562 		    xmlFree(stringMap[i]);
563 		xmlFree(stringMap);
564 		xmlFree(ret);
565 		return(NULL);
566 	    }
567 	}
568 #ifdef DEBUG_COMPACTION
569 	printf("Final: %d atoms\n", nbatoms);
570 #endif
571 	transitions = (int *) xmlRegCalloc2(nbstates + 1, nbatoms + 1,
572                                             sizeof(int));
573 	if (transitions == NULL) {
574 	    xmlFree(stateRemap);
575 	    xmlFree(stringRemap);
576             for (i = 0;i < nbatoms;i++)
577 		xmlFree(stringMap[i]);
578 	    xmlFree(stringMap);
579 	    xmlFree(ret);
580 	    return(NULL);
581 	}
582 
583 	/*
584 	 * Allocate the transition table. The first entry for each
585 	 * state corresponds to the state type.
586 	 */
587 	transdata = NULL;
588 
589 	for (i = 0;i < ret->nbStates;i++) {
590 	    int stateno, atomno, targetno, prev;
591 	    xmlRegStatePtr state;
592 	    xmlRegTransPtr trans;
593 
594 	    stateno = stateRemap[i];
595 	    if (stateno == -1)
596 		continue;
597 	    state = ret->states[i];
598 
599 	    transitions[stateno * (nbatoms + 1)] = state->type;
600 
601 	    for (j = 0;j < state->nbTrans;j++) {
602 		trans = &(state->trans[j]);
603 		if ((trans->to == -1) || (trans->atom == NULL))
604 		    continue;
605                 atomno = stringRemap[trans->atom->no];
606 		if ((trans->atom->data != NULL) && (transdata == NULL)) {
607 		    transdata = (void **) xmlRegCalloc2(nbstates, nbatoms,
608 			                                sizeof(void *));
609 		    if (transdata == NULL) {
610 			xmlRegexpErrMemory(ctxt, "compiling regexp");
611 			break;
612 		    }
613 		}
614 		targetno = stateRemap[trans->to];
615 		/*
616 		 * if the same atom can generate transitions to 2 different
617 		 * states then it means the automata is not deterministic and
618 		 * the compact form can't be used !
619 		 */
620 		prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
621 		if (prev != 0) {
622 		    if (prev != targetno + 1) {
623 			ret->determinist = 0;
624 #ifdef DEBUG_COMPACTION
625 			printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
626 			       i, j, trans->atom->no, trans->to, atomno, targetno);
627 			printf("       previous to is %d\n", prev);
628 #endif
629 			if (transdata != NULL)
630 			    xmlFree(transdata);
631 			xmlFree(transitions);
632 			xmlFree(stateRemap);
633 			xmlFree(stringRemap);
634 			for (i = 0;i < nbatoms;i++)
635 			    xmlFree(stringMap[i]);
636 			xmlFree(stringMap);
637 			goto not_determ;
638 		    }
639 		} else {
640 #if 0
641 		    printf("State %d trans %d: atom %d to %d : %d to %d\n",
642 			   i, j, trans->atom->no, trans->to, atomno, targetno);
643 #endif
644 		    transitions[stateno * (nbatoms + 1) + atomno + 1] =
645 			targetno + 1; /* to avoid 0 */
646 		    if (transdata != NULL)
647 			transdata[stateno * nbatoms + atomno] =
648 			    trans->atom->data;
649 		}
650 	    }
651 	}
652 	ret->determinist = 1;
653 #ifdef DEBUG_COMPACTION
654 	/*
655 	 * Debug
656 	 */
657 	for (i = 0;i < nbstates;i++) {
658 	    for (j = 0;j < nbatoms + 1;j++) {
659                 printf("%02d ", transitions[i * (nbatoms + 1) + j]);
660 	    }
661 	    printf("\n");
662 	}
663 	printf("\n");
664 #endif
665 	/*
666 	 * Cleanup of the old data
667 	 */
668 	if (ret->states != NULL) {
669 	    for (i = 0;i < ret->nbStates;i++)
670 		xmlRegFreeState(ret->states[i]);
671 	    xmlFree(ret->states);
672 	}
673 	ret->states = NULL;
674 	ret->nbStates = 0;
675 	if (ret->atoms != NULL) {
676 	    for (i = 0;i < ret->nbAtoms;i++)
677 		xmlRegFreeAtom(ret->atoms[i]);
678 	    xmlFree(ret->atoms);
679 	}
680 	ret->atoms = NULL;
681 	ret->nbAtoms = 0;
682 
683 	ret->compact = transitions;
684 	ret->transdata = transdata;
685 	ret->stringMap = stringMap;
686 	ret->nbstrings = nbatoms;
687 	ret->nbstates = nbstates;
688 	xmlFree(stateRemap);
689 	xmlFree(stringRemap);
690     }
691 not_determ:
692     ctxt->string = NULL;
693     ctxt->nbStates = 0;
694     ctxt->states = NULL;
695     ctxt->nbAtoms = 0;
696     ctxt->atoms = NULL;
697     ctxt->nbCounters = 0;
698     ctxt->counters = NULL;
699     return(ret);
700 }
701 
702 /**
703  * xmlRegNewParserCtxt:
704  * @string:  the string to parse
705  *
706  * Allocate a new regexp parser context
707  *
708  * Returns the new context or NULL in case of error
709  */
710 static xmlRegParserCtxtPtr
xmlRegNewParserCtxt(const xmlChar * string)711 xmlRegNewParserCtxt(const xmlChar *string) {
712     xmlRegParserCtxtPtr ret;
713 
714     ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
715     if (ret == NULL)
716 	return(NULL);
717     memset(ret, 0, sizeof(xmlRegParserCtxt));
718     if (string != NULL)
719 	ret->string = xmlStrdup(string);
720     ret->cur = ret->string;
721     ret->neg = 0;
722     ret->negs = 0;
723     ret->error = 0;
724     ret->determinist = -1;
725     return(ret);
726 }
727 
728 /**
729  * xmlRegNewRange:
730  * @ctxt:  the regexp parser context
731  * @neg:  is that negative
732  * @type:  the type of range
733  * @start:  the start codepoint
734  * @end:  the end codepoint
735  *
736  * Allocate a new regexp range
737  *
738  * Returns the new range or NULL in case of error
739  */
740 static xmlRegRangePtr
xmlRegNewRange(xmlRegParserCtxtPtr ctxt,int neg,xmlRegAtomType type,int start,int end)741 xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
742 	       int neg, xmlRegAtomType type, int start, int end) {
743     xmlRegRangePtr ret;
744 
745     ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
746     if (ret == NULL) {
747 	xmlRegexpErrMemory(ctxt, "allocating range");
748 	return(NULL);
749     }
750     ret->neg = neg;
751     ret->type = type;
752     ret->start = start;
753     ret->end = end;
754     return(ret);
755 }
756 
757 /**
758  * xmlRegFreeRange:
759  * @range:  the regexp range
760  *
761  * Free a regexp range
762  */
763 static void
xmlRegFreeRange(xmlRegRangePtr range)764 xmlRegFreeRange(xmlRegRangePtr range) {
765     if (range == NULL)
766 	return;
767 
768     if (range->blockName != NULL)
769 	xmlFree(range->blockName);
770     xmlFree(range);
771 }
772 
773 /**
774  * xmlRegCopyRange:
775  * @range:  the regexp range
776  *
777  * Copy a regexp range
778  *
779  * Returns the new copy or NULL in case of error.
780  */
781 static xmlRegRangePtr
xmlRegCopyRange(xmlRegParserCtxtPtr ctxt,xmlRegRangePtr range)782 xmlRegCopyRange(xmlRegParserCtxtPtr ctxt, xmlRegRangePtr range) {
783     xmlRegRangePtr ret;
784 
785     if (range == NULL)
786 	return(NULL);
787 
788     ret = xmlRegNewRange(ctxt, range->neg, range->type, range->start,
789                          range->end);
790     if (ret == NULL)
791         return(NULL);
792     if (range->blockName != NULL) {
793 	ret->blockName = xmlStrdup(range->blockName);
794 	if (ret->blockName == NULL) {
795 	    xmlRegexpErrMemory(ctxt, "allocating range");
796 	    xmlRegFreeRange(ret);
797 	    return(NULL);
798 	}
799     }
800     return(ret);
801 }
802 
803 /**
804  * xmlRegNewAtom:
805  * @ctxt:  the regexp parser context
806  * @type:  the type of atom
807  *
808  * Allocate a new atom
809  *
810  * Returns the new atom or NULL in case of error
811  */
812 static xmlRegAtomPtr
xmlRegNewAtom(xmlRegParserCtxtPtr ctxt,xmlRegAtomType type)813 xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
814     xmlRegAtomPtr ret;
815 
816     ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
817     if (ret == NULL) {
818 	xmlRegexpErrMemory(ctxt, "allocating atom");
819 	return(NULL);
820     }
821     memset(ret, 0, sizeof(xmlRegAtom));
822     ret->type = type;
823     ret->quant = XML_REGEXP_QUANT_ONCE;
824     ret->min = 0;
825     ret->max = 0;
826     return(ret);
827 }
828 
829 /**
830  * xmlRegFreeAtom:
831  * @atom:  the regexp atom
832  *
833  * Free a regexp atom
834  */
835 static void
xmlRegFreeAtom(xmlRegAtomPtr atom)836 xmlRegFreeAtom(xmlRegAtomPtr atom) {
837     int i;
838 
839     if (atom == NULL)
840 	return;
841 
842     for (i = 0;i < atom->nbRanges;i++)
843 	xmlRegFreeRange(atom->ranges[i]);
844     if (atom->ranges != NULL)
845 	xmlFree(atom->ranges);
846     if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL))
847 	xmlFree(atom->valuep);
848     if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL))
849 	xmlFree(atom->valuep2);
850     if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL))
851 	xmlFree(atom->valuep);
852     xmlFree(atom);
853 }
854 
855 /**
856  * xmlRegCopyAtom:
857  * @ctxt:  the regexp parser context
858  * @atom:  the original atom
859  *
860  * Allocate a new regexp range
861  *
862  * Returns the new atom or NULL in case of error
863  */
864 static xmlRegAtomPtr
xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt,xmlRegAtomPtr atom)865 xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
866     xmlRegAtomPtr ret;
867 
868     ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
869     if (ret == NULL) {
870 	xmlRegexpErrMemory(ctxt, "copying atom");
871 	return(NULL);
872     }
873     memset(ret, 0, sizeof(xmlRegAtom));
874     ret->type = atom->type;
875     ret->quant = atom->quant;
876     ret->min = atom->min;
877     ret->max = atom->max;
878     if (atom->nbRanges > 0) {
879         int i;
880 
881         ret->ranges = (xmlRegRangePtr *) xmlMalloc(sizeof(xmlRegRangePtr) *
882 	                                           atom->nbRanges);
883 	if (ret->ranges == NULL) {
884 	    xmlRegexpErrMemory(ctxt, "copying atom");
885 	    goto error;
886 	}
887 	for (i = 0;i < atom->nbRanges;i++) {
888 	    ret->ranges[i] = xmlRegCopyRange(ctxt, atom->ranges[i]);
889 	    if (ret->ranges[i] == NULL)
890 	        goto error;
891 	    ret->nbRanges = i + 1;
892 	}
893     }
894     return(ret);
895 
896 error:
897     xmlRegFreeAtom(ret);
898     return(NULL);
899 }
900 
901 static xmlRegStatePtr
xmlRegNewState(xmlRegParserCtxtPtr ctxt)902 xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
903     xmlRegStatePtr ret;
904 
905     ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
906     if (ret == NULL) {
907 	xmlRegexpErrMemory(ctxt, "allocating state");
908 	return(NULL);
909     }
910     memset(ret, 0, sizeof(xmlRegState));
911     ret->type = XML_REGEXP_TRANS_STATE;
912     ret->mark = XML_REGEXP_MARK_NORMAL;
913     return(ret);
914 }
915 
916 /**
917  * xmlRegFreeState:
918  * @state:  the regexp state
919  *
920  * Free a regexp state
921  */
922 static void
xmlRegFreeState(xmlRegStatePtr state)923 xmlRegFreeState(xmlRegStatePtr state) {
924     if (state == NULL)
925 	return;
926 
927     if (state->trans != NULL)
928 	xmlFree(state->trans);
929     if (state->transTo != NULL)
930 	xmlFree(state->transTo);
931     xmlFree(state);
932 }
933 
934 /**
935  * xmlRegFreeParserCtxt:
936  * @ctxt:  the regexp parser context
937  *
938  * Free a regexp parser context
939  */
940 static void
xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt)941 xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
942     int i;
943     if (ctxt == NULL)
944 	return;
945 
946     if (ctxt->string != NULL)
947 	xmlFree(ctxt->string);
948     if (ctxt->states != NULL) {
949 	for (i = 0;i < ctxt->nbStates;i++)
950 	    xmlRegFreeState(ctxt->states[i]);
951 	xmlFree(ctxt->states);
952     }
953     if (ctxt->atoms != NULL) {
954 	for (i = 0;i < ctxt->nbAtoms;i++)
955 	    xmlRegFreeAtom(ctxt->atoms[i]);
956 	xmlFree(ctxt->atoms);
957     }
958     if (ctxt->counters != NULL)
959 	xmlFree(ctxt->counters);
960     xmlFree(ctxt);
961 }
962 
963 /************************************************************************
964  *									*
965  *			Display of Data structures			*
966  *									*
967  ************************************************************************/
968 
969 static void
xmlRegPrintAtomType(FILE * output,xmlRegAtomType type)970 xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
971     switch (type) {
972         case XML_REGEXP_EPSILON:
973 	    fprintf(output, "epsilon "); break;
974         case XML_REGEXP_CHARVAL:
975 	    fprintf(output, "charval "); break;
976         case XML_REGEXP_RANGES:
977 	    fprintf(output, "ranges "); break;
978         case XML_REGEXP_SUBREG:
979 	    fprintf(output, "subexpr "); break;
980         case XML_REGEXP_STRING:
981 	    fprintf(output, "string "); break;
982         case XML_REGEXP_ANYCHAR:
983 	    fprintf(output, "anychar "); break;
984         case XML_REGEXP_ANYSPACE:
985 	    fprintf(output, "anyspace "); break;
986         case XML_REGEXP_NOTSPACE:
987 	    fprintf(output, "notspace "); break;
988         case XML_REGEXP_INITNAME:
989 	    fprintf(output, "initname "); break;
990         case XML_REGEXP_NOTINITNAME:
991 	    fprintf(output, "notinitname "); break;
992         case XML_REGEXP_NAMECHAR:
993 	    fprintf(output, "namechar "); break;
994         case XML_REGEXP_NOTNAMECHAR:
995 	    fprintf(output, "notnamechar "); break;
996         case XML_REGEXP_DECIMAL:
997 	    fprintf(output, "decimal "); break;
998         case XML_REGEXP_NOTDECIMAL:
999 	    fprintf(output, "notdecimal "); break;
1000         case XML_REGEXP_REALCHAR:
1001 	    fprintf(output, "realchar "); break;
1002         case XML_REGEXP_NOTREALCHAR:
1003 	    fprintf(output, "notrealchar "); break;
1004         case XML_REGEXP_LETTER:
1005             fprintf(output, "LETTER "); break;
1006         case XML_REGEXP_LETTER_UPPERCASE:
1007             fprintf(output, "LETTER_UPPERCASE "); break;
1008         case XML_REGEXP_LETTER_LOWERCASE:
1009             fprintf(output, "LETTER_LOWERCASE "); break;
1010         case XML_REGEXP_LETTER_TITLECASE:
1011             fprintf(output, "LETTER_TITLECASE "); break;
1012         case XML_REGEXP_LETTER_MODIFIER:
1013             fprintf(output, "LETTER_MODIFIER "); break;
1014         case XML_REGEXP_LETTER_OTHERS:
1015             fprintf(output, "LETTER_OTHERS "); break;
1016         case XML_REGEXP_MARK:
1017             fprintf(output, "MARK "); break;
1018         case XML_REGEXP_MARK_NONSPACING:
1019             fprintf(output, "MARK_NONSPACING "); break;
1020         case XML_REGEXP_MARK_SPACECOMBINING:
1021             fprintf(output, "MARK_SPACECOMBINING "); break;
1022         case XML_REGEXP_MARK_ENCLOSING:
1023             fprintf(output, "MARK_ENCLOSING "); break;
1024         case XML_REGEXP_NUMBER:
1025             fprintf(output, "NUMBER "); break;
1026         case XML_REGEXP_NUMBER_DECIMAL:
1027             fprintf(output, "NUMBER_DECIMAL "); break;
1028         case XML_REGEXP_NUMBER_LETTER:
1029             fprintf(output, "NUMBER_LETTER "); break;
1030         case XML_REGEXP_NUMBER_OTHERS:
1031             fprintf(output, "NUMBER_OTHERS "); break;
1032         case XML_REGEXP_PUNCT:
1033             fprintf(output, "PUNCT "); break;
1034         case XML_REGEXP_PUNCT_CONNECTOR:
1035             fprintf(output, "PUNCT_CONNECTOR "); break;
1036         case XML_REGEXP_PUNCT_DASH:
1037             fprintf(output, "PUNCT_DASH "); break;
1038         case XML_REGEXP_PUNCT_OPEN:
1039             fprintf(output, "PUNCT_OPEN "); break;
1040         case XML_REGEXP_PUNCT_CLOSE:
1041             fprintf(output, "PUNCT_CLOSE "); break;
1042         case XML_REGEXP_PUNCT_INITQUOTE:
1043             fprintf(output, "PUNCT_INITQUOTE "); break;
1044         case XML_REGEXP_PUNCT_FINQUOTE:
1045             fprintf(output, "PUNCT_FINQUOTE "); break;
1046         case XML_REGEXP_PUNCT_OTHERS:
1047             fprintf(output, "PUNCT_OTHERS "); break;
1048         case XML_REGEXP_SEPAR:
1049             fprintf(output, "SEPAR "); break;
1050         case XML_REGEXP_SEPAR_SPACE:
1051             fprintf(output, "SEPAR_SPACE "); break;
1052         case XML_REGEXP_SEPAR_LINE:
1053             fprintf(output, "SEPAR_LINE "); break;
1054         case XML_REGEXP_SEPAR_PARA:
1055             fprintf(output, "SEPAR_PARA "); break;
1056         case XML_REGEXP_SYMBOL:
1057             fprintf(output, "SYMBOL "); break;
1058         case XML_REGEXP_SYMBOL_MATH:
1059             fprintf(output, "SYMBOL_MATH "); break;
1060         case XML_REGEXP_SYMBOL_CURRENCY:
1061             fprintf(output, "SYMBOL_CURRENCY "); break;
1062         case XML_REGEXP_SYMBOL_MODIFIER:
1063             fprintf(output, "SYMBOL_MODIFIER "); break;
1064         case XML_REGEXP_SYMBOL_OTHERS:
1065             fprintf(output, "SYMBOL_OTHERS "); break;
1066         case XML_REGEXP_OTHER:
1067             fprintf(output, "OTHER "); break;
1068         case XML_REGEXP_OTHER_CONTROL:
1069             fprintf(output, "OTHER_CONTROL "); break;
1070         case XML_REGEXP_OTHER_FORMAT:
1071             fprintf(output, "OTHER_FORMAT "); break;
1072         case XML_REGEXP_OTHER_PRIVATE:
1073             fprintf(output, "OTHER_PRIVATE "); break;
1074         case XML_REGEXP_OTHER_NA:
1075             fprintf(output, "OTHER_NA "); break;
1076         case XML_REGEXP_BLOCK_NAME:
1077 	    fprintf(output, "BLOCK "); break;
1078     }
1079 }
1080 
1081 static void
xmlRegPrintQuantType(FILE * output,xmlRegQuantType type)1082 xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
1083     switch (type) {
1084         case XML_REGEXP_QUANT_EPSILON:
1085 	    fprintf(output, "epsilon "); break;
1086         case XML_REGEXP_QUANT_ONCE:
1087 	    fprintf(output, "once "); break;
1088         case XML_REGEXP_QUANT_OPT:
1089 	    fprintf(output, "? "); break;
1090         case XML_REGEXP_QUANT_MULT:
1091 	    fprintf(output, "* "); break;
1092         case XML_REGEXP_QUANT_PLUS:
1093 	    fprintf(output, "+ "); break;
1094 	case XML_REGEXP_QUANT_RANGE:
1095 	    fprintf(output, "range "); break;
1096 	case XML_REGEXP_QUANT_ONCEONLY:
1097 	    fprintf(output, "onceonly "); break;
1098 	case XML_REGEXP_QUANT_ALL:
1099 	    fprintf(output, "all "); break;
1100     }
1101 }
1102 static void
xmlRegPrintRange(FILE * output,xmlRegRangePtr range)1103 xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
1104     fprintf(output, "  range: ");
1105     if (range->neg)
1106 	fprintf(output, "negative ");
1107     xmlRegPrintAtomType(output, range->type);
1108     fprintf(output, "%c - %c\n", range->start, range->end);
1109 }
1110 
1111 static void
xmlRegPrintAtom(FILE * output,xmlRegAtomPtr atom)1112 xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
1113     fprintf(output, " atom: ");
1114     if (atom == NULL) {
1115 	fprintf(output, "NULL\n");
1116 	return;
1117     }
1118     if (atom->neg)
1119         fprintf(output, "not ");
1120     xmlRegPrintAtomType(output, atom->type);
1121     xmlRegPrintQuantType(output, atom->quant);
1122     if (atom->quant == XML_REGEXP_QUANT_RANGE)
1123 	fprintf(output, "%d-%d ", atom->min, atom->max);
1124     if (atom->type == XML_REGEXP_STRING)
1125 	fprintf(output, "'%s' ", (char *) atom->valuep);
1126     if (atom->type == XML_REGEXP_CHARVAL)
1127 	fprintf(output, "char %c\n", atom->codepoint);
1128     else if (atom->type == XML_REGEXP_RANGES) {
1129 	int i;
1130 	fprintf(output, "%d entries\n", atom->nbRanges);
1131 	for (i = 0; i < atom->nbRanges;i++)
1132 	    xmlRegPrintRange(output, atom->ranges[i]);
1133     } else if (atom->type == XML_REGEXP_SUBREG) {
1134 	fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
1135     } else {
1136 	fprintf(output, "\n");
1137     }
1138 }
1139 
1140 static void
xmlRegPrintTrans(FILE * output,xmlRegTransPtr trans)1141 xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
1142     fprintf(output, "  trans: ");
1143     if (trans == NULL) {
1144 	fprintf(output, "NULL\n");
1145 	return;
1146     }
1147     if (trans->to < 0) {
1148 	fprintf(output, "removed\n");
1149 	return;
1150     }
1151     if (trans->nd != 0) {
1152 	if (trans->nd == 2)
1153 	    fprintf(output, "last not determinist, ");
1154 	else
1155 	    fprintf(output, "not determinist, ");
1156     }
1157     if (trans->counter >= 0) {
1158 	fprintf(output, "counted %d, ", trans->counter);
1159     }
1160     if (trans->count == REGEXP_ALL_COUNTER) {
1161 	fprintf(output, "all transition, ");
1162     } else if (trans->count >= 0) {
1163 	fprintf(output, "count based %d, ", trans->count);
1164     }
1165     if (trans->atom == NULL) {
1166 	fprintf(output, "epsilon to %d\n", trans->to);
1167 	return;
1168     }
1169     if (trans->atom->type == XML_REGEXP_CHARVAL)
1170 	fprintf(output, "char %c ", trans->atom->codepoint);
1171     fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
1172 }
1173 
1174 static void
xmlRegPrintState(FILE * output,xmlRegStatePtr state)1175 xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
1176     int i;
1177 
1178     fprintf(output, " state: ");
1179     if (state == NULL) {
1180 	fprintf(output, "NULL\n");
1181 	return;
1182     }
1183     if (state->type == XML_REGEXP_START_STATE)
1184 	fprintf(output, "START ");
1185     if (state->type == XML_REGEXP_FINAL_STATE)
1186 	fprintf(output, "FINAL ");
1187 
1188     fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
1189     for (i = 0;i < state->nbTrans; i++) {
1190 	xmlRegPrintTrans(output, &(state->trans[i]));
1191     }
1192 }
1193 
#ifdef DEBUG_REGEXP_GRAPH
/**
 * xmlRegPrintCtxt:
 * @output:  the stream receiving the debug text
 * @ctxt:  the regexp parser context to dump, may be NULL
 *
 * Debug helper dumping the whole parser context: the expression string,
 * the error and negation flags, every registered atom, the atom being
 * built if any, every state and every counter.
 * Only compiled when DEBUG_REGEXP_GRAPH is defined.
 */
static void
xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
    int idx;

    fputs(" ctxt: ", output);
    if (ctxt == NULL) {
        fputs("NULL\n", output);
        return;
    }

    /* header line: expression and flags */
    fprintf(output, "'%s' ", ctxt->string);
    if (ctxt->error)
        fputs("error ", output);
    if (ctxt->neg)
        fputs("neg ", output);
    fputs("\n", output);

    /* atoms */
    fprintf(output, "%d atoms:\n", ctxt->nbAtoms);
    for (idx = 0; idx < ctxt->nbAtoms; idx++) {
        fprintf(output, " %02d ", idx);
        xmlRegPrintAtom(output, ctxt->atoms[idx]);
    }
    if (ctxt->atom != NULL) {
        fputs("current atom:\n", output);
        xmlRegPrintAtom(output, ctxt->atom);
    }

    /* states */
    fprintf(output, "%d states:", ctxt->nbStates);
    if (ctxt->start != NULL)
        fprintf(output, " start: %d", ctxt->start->no);
    if (ctxt->end != NULL)
        fprintf(output, " end: %d", ctxt->end->no);
    fputs("\n", output);
    for (idx = 0; idx < ctxt->nbStates; idx++)
        xmlRegPrintState(output, ctxt->states[idx]);

    /* counters */
    fprintf(output, "%d counters:\n", ctxt->nbCounters);
    for (idx = 0; idx < ctxt->nbCounters; idx++)
        fprintf(output, " %d: min %d max %d\n", idx,
                ctxt->counters[idx].min, ctxt->counters[idx].max);
}
#endif
1235 
1236 /************************************************************************
1237  *									*
1238  *		 Finite Automata structures manipulations		*
1239  *									*
1240  ************************************************************************/
1241 
1242 static xmlRegRangePtr
xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt,xmlRegAtomPtr atom,int neg,xmlRegAtomType type,int start,int end,xmlChar * blockName)1243 xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
1244 	           int neg, xmlRegAtomType type, int start, int end,
1245 		   xmlChar *blockName) {
1246     xmlRegRangePtr range;
1247 
1248     if (atom == NULL) {
1249 	ERROR("add range: atom is NULL");
1250 	return(NULL);
1251     }
1252     if (atom->type != XML_REGEXP_RANGES) {
1253 	ERROR("add range: atom is not ranges");
1254 	return(NULL);
1255     }
1256     if (atom->maxRanges == 0) {
1257 	atom->maxRanges = 4;
1258 	atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges *
1259 		                             sizeof(xmlRegRangePtr));
1260 	if (atom->ranges == NULL) {
1261 	    xmlRegexpErrMemory(ctxt, "adding ranges");
1262 	    atom->maxRanges = 0;
1263 	    return(NULL);
1264 	}
1265     } else if (atom->nbRanges >= atom->maxRanges) {
1266 	xmlRegRangePtr *tmp;
1267 	atom->maxRanges *= 2;
1268 	tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges *
1269 		                             sizeof(xmlRegRangePtr));
1270 	if (tmp == NULL) {
1271 	    xmlRegexpErrMemory(ctxt, "adding ranges");
1272 	    atom->maxRanges /= 2;
1273 	    return(NULL);
1274 	}
1275 	atom->ranges = tmp;
1276     }
1277     range = xmlRegNewRange(ctxt, neg, type, start, end);
1278     if (range == NULL)
1279 	return(NULL);
1280     range->blockName = blockName;
1281     atom->ranges[atom->nbRanges++] = range;
1282 
1283     return(range);
1284 }
1285 
1286 static int
xmlRegGetCounter(xmlRegParserCtxtPtr ctxt)1287 xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
1288     if (ctxt->maxCounters == 0) {
1289 	ctxt->maxCounters = 4;
1290 	ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters *
1291 		                             sizeof(xmlRegCounter));
1292 	if (ctxt->counters == NULL) {
1293 	    xmlRegexpErrMemory(ctxt, "allocating counter");
1294 	    ctxt->maxCounters = 0;
1295 	    return(-1);
1296 	}
1297     } else if (ctxt->nbCounters >= ctxt->maxCounters) {
1298 	xmlRegCounter *tmp;
1299 	ctxt->maxCounters *= 2;
1300 	tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters *
1301 		                           sizeof(xmlRegCounter));
1302 	if (tmp == NULL) {
1303 	    xmlRegexpErrMemory(ctxt, "allocating counter");
1304 	    ctxt->maxCounters /= 2;
1305 	    return(-1);
1306 	}
1307 	ctxt->counters = tmp;
1308     }
1309     ctxt->counters[ctxt->nbCounters].min = -1;
1310     ctxt->counters[ctxt->nbCounters].max = -1;
1311     return(ctxt->nbCounters++);
1312 }
1313 
1314 static int
xmlRegAtomPush(xmlRegParserCtxtPtr ctxt,xmlRegAtomPtr atom)1315 xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
1316     if (atom == NULL) {
1317 	ERROR("atom push: atom is NULL");
1318 	return(-1);
1319     }
1320     if (ctxt->nbAtoms >= ctxt->maxAtoms) {
1321         size_t newSize = ctxt->maxAtoms ? ctxt->maxAtoms * 2 : 4;
1322 	xmlRegAtomPtr *tmp;
1323 
1324 	tmp = xmlRealloc(ctxt->atoms, newSize * sizeof(xmlRegAtomPtr));
1325 	if (tmp == NULL) {
1326 	    xmlRegexpErrMemory(ctxt, "allocating counter");
1327 	    return(-1);
1328 	}
1329 	ctxt->atoms = tmp;
1330         ctxt->maxAtoms = newSize;
1331     }
1332     atom->no = ctxt->nbAtoms;
1333     ctxt->atoms[ctxt->nbAtoms++] = atom;
1334     return(0);
1335 }
1336 
1337 static void
xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr target,int from)1338 xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target,
1339                       int from) {
1340     if (target->maxTransTo == 0) {
1341 	target->maxTransTo = 8;
1342 	target->transTo = (int *) xmlMalloc(target->maxTransTo *
1343 		                             sizeof(int));
1344 	if (target->transTo == NULL) {
1345 	    xmlRegexpErrMemory(ctxt, "adding transition");
1346 	    target->maxTransTo = 0;
1347 	    return;
1348 	}
1349     } else if (target->nbTransTo >= target->maxTransTo) {
1350 	int *tmp;
1351 	target->maxTransTo *= 2;
1352 	tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo *
1353 		                             sizeof(int));
1354 	if (tmp == NULL) {
1355 	    xmlRegexpErrMemory(ctxt, "adding transition");
1356 	    target->maxTransTo /= 2;
1357 	    return;
1358 	}
1359 	target->transTo = tmp;
1360     }
1361     target->transTo[target->nbTransTo] = from;
1362     target->nbTransTo++;
1363 }
1364 
1365 static void
xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr state,xmlRegAtomPtr atom,xmlRegStatePtr target,int counter,int count)1366 xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1367 	            xmlRegAtomPtr atom, xmlRegStatePtr target,
1368 		    int counter, int count) {
1369 
1370     int nrtrans;
1371 
1372     if (state == NULL) {
1373 	ERROR("add state: state is NULL");
1374 	return;
1375     }
1376     if (target == NULL) {
1377 	ERROR("add state: target is NULL");
1378 	return;
1379     }
1380     /*
1381      * Other routines follow the philosophy 'When in doubt, add a transition'
1382      * so we check here whether such a transition is already present and, if
1383      * so, silently ignore this request.
1384      */
1385 
1386     for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) {
1387 	xmlRegTransPtr trans = &(state->trans[nrtrans]);
1388 	if ((trans->atom == atom) &&
1389 	    (trans->to == target->no) &&
1390 	    (trans->counter == counter) &&
1391 	    (trans->count == count)) {
1392 #ifdef DEBUG_REGEXP_GRAPH
1393 	    printf("Ignoring duplicate transition from %d to %d\n",
1394 		    state->no, target->no);
1395 #endif
1396 	    return;
1397 	}
1398     }
1399 
1400     if (state->maxTrans == 0) {
1401 	state->maxTrans = 8;
1402 	state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
1403 		                             sizeof(xmlRegTrans));
1404 	if (state->trans == NULL) {
1405 	    xmlRegexpErrMemory(ctxt, "adding transition");
1406 	    state->maxTrans = 0;
1407 	    return;
1408 	}
1409     } else if (state->nbTrans >= state->maxTrans) {
1410 	xmlRegTrans *tmp;
1411 	state->maxTrans *= 2;
1412 	tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans *
1413 		                             sizeof(xmlRegTrans));
1414 	if (tmp == NULL) {
1415 	    xmlRegexpErrMemory(ctxt, "adding transition");
1416 	    state->maxTrans /= 2;
1417 	    return;
1418 	}
1419 	state->trans = tmp;
1420     }
1421 #ifdef DEBUG_REGEXP_GRAPH
1422     printf("Add trans from %d to %d ", state->no, target->no);
1423     if (count == REGEXP_ALL_COUNTER)
1424 	printf("all transition\n");
1425     else if (count >= 0)
1426 	printf("count based %d\n", count);
1427     else if (counter >= 0)
1428 	printf("counted %d\n", counter);
1429     else if (atom == NULL)
1430 	printf("epsilon transition\n");
1431     else if (atom != NULL)
1432         xmlRegPrintAtom(stdout, atom);
1433 #endif
1434 
1435     state->trans[state->nbTrans].atom = atom;
1436     state->trans[state->nbTrans].to = target->no;
1437     state->trans[state->nbTrans].counter = counter;
1438     state->trans[state->nbTrans].count = count;
1439     state->trans[state->nbTrans].nd = 0;
1440     state->nbTrans++;
1441     xmlRegStateAddTransTo(ctxt, target, state->no);
1442 }
1443 
1444 static xmlRegStatePtr
xmlRegStatePush(xmlRegParserCtxtPtr ctxt)1445 xmlRegStatePush(xmlRegParserCtxtPtr ctxt) {
1446     xmlRegStatePtr state;
1447 
1448     if (ctxt->nbStates >= ctxt->maxStates) {
1449         size_t newSize = ctxt->maxStates ? ctxt->maxStates * 2 : 4;
1450 	xmlRegStatePtr *tmp;
1451 
1452 	tmp = xmlRealloc(ctxt->states, newSize * sizeof(tmp[0]));
1453 	if (tmp == NULL) {
1454 	    xmlRegexpErrMemory(ctxt, "adding state");
1455 	    return(NULL);
1456 	}
1457 	ctxt->states = tmp;
1458 	ctxt->maxStates = newSize;
1459     }
1460 
1461     state = xmlRegNewState(ctxt);
1462     if (state == NULL)
1463         return(NULL);
1464 
1465     state->no = ctxt->nbStates;
1466     ctxt->states[ctxt->nbStates++] = state;
1467 
1468     return(state);
1469 }
1470 
1471 /**
1472  * xmlFAGenerateAllTransition:
1473  * @ctxt:  a regexp parser context
1474  * @from:  the from state
1475  * @to:  the target state or NULL for building a new one
1476  * @lax:
1477  *
1478  */
1479 static int
xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,int lax)1480 xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
1481 			   xmlRegStatePtr from, xmlRegStatePtr to,
1482 			   int lax) {
1483     if (to == NULL) {
1484 	to = xmlRegStatePush(ctxt);
1485         if (to == NULL)
1486             return(-1);
1487 	ctxt->state = to;
1488     }
1489     if (lax)
1490 	xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
1491     else
1492 	xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
1493     return(0);
1494 }
1495 
1496 /**
1497  * xmlFAGenerateEpsilonTransition:
1498  * @ctxt:  a regexp parser context
1499  * @from:  the from state
1500  * @to:  the target state or NULL for building a new one
1501  *
1502  */
1503 static int
xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to)1504 xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1505 			       xmlRegStatePtr from, xmlRegStatePtr to) {
1506     if (to == NULL) {
1507 	to = xmlRegStatePush(ctxt);
1508         if (to == NULL)
1509             return(-1);
1510 	ctxt->state = to;
1511     }
1512     xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
1513     return(0);
1514 }
1515 
1516 /**
1517  * xmlFAGenerateCountedEpsilonTransition:
1518  * @ctxt:  a regexp parser context
1519  * @from:  the from state
1520  * @to:  the target state or NULL for building a new one
1521  * counter:  the counter for that transition
1522  *
1523  */
1524 static int
xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,int counter)1525 xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1526 	    xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1527     if (to == NULL) {
1528 	to = xmlRegStatePush(ctxt);
1529         if (to == NULL)
1530             return(-1);
1531 	ctxt->state = to;
1532     }
1533     xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
1534     return(0);
1535 }
1536 
1537 /**
1538  * xmlFAGenerateCountedTransition:
1539  * @ctxt:  a regexp parser context
1540  * @from:  the from state
1541  * @to:  the target state or NULL for building a new one
1542  * counter:  the counter for that transition
1543  *
1544  */
1545 static int
xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,int counter)1546 xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
1547 	    xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1548     if (to == NULL) {
1549 	to = xmlRegStatePush(ctxt);
1550         if (to == NULL)
1551             return(-1);
1552 	ctxt->state = to;
1553     }
1554     xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
1555     return(0);
1556 }
1557 
1558 /**
1559  * xmlFAGenerateTransitions:
1560  * @ctxt:  a regexp parser context
1561  * @from:  the from state
1562  * @to:  the target state or NULL for building a new one
1563  * @atom:  the atom generating the transition
1564  *
1565  * Returns 0 if success and -1 in case of error.
1566  */
1567 static int
xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,xmlRegAtomPtr atom)1568 xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
1569 	                 xmlRegStatePtr to, xmlRegAtomPtr atom) {
1570     xmlRegStatePtr end;
1571     int nullable = 0;
1572 
1573     if (atom == NULL) {
1574 	ERROR("generate transition: atom == NULL");
1575 	return(-1);
1576     }
1577     if (atom->type == XML_REGEXP_SUBREG) {
1578 	/*
1579 	 * this is a subexpression handling one should not need to
1580 	 * create a new node except for XML_REGEXP_QUANT_RANGE.
1581 	 */
1582 	if ((to != NULL) && (atom->stop != to) &&
1583 	    (atom->quant != XML_REGEXP_QUANT_RANGE)) {
1584 	    /*
1585 	     * Generate an epsilon transition to link to the target
1586 	     */
1587 	    xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1588 #ifdef DV
1589 	} else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) &&
1590 		   (atom->quant != XML_REGEXP_QUANT_ONCE)) {
1591 	    to = xmlRegStatePush(ctxt, to);
1592             if (to == NULL)
1593                 return(-1);
1594 	    ctxt->state = to;
1595 	    xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1596 #endif
1597 	}
1598 	switch (atom->quant) {
1599 	    case XML_REGEXP_QUANT_OPT:
1600 		atom->quant = XML_REGEXP_QUANT_ONCE;
1601 		/*
1602 		 * transition done to the state after end of atom.
1603 		 *      1. set transition from atom start to new state
1604 		 *      2. set transition from atom end to this state.
1605 		 */
1606                 if (to == NULL) {
1607                     xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0);
1608                     xmlFAGenerateEpsilonTransition(ctxt, atom->stop,
1609                                                    ctxt->state);
1610                 } else {
1611                     xmlFAGenerateEpsilonTransition(ctxt, atom->start, to);
1612                 }
1613 		break;
1614 	    case XML_REGEXP_QUANT_MULT:
1615 		atom->quant = XML_REGEXP_QUANT_ONCE;
1616 		xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1617 		xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1618 		break;
1619 	    case XML_REGEXP_QUANT_PLUS:
1620 		atom->quant = XML_REGEXP_QUANT_ONCE;
1621 		xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1622 		break;
1623 	    case XML_REGEXP_QUANT_RANGE: {
1624 		int counter;
1625 		xmlRegStatePtr inter, newstate;
1626 
1627 		/*
1628 		 * create the final state now if needed
1629 		 */
1630 		if (to != NULL) {
1631 		    newstate = to;
1632 		} else {
1633 		    newstate = xmlRegStatePush(ctxt);
1634                     if (newstate == NULL)
1635                         return(-1);
1636 		}
1637 
1638 		/*
1639 		 * The principle here is to use counted transition
1640 		 * to avoid explosion in the number of states in the
1641 		 * graph. This is clearly more complex but should not
1642 		 * be exploitable at runtime.
1643 		 */
1644 		if ((atom->min == 0) && (atom->start0 == NULL)) {
1645 		    xmlRegAtomPtr copy;
1646 		    /*
1647 		     * duplicate a transition based on atom to count next
1648 		     * occurrences after 1. We cannot loop to atom->start
1649 		     * directly because we need an epsilon transition to
1650 		     * newstate.
1651 		     */
1652 		     /* ???? For some reason it seems we never reach that
1653 		        case, I suppose this got optimized out before when
1654 			building the automata */
1655 		    copy = xmlRegCopyAtom(ctxt, atom);
1656 		    if (copy == NULL)
1657 		        return(-1);
1658 		    copy->quant = XML_REGEXP_QUANT_ONCE;
1659 		    copy->min = 0;
1660 		    copy->max = 0;
1661 
1662 		    if (xmlFAGenerateTransitions(ctxt, atom->start, NULL, copy)
1663 		        < 0) {
1664                         xmlRegFreeAtom(copy);
1665 			return(-1);
1666                     }
1667 		    inter = ctxt->state;
1668 		    counter = xmlRegGetCounter(ctxt);
1669                     if (counter < 0)
1670                         return(-1);
1671 		    ctxt->counters[counter].min = atom->min - 1;
1672 		    ctxt->counters[counter].max = atom->max - 1;
1673 		    /* count the number of times we see it again */
1674 		    xmlFAGenerateCountedEpsilonTransition(ctxt, inter,
1675 						   atom->stop, counter);
1676 		    /* allow a way out based on the count */
1677 		    xmlFAGenerateCountedTransition(ctxt, inter,
1678 			                           newstate, counter);
1679 		    /* and also allow a direct exit for 0 */
1680 		    xmlFAGenerateEpsilonTransition(ctxt, atom->start,
1681 		                                   newstate);
1682 		} else {
1683 		    /*
1684 		     * either we need the atom at least once or there
1685 		     * is an atom->start0 allowing to easily plug the
1686 		     * epsilon transition.
1687 		     */
1688 		    counter = xmlRegGetCounter(ctxt);
1689                     if (counter < 0)
1690                         return(-1);
1691 		    ctxt->counters[counter].min = atom->min - 1;
1692 		    ctxt->counters[counter].max = atom->max - 1;
1693 		    /* allow a way out based on the count */
1694 		    xmlFAGenerateCountedTransition(ctxt, atom->stop,
1695 			                           newstate, counter);
1696 		    /* count the number of times we see it again */
1697 		    xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
1698 						   atom->start, counter);
1699 		    /* and if needed allow a direct exit for 0 */
1700 		    if (atom->min == 0)
1701 			xmlFAGenerateEpsilonTransition(ctxt, atom->start0,
1702 						       newstate);
1703 
1704 		}
1705 		atom->min = 0;
1706 		atom->max = 0;
1707 		atom->quant = XML_REGEXP_QUANT_ONCE;
1708 		ctxt->state = newstate;
1709 	    }
1710 	    default:
1711 		break;
1712 	}
1713 	if (xmlRegAtomPush(ctxt, atom) < 0)
1714 	    return(-1);
1715 	return(0);
1716     }
1717     if ((atom->min == 0) && (atom->max == 0) &&
1718                (atom->quant == XML_REGEXP_QUANT_RANGE)) {
1719         /*
1720 	 * we can discard the atom and generate an epsilon transition instead
1721 	 */
1722 	if (to == NULL) {
1723 	    to = xmlRegStatePush(ctxt);
1724 	    if (to == NULL)
1725 		return(-1);
1726 	}
1727 	xmlFAGenerateEpsilonTransition(ctxt, from, to);
1728 	ctxt->state = to;
1729 	xmlRegFreeAtom(atom);
1730 	return(0);
1731     }
1732     if (to == NULL) {
1733 	to = xmlRegStatePush(ctxt);
1734 	if (to == NULL)
1735 	    return(-1);
1736     }
1737     end = to;
1738     if ((atom->quant == XML_REGEXP_QUANT_MULT) ||
1739         (atom->quant == XML_REGEXP_QUANT_PLUS)) {
1740 	/*
1741 	 * Do not pollute the target state by adding transitions from
1742 	 * it as it is likely to be the shared target of multiple branches.
1743 	 * So isolate with an epsilon transition.
1744 	 */
1745         xmlRegStatePtr tmp;
1746 
1747 	tmp = xmlRegStatePush(ctxt);
1748         if (tmp == NULL)
1749 	    return(-1);
1750 	xmlFAGenerateEpsilonTransition(ctxt, tmp, to);
1751 	to = tmp;
1752     }
1753     if ((atom->quant == XML_REGEXP_QUANT_RANGE) &&
1754         (atom->min == 0) && (atom->max > 0)) {
1755 	nullable = 1;
1756 	atom->min = 1;
1757         if (atom->max == 1)
1758 	    atom->quant = XML_REGEXP_QUANT_OPT;
1759     }
1760     xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
1761     ctxt->state = end;
1762     switch (atom->quant) {
1763 	case XML_REGEXP_QUANT_OPT:
1764 	    atom->quant = XML_REGEXP_QUANT_ONCE;
1765 	    xmlFAGenerateEpsilonTransition(ctxt, from, to);
1766 	    break;
1767 	case XML_REGEXP_QUANT_MULT:
1768 	    atom->quant = XML_REGEXP_QUANT_ONCE;
1769 	    xmlFAGenerateEpsilonTransition(ctxt, from, to);
1770 	    xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1771 	    break;
1772 	case XML_REGEXP_QUANT_PLUS:
1773 	    atom->quant = XML_REGEXP_QUANT_ONCE;
1774 	    xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1775 	    break;
1776 	case XML_REGEXP_QUANT_RANGE:
1777 	    if (nullable)
1778 		xmlFAGenerateEpsilonTransition(ctxt, from, to);
1779 	    break;
1780 	default:
1781 	    break;
1782     }
1783     if (xmlRegAtomPush(ctxt, atom) < 0)
1784 	return(-1);
1785     return(0);
1786 }
1787 
1788 /**
1789  * xmlFAReduceEpsilonTransitions:
1790  * @ctxt:  a regexp parser context
1791  * @fromnr:  the from state
1792  * @tonr:  the to state
1793  * @counter:  should that transition be associated to a counted
1794  *
1795  */
1796 static void
xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt,int fromnr,int tonr,int counter)1797 xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
1798 	                      int tonr, int counter) {
1799     int transnr;
1800     xmlRegStatePtr from;
1801     xmlRegStatePtr to;
1802 
1803 #ifdef DEBUG_REGEXP_GRAPH
1804     printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr);
1805 #endif
1806     from = ctxt->states[fromnr];
1807     if (from == NULL)
1808 	return;
1809     to = ctxt->states[tonr];
1810     if (to == NULL)
1811 	return;
1812     if ((to->mark == XML_REGEXP_MARK_START) ||
1813 	(to->mark == XML_REGEXP_MARK_VISITED))
1814 	return;
1815 
1816     to->mark = XML_REGEXP_MARK_VISITED;
1817     if (to->type == XML_REGEXP_FINAL_STATE) {
1818 #ifdef DEBUG_REGEXP_GRAPH
1819 	printf("State %d is final, so %d becomes final\n", tonr, fromnr);
1820 #endif
1821 	from->type = XML_REGEXP_FINAL_STATE;
1822     }
1823     for (transnr = 0;transnr < to->nbTrans;transnr++) {
1824         if (to->trans[transnr].to < 0)
1825 	    continue;
1826 	if (to->trans[transnr].atom == NULL) {
1827 	    /*
1828 	     * Don't remove counted transitions
1829 	     * Don't loop either
1830 	     */
1831 	    if (to->trans[transnr].to != fromnr) {
1832 		if (to->trans[transnr].count >= 0) {
1833 		    int newto = to->trans[transnr].to;
1834 
1835 		    xmlRegStateAddTrans(ctxt, from, NULL,
1836 					ctxt->states[newto],
1837 					-1, to->trans[transnr].count);
1838 		} else {
1839 #ifdef DEBUG_REGEXP_GRAPH
1840 		    printf("Found epsilon trans %d from %d to %d\n",
1841 			   transnr, tonr, to->trans[transnr].to);
1842 #endif
1843 		    if (to->trans[transnr].counter >= 0) {
1844 			xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1845 					      to->trans[transnr].to,
1846 					      to->trans[transnr].counter);
1847 		    } else {
1848 			xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1849 					      to->trans[transnr].to,
1850 					      counter);
1851 		    }
1852 		}
1853 	    }
1854 	} else {
1855 	    int newto = to->trans[transnr].to;
1856 
1857 	    if (to->trans[transnr].counter >= 0) {
1858 		xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1859 				    ctxt->states[newto],
1860 				    to->trans[transnr].counter, -1);
1861 	    } else {
1862 		xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1863 				    ctxt->states[newto], counter, -1);
1864 	    }
1865 	}
1866     }
1867     to->mark = XML_REGEXP_MARK_NORMAL;
1868 }
1869 
1870 /**
1871  * xmlFAEliminateSimpleEpsilonTransitions:
1872  * @ctxt:  a regexp parser context
1873  *
1874  * Eliminating general epsilon transitions can get costly in the general
1875  * algorithm due to the large amount of generated new transitions and
1876  * associated comparisons. However for simple epsilon transition used just
1877  * to separate building blocks when generating the automata this can be
1878  * reduced to state elimination:
1879  *    - if there exists an epsilon from X to Y
1880  *    - if there is no other transition from X
1881  * then X and Y are semantically equivalent and X can be eliminated
1882  * If X is the start state then make Y the start state, else replace the
1883  * target of all transitions to X by transitions to Y.
1884  *
1885  * If X is a final state, skip it.
1886  * Otherwise it would be necessary to manipulate counters for this case when
1887  * eliminating state 2:
1888  * State 1 has a transition with an atom to state 2.
1889  * State 2 is final and has an epsilon transition to state 1.
1890  */
1891 static void
xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt)1892 xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1893     int statenr, i, j, newto;
1894     xmlRegStatePtr state, tmp;
1895 
1896     for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1897 	state = ctxt->states[statenr];
1898 	if (state == NULL)
1899 	    continue;
1900 	if (state->nbTrans != 1)
1901 	    continue;
1902        if (state->type == XML_REGEXP_UNREACH_STATE ||
1903            state->type == XML_REGEXP_FINAL_STATE)
1904 	    continue;
1905 	/* is the only transition out a basic transition */
1906 	if ((state->trans[0].atom == NULL) &&
1907 	    (state->trans[0].to >= 0) &&
1908 	    (state->trans[0].to != statenr) &&
1909 	    (state->trans[0].counter < 0) &&
1910 	    (state->trans[0].count < 0)) {
1911 	    newto = state->trans[0].to;
1912 
1913             if (state->type == XML_REGEXP_START_STATE) {
1914 #ifdef DEBUG_REGEXP_GRAPH
1915 		printf("Found simple epsilon trans from start %d to %d\n",
1916 		       statenr, newto);
1917 #endif
1918             } else {
1919 #ifdef DEBUG_REGEXP_GRAPH
1920 		printf("Found simple epsilon trans from %d to %d\n",
1921 		       statenr, newto);
1922 #endif
1923 	        for (i = 0;i < state->nbTransTo;i++) {
1924 		    tmp = ctxt->states[state->transTo[i]];
1925 		    for (j = 0;j < tmp->nbTrans;j++) {
1926 			if (tmp->trans[j].to == statenr) {
1927 #ifdef DEBUG_REGEXP_GRAPH
1928 			    printf("Changed transition %d on %d to go to %d\n",
1929 				   j, tmp->no, newto);
1930 #endif
1931 			    tmp->trans[j].to = -1;
1932 			    xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom,
1933 						ctxt->states[newto],
1934 					        tmp->trans[j].counter,
1935 						tmp->trans[j].count);
1936 			}
1937 		    }
1938 		}
1939 		if (state->type == XML_REGEXP_FINAL_STATE)
1940 		    ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE;
1941 		/* eliminate the transition completely */
1942 		state->nbTrans = 0;
1943 
1944                 state->type = XML_REGEXP_UNREACH_STATE;
1945 
1946 	    }
1947 
1948 	}
1949     }
1950 }
1951 /**
1952  * xmlFAEliminateEpsilonTransitions:
1953  * @ctxt:  a regexp parser context
1954  *
1955  */
1956 static void
xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt)1957 xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1958     int statenr, transnr;
1959     xmlRegStatePtr state;
1960     int has_epsilon;
1961 
1962     if (ctxt->states == NULL) return;
1963 
1964     /*
1965      * Eliminate simple epsilon transition and the associated unreachable
1966      * states.
1967      */
1968     xmlFAEliminateSimpleEpsilonTransitions(ctxt);
1969     for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1970 	state = ctxt->states[statenr];
1971 	if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) {
1972 #ifdef DEBUG_REGEXP_GRAPH
1973 	    printf("Removed unreachable state %d\n", statenr);
1974 #endif
1975 	    xmlRegFreeState(state);
1976 	    ctxt->states[statenr] = NULL;
1977 	}
1978     }
1979 
1980     has_epsilon = 0;
1981 
1982     /*
1983      * Build the completed transitions bypassing the epsilons
1984      * Use a marking algorithm to avoid loops
1985      * Mark sink states too.
1986      * Process from the latest states backward to the start when
1987      * there is long cascading epsilon chains this minimize the
1988      * recursions and transition compares when adding the new ones
1989      */
1990     for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) {
1991 	state = ctxt->states[statenr];
1992 	if (state == NULL)
1993 	    continue;
1994 	if ((state->nbTrans == 0) &&
1995 	    (state->type != XML_REGEXP_FINAL_STATE)) {
1996 	    state->type = XML_REGEXP_SINK_STATE;
1997 	}
1998 	for (transnr = 0;transnr < state->nbTrans;transnr++) {
1999 	    if ((state->trans[transnr].atom == NULL) &&
2000 		(state->trans[transnr].to >= 0)) {
2001 		if (state->trans[transnr].to == statenr) {
2002 		    state->trans[transnr].to = -1;
2003 #ifdef DEBUG_REGEXP_GRAPH
2004 		    printf("Removed loopback epsilon trans %d on %d\n",
2005 			   transnr, statenr);
2006 #endif
2007 		} else if (state->trans[transnr].count < 0) {
2008 		    int newto = state->trans[transnr].to;
2009 
2010 #ifdef DEBUG_REGEXP_GRAPH
2011 		    printf("Found epsilon trans %d from %d to %d\n",
2012 			   transnr, statenr, newto);
2013 #endif
2014 		    has_epsilon = 1;
2015 		    state->trans[transnr].to = -2;
2016 		    state->mark = XML_REGEXP_MARK_START;
2017 		    xmlFAReduceEpsilonTransitions(ctxt, statenr,
2018 				      newto, state->trans[transnr].counter);
2019 		    state->mark = XML_REGEXP_MARK_NORMAL;
2020 #ifdef DEBUG_REGEXP_GRAPH
2021 		} else {
2022 		    printf("Found counted transition %d on %d\n",
2023 			   transnr, statenr);
2024 #endif
2025 	        }
2026 	    }
2027 	}
2028     }
2029     /*
2030      * Eliminate the epsilon transitions
2031      */
2032     if (has_epsilon) {
2033 	for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2034 	    state = ctxt->states[statenr];
2035 	    if (state == NULL)
2036 		continue;
2037 	    for (transnr = 0;transnr < state->nbTrans;transnr++) {
2038 		xmlRegTransPtr trans = &(state->trans[transnr]);
2039 		if ((trans->atom == NULL) &&
2040 		    (trans->count < 0) &&
2041 		    (trans->to >= 0)) {
2042 		    trans->to = -1;
2043 		}
2044 	    }
2045 	}
2046     }
2047 
2048     /*
2049      * Use this pass to detect unreachable states too
2050      */
2051     for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2052 	state = ctxt->states[statenr];
2053 	if (state != NULL)
2054 	    state->reached = XML_REGEXP_MARK_NORMAL;
2055     }
2056     state = ctxt->states[0];
2057     if (state != NULL)
2058 	state->reached = XML_REGEXP_MARK_START;
2059     while (state != NULL) {
2060 	xmlRegStatePtr target = NULL;
2061 	state->reached = XML_REGEXP_MARK_VISITED;
2062 	/*
2063 	 * Mark all states reachable from the current reachable state
2064 	 */
2065 	for (transnr = 0;transnr < state->nbTrans;transnr++) {
2066 	    if ((state->trans[transnr].to >= 0) &&
2067 		((state->trans[transnr].atom != NULL) ||
2068 		 (state->trans[transnr].count >= 0))) {
2069 		int newto = state->trans[transnr].to;
2070 
2071 		if (ctxt->states[newto] == NULL)
2072 		    continue;
2073 		if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) {
2074 		    ctxt->states[newto]->reached = XML_REGEXP_MARK_START;
2075 		    target = ctxt->states[newto];
2076 		}
2077 	    }
2078 	}
2079 
2080 	/*
2081 	 * find the next accessible state not explored
2082 	 */
2083 	if (target == NULL) {
2084 	    for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
2085 		state = ctxt->states[statenr];
2086 		if ((state != NULL) && (state->reached ==
2087 			XML_REGEXP_MARK_START)) {
2088 		    target = state;
2089 		    break;
2090 		}
2091 	    }
2092 	}
2093 	state = target;
2094     }
2095     for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2096 	state = ctxt->states[statenr];
2097 	if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) {
2098 #ifdef DEBUG_REGEXP_GRAPH
2099 	    printf("Removed unreachable state %d\n", statenr);
2100 #endif
2101 	    xmlRegFreeState(state);
2102 	    ctxt->states[statenr] = NULL;
2103 	}
2104     }
2105 
2106 }
2107 
2108 static int
xmlFACompareRanges(xmlRegRangePtr range1,xmlRegRangePtr range2)2109 xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) {
2110     int ret = 0;
2111 
2112     if ((range1->type == XML_REGEXP_RANGES) ||
2113         (range2->type == XML_REGEXP_RANGES) ||
2114         (range2->type == XML_REGEXP_SUBREG) ||
2115         (range1->type == XML_REGEXP_SUBREG) ||
2116         (range1->type == XML_REGEXP_STRING) ||
2117         (range2->type == XML_REGEXP_STRING))
2118 	return(-1);
2119 
2120     /* put them in order */
2121     if (range1->type > range2->type) {
2122         xmlRegRangePtr tmp;
2123 
2124 	tmp = range1;
2125 	range1 = range2;
2126 	range2 = tmp;
2127     }
2128     if ((range1->type == XML_REGEXP_ANYCHAR) ||
2129         (range2->type == XML_REGEXP_ANYCHAR)) {
2130 	ret = 1;
2131     } else if ((range1->type == XML_REGEXP_EPSILON) ||
2132                (range2->type == XML_REGEXP_EPSILON)) {
2133 	return(0);
2134     } else if (range1->type == range2->type) {
2135         if (range1->type != XML_REGEXP_CHARVAL)
2136             ret = 1;
2137         else if ((range1->end < range2->start) ||
2138 	         (range2->end < range1->start))
2139 	    ret = 0;
2140 	else
2141 	    ret = 1;
2142     } else if (range1->type == XML_REGEXP_CHARVAL) {
2143         int codepoint;
2144 	int neg = 0;
2145 
2146 	/*
2147 	 * just check all codepoints in the range for acceptance,
2148 	 * this is usually way cheaper since done only once at
2149 	 * compilation than testing over and over at runtime or
2150 	 * pushing too many states when evaluating.
2151 	 */
2152 	if (((range1->neg == 0) && (range2->neg != 0)) ||
2153 	    ((range1->neg != 0) && (range2->neg == 0)))
2154 	    neg = 1;
2155 
2156 	for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) {
2157 	    ret = xmlRegCheckCharacterRange(range2->type, codepoint,
2158 					    0, range2->start, range2->end,
2159 					    range2->blockName);
2160 	    if (ret < 0)
2161 	        return(-1);
2162 	    if (((neg == 1) && (ret == 0)) ||
2163 	        ((neg == 0) && (ret == 1)))
2164 		return(1);
2165 	}
2166 	return(0);
2167     } else if ((range1->type == XML_REGEXP_BLOCK_NAME) ||
2168                (range2->type == XML_REGEXP_BLOCK_NAME)) {
2169 	if (range1->type == range2->type) {
2170 	    ret = xmlStrEqual(range1->blockName, range2->blockName);
2171 	} else {
2172 	    /*
2173 	     * comparing a block range with anything else is way
2174 	     * too costly, and maintaining the table is like too much
2175 	     * memory too, so let's force the automata to save state
2176 	     * here.
2177 	     */
2178 	    return(1);
2179 	}
2180     } else if ((range1->type < XML_REGEXP_LETTER) ||
2181                (range2->type < XML_REGEXP_LETTER)) {
2182 	if ((range1->type == XML_REGEXP_ANYSPACE) &&
2183 	    (range2->type == XML_REGEXP_NOTSPACE))
2184 	    ret = 0;
2185 	else if ((range1->type == XML_REGEXP_INITNAME) &&
2186 	         (range2->type == XML_REGEXP_NOTINITNAME))
2187 	    ret = 0;
2188 	else if ((range1->type == XML_REGEXP_NAMECHAR) &&
2189 	         (range2->type == XML_REGEXP_NOTNAMECHAR))
2190 	    ret = 0;
2191 	else if ((range1->type == XML_REGEXP_DECIMAL) &&
2192 	         (range2->type == XML_REGEXP_NOTDECIMAL))
2193 	    ret = 0;
2194 	else if ((range1->type == XML_REGEXP_REALCHAR) &&
2195 	         (range2->type == XML_REGEXP_NOTREALCHAR))
2196 	    ret = 0;
2197 	else {
2198 	    /* same thing to limit complexity */
2199 	    return(1);
2200 	}
2201     } else {
2202         ret = 0;
2203         /* range1->type < range2->type here */
2204         switch (range1->type) {
2205 	    case XML_REGEXP_LETTER:
2206 	         /* all disjoint except in the subgroups */
2207 	         if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) ||
2208 		     (range2->type == XML_REGEXP_LETTER_LOWERCASE) ||
2209 		     (range2->type == XML_REGEXP_LETTER_TITLECASE) ||
2210 		     (range2->type == XML_REGEXP_LETTER_MODIFIER) ||
2211 		     (range2->type == XML_REGEXP_LETTER_OTHERS))
2212 		     ret = 1;
2213 		 break;
2214 	    case XML_REGEXP_MARK:
2215 	         if ((range2->type == XML_REGEXP_MARK_NONSPACING) ||
2216 		     (range2->type == XML_REGEXP_MARK_SPACECOMBINING) ||
2217 		     (range2->type == XML_REGEXP_MARK_ENCLOSING))
2218 		     ret = 1;
2219 		 break;
2220 	    case XML_REGEXP_NUMBER:
2221 	         if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) ||
2222 		     (range2->type == XML_REGEXP_NUMBER_LETTER) ||
2223 		     (range2->type == XML_REGEXP_NUMBER_OTHERS))
2224 		     ret = 1;
2225 		 break;
2226 	    case XML_REGEXP_PUNCT:
2227 	         if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) ||
2228 		     (range2->type == XML_REGEXP_PUNCT_DASH) ||
2229 		     (range2->type == XML_REGEXP_PUNCT_OPEN) ||
2230 		     (range2->type == XML_REGEXP_PUNCT_CLOSE) ||
2231 		     (range2->type == XML_REGEXP_PUNCT_INITQUOTE) ||
2232 		     (range2->type == XML_REGEXP_PUNCT_FINQUOTE) ||
2233 		     (range2->type == XML_REGEXP_PUNCT_OTHERS))
2234 		     ret = 1;
2235 		 break;
2236 	    case XML_REGEXP_SEPAR:
2237 	         if ((range2->type == XML_REGEXP_SEPAR_SPACE) ||
2238 		     (range2->type == XML_REGEXP_SEPAR_LINE) ||
2239 		     (range2->type == XML_REGEXP_SEPAR_PARA))
2240 		     ret = 1;
2241 		 break;
2242 	    case XML_REGEXP_SYMBOL:
2243 	         if ((range2->type == XML_REGEXP_SYMBOL_MATH) ||
2244 		     (range2->type == XML_REGEXP_SYMBOL_CURRENCY) ||
2245 		     (range2->type == XML_REGEXP_SYMBOL_MODIFIER) ||
2246 		     (range2->type == XML_REGEXP_SYMBOL_OTHERS))
2247 		     ret = 1;
2248 		 break;
2249 	    case XML_REGEXP_OTHER:
2250 	         if ((range2->type == XML_REGEXP_OTHER_CONTROL) ||
2251 		     (range2->type == XML_REGEXP_OTHER_FORMAT) ||
2252 		     (range2->type == XML_REGEXP_OTHER_PRIVATE))
2253 		     ret = 1;
2254 		 break;
2255             default:
2256 	         if ((range2->type >= XML_REGEXP_LETTER) &&
2257 		     (range2->type < XML_REGEXP_BLOCK_NAME))
2258 		     ret = 0;
2259 		 else {
2260 		     /* safety net ! */
2261 		     return(1);
2262 		 }
2263 	}
2264     }
2265     if (((range1->neg == 0) && (range2->neg != 0)) ||
2266         ((range1->neg != 0) && (range2->neg == 0)))
2267 	ret = !ret;
2268     return(ret);
2269 }
2270 
2271 /**
2272  * xmlFACompareAtomTypes:
2273  * @type1:  an atom type
2274  * @type2:  an atom type
2275  *
2276  * Compares two atoms type to check whether they intersect in some ways,
2277  * this is used by xmlFACompareAtoms only
2278  *
2279  * Returns 1 if they may intersect and 0 otherwise
2280  */
2281 static int
xmlFACompareAtomTypes(xmlRegAtomType type1,xmlRegAtomType type2)2282 xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) {
2283     if ((type1 == XML_REGEXP_EPSILON) ||
2284         (type1 == XML_REGEXP_CHARVAL) ||
2285 	(type1 == XML_REGEXP_RANGES) ||
2286 	(type1 == XML_REGEXP_SUBREG) ||
2287 	(type1 == XML_REGEXP_STRING) ||
2288 	(type1 == XML_REGEXP_ANYCHAR))
2289 	return(1);
2290     if ((type2 == XML_REGEXP_EPSILON) ||
2291         (type2 == XML_REGEXP_CHARVAL) ||
2292 	(type2 == XML_REGEXP_RANGES) ||
2293 	(type2 == XML_REGEXP_SUBREG) ||
2294 	(type2 == XML_REGEXP_STRING) ||
2295 	(type2 == XML_REGEXP_ANYCHAR))
2296 	return(1);
2297 
2298     if (type1 == type2) return(1);
2299 
2300     /* simplify subsequent compares by making sure type1 < type2 */
2301     if (type1 > type2) {
2302         xmlRegAtomType tmp = type1;
2303 	type1 = type2;
2304 	type2 = tmp;
2305     }
2306     switch (type1) {
2307         case XML_REGEXP_ANYSPACE: /* \s */
2308 	    /* can't be a letter, number, mark, punctuation, symbol */
2309 	    if ((type2 == XML_REGEXP_NOTSPACE) ||
2310 		((type2 >= XML_REGEXP_LETTER) &&
2311 		 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2312 	        ((type2 >= XML_REGEXP_NUMBER) &&
2313 		 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2314 	        ((type2 >= XML_REGEXP_MARK) &&
2315 		 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2316 	        ((type2 >= XML_REGEXP_PUNCT) &&
2317 		 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2318 	        ((type2 >= XML_REGEXP_SYMBOL) &&
2319 		 (type2 <= XML_REGEXP_SYMBOL_OTHERS))
2320 	        ) return(0);
2321 	    break;
2322         case XML_REGEXP_NOTSPACE: /* \S */
2323 	    break;
2324         case XML_REGEXP_INITNAME: /* \l */
2325 	    /* can't be a number, mark, separator, punctuation, symbol or other */
2326 	    if ((type2 == XML_REGEXP_NOTINITNAME) ||
2327 	        ((type2 >= XML_REGEXP_NUMBER) &&
2328 		 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2329 	        ((type2 >= XML_REGEXP_MARK) &&
2330 		 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2331 	        ((type2 >= XML_REGEXP_SEPAR) &&
2332 		 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2333 	        ((type2 >= XML_REGEXP_PUNCT) &&
2334 		 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2335 	        ((type2 >= XML_REGEXP_SYMBOL) &&
2336 		 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2337 	        ((type2 >= XML_REGEXP_OTHER) &&
2338 		 (type2 <= XML_REGEXP_OTHER_NA))
2339 		) return(0);
2340 	    break;
2341         case XML_REGEXP_NOTINITNAME: /* \L */
2342 	    break;
2343         case XML_REGEXP_NAMECHAR: /* \c */
2344 	    /* can't be a mark, separator, punctuation, symbol or other */
2345 	    if ((type2 == XML_REGEXP_NOTNAMECHAR) ||
2346 	        ((type2 >= XML_REGEXP_MARK) &&
2347 		 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2348 	        ((type2 >= XML_REGEXP_PUNCT) &&
2349 		 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2350 	        ((type2 >= XML_REGEXP_SEPAR) &&
2351 		 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2352 	        ((type2 >= XML_REGEXP_SYMBOL) &&
2353 		 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2354 	        ((type2 >= XML_REGEXP_OTHER) &&
2355 		 (type2 <= XML_REGEXP_OTHER_NA))
2356 		) return(0);
2357 	    break;
2358         case XML_REGEXP_NOTNAMECHAR: /* \C */
2359 	    break;
2360         case XML_REGEXP_DECIMAL: /* \d */
2361 	    /* can't be a letter, mark, separator, punctuation, symbol or other */
2362 	    if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2363 	        (type2 == XML_REGEXP_REALCHAR) ||
2364 		((type2 >= XML_REGEXP_LETTER) &&
2365 		 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2366 	        ((type2 >= XML_REGEXP_MARK) &&
2367 		 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2368 	        ((type2 >= XML_REGEXP_PUNCT) &&
2369 		 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2370 	        ((type2 >= XML_REGEXP_SEPAR) &&
2371 		 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2372 	        ((type2 >= XML_REGEXP_SYMBOL) &&
2373 		 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2374 	        ((type2 >= XML_REGEXP_OTHER) &&
2375 		 (type2 <= XML_REGEXP_OTHER_NA))
2376 		)return(0);
2377 	    break;
2378         case XML_REGEXP_NOTDECIMAL: /* \D */
2379 	    break;
2380         case XML_REGEXP_REALCHAR: /* \w */
2381 	    /* can't be a mark, separator, punctuation, symbol or other */
2382 	    if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2383 	        ((type2 >= XML_REGEXP_MARK) &&
2384 		 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2385 	        ((type2 >= XML_REGEXP_PUNCT) &&
2386 		 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2387 	        ((type2 >= XML_REGEXP_SEPAR) &&
2388 		 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2389 	        ((type2 >= XML_REGEXP_SYMBOL) &&
2390 		 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2391 	        ((type2 >= XML_REGEXP_OTHER) &&
2392 		 (type2 <= XML_REGEXP_OTHER_NA))
2393 		)return(0);
2394 	    break;
2395         case XML_REGEXP_NOTREALCHAR: /* \W */
2396 	    break;
2397 	/*
2398 	 * at that point we know both type 1 and type2 are from
2399 	 * character categories are ordered and are different,
2400 	 * it becomes simple because this is a partition
2401 	 */
2402         case XML_REGEXP_LETTER:
2403 	    if (type2 <= XML_REGEXP_LETTER_OTHERS)
2404 	        return(1);
2405 	    return(0);
2406         case XML_REGEXP_LETTER_UPPERCASE:
2407         case XML_REGEXP_LETTER_LOWERCASE:
2408         case XML_REGEXP_LETTER_TITLECASE:
2409         case XML_REGEXP_LETTER_MODIFIER:
2410         case XML_REGEXP_LETTER_OTHERS:
2411 	    return(0);
2412         case XML_REGEXP_MARK:
2413 	    if (type2 <= XML_REGEXP_MARK_ENCLOSING)
2414 	        return(1);
2415 	    return(0);
2416         case XML_REGEXP_MARK_NONSPACING:
2417         case XML_REGEXP_MARK_SPACECOMBINING:
2418         case XML_REGEXP_MARK_ENCLOSING:
2419 	    return(0);
2420         case XML_REGEXP_NUMBER:
2421 	    if (type2 <= XML_REGEXP_NUMBER_OTHERS)
2422 	        return(1);
2423 	    return(0);
2424         case XML_REGEXP_NUMBER_DECIMAL:
2425         case XML_REGEXP_NUMBER_LETTER:
2426         case XML_REGEXP_NUMBER_OTHERS:
2427 	    return(0);
2428         case XML_REGEXP_PUNCT:
2429 	    if (type2 <= XML_REGEXP_PUNCT_OTHERS)
2430 	        return(1);
2431 	    return(0);
2432         case XML_REGEXP_PUNCT_CONNECTOR:
2433         case XML_REGEXP_PUNCT_DASH:
2434         case XML_REGEXP_PUNCT_OPEN:
2435         case XML_REGEXP_PUNCT_CLOSE:
2436         case XML_REGEXP_PUNCT_INITQUOTE:
2437         case XML_REGEXP_PUNCT_FINQUOTE:
2438         case XML_REGEXP_PUNCT_OTHERS:
2439 	    return(0);
2440         case XML_REGEXP_SEPAR:
2441 	    if (type2 <= XML_REGEXP_SEPAR_PARA)
2442 	        return(1);
2443 	    return(0);
2444         case XML_REGEXP_SEPAR_SPACE:
2445         case XML_REGEXP_SEPAR_LINE:
2446         case XML_REGEXP_SEPAR_PARA:
2447 	    return(0);
2448         case XML_REGEXP_SYMBOL:
2449 	    if (type2 <= XML_REGEXP_SYMBOL_OTHERS)
2450 	        return(1);
2451 	    return(0);
2452         case XML_REGEXP_SYMBOL_MATH:
2453         case XML_REGEXP_SYMBOL_CURRENCY:
2454         case XML_REGEXP_SYMBOL_MODIFIER:
2455         case XML_REGEXP_SYMBOL_OTHERS:
2456 	    return(0);
2457         case XML_REGEXP_OTHER:
2458 	    if (type2 <= XML_REGEXP_OTHER_NA)
2459 	        return(1);
2460 	    return(0);
2461         case XML_REGEXP_OTHER_CONTROL:
2462         case XML_REGEXP_OTHER_FORMAT:
2463         case XML_REGEXP_OTHER_PRIVATE:
2464         case XML_REGEXP_OTHER_NA:
2465 	    return(0);
2466 	default:
2467 	    break;
2468     }
2469     return(1);
2470 }
2471 
2472 /**
2473  * xmlFAEqualAtoms:
2474  * @atom1:  an atom
2475  * @atom2:  an atom
2476  * @deep: if not set only compare string pointers
2477  *
2478  * Compares two atoms to check whether they are the same exactly
2479  * this is used to remove equivalent transitions
2480  *
2481  * Returns 1 if same and 0 otherwise
2482  */
2483 static int
xmlFAEqualAtoms(xmlRegAtomPtr atom1,xmlRegAtomPtr atom2,int deep)2484 xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
2485     int ret = 0;
2486 
2487     if (atom1 == atom2)
2488 	return(1);
2489     if ((atom1 == NULL) || (atom2 == NULL))
2490 	return(0);
2491 
2492     if (atom1->type != atom2->type)
2493         return(0);
2494     switch (atom1->type) {
2495         case XML_REGEXP_EPSILON:
2496 	    ret = 0;
2497 	    break;
2498         case XML_REGEXP_STRING:
2499             if (!deep)
2500                 ret = (atom1->valuep == atom2->valuep);
2501             else
2502                 ret = xmlStrEqual((xmlChar *)atom1->valuep,
2503                                   (xmlChar *)atom2->valuep);
2504 	    break;
2505         case XML_REGEXP_CHARVAL:
2506 	    ret = (atom1->codepoint == atom2->codepoint);
2507 	    break;
2508 	case XML_REGEXP_RANGES:
2509 	    /* too hard to do in the general case */
2510 	    ret = 0;
2511 	default:
2512 	    break;
2513     }
2514     return(ret);
2515 }
2516 
2517 /**
2518  * xmlFACompareAtoms:
2519  * @atom1:  an atom
2520  * @atom2:  an atom
2521  * @deep: if not set only compare string pointers
2522  *
2523  * Compares two atoms to check whether they intersect in some ways,
2524  * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only
2525  *
2526  * Returns 1 if yes and 0 otherwise
2527  */
2528 static int
xmlFACompareAtoms(xmlRegAtomPtr atom1,xmlRegAtomPtr atom2,int deep)2529 xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
2530     int ret = 1;
2531 
2532     if (atom1 == atom2)
2533 	return(1);
2534     if ((atom1 == NULL) || (atom2 == NULL))
2535 	return(0);
2536 
2537     if ((atom1->type == XML_REGEXP_ANYCHAR) ||
2538         (atom2->type == XML_REGEXP_ANYCHAR))
2539 	return(1);
2540 
2541     if (atom1->type > atom2->type) {
2542 	xmlRegAtomPtr tmp;
2543 	tmp = atom1;
2544 	atom1 = atom2;
2545 	atom2 = tmp;
2546     }
2547     if (atom1->type != atom2->type) {
2548         ret = xmlFACompareAtomTypes(atom1->type, atom2->type);
2549 	/* if they can't intersect at the type level break now */
2550 	if (ret == 0)
2551 	    return(0);
2552     }
2553     switch (atom1->type) {
2554         case XML_REGEXP_STRING:
2555             if (!deep)
2556                 ret = (atom1->valuep != atom2->valuep);
2557             else {
2558                 xmlChar *val1 = (xmlChar *)atom1->valuep;
2559                 xmlChar *val2 = (xmlChar *)atom2->valuep;
2560                 int compound1 = (xmlStrchr(val1, '|') != NULL);
2561                 int compound2 = (xmlStrchr(val2, '|') != NULL);
2562 
2563                 /* Ignore negative match flag for ##other namespaces */
2564                 if (compound1 != compound2)
2565                     return(0);
2566 
2567                 ret = xmlRegStrEqualWildcard(val1, val2);
2568             }
2569 	    break;
2570         case XML_REGEXP_EPSILON:
2571 	    goto not_determinist;
2572         case XML_REGEXP_CHARVAL:
2573 	    if (atom2->type == XML_REGEXP_CHARVAL) {
2574 		ret = (atom1->codepoint == atom2->codepoint);
2575 	    } else {
2576 	        ret = xmlRegCheckCharacter(atom2, atom1->codepoint);
2577 		if (ret < 0)
2578 		    ret = 1;
2579 	    }
2580 	    break;
2581         case XML_REGEXP_RANGES:
2582 	    if (atom2->type == XML_REGEXP_RANGES) {
2583 	        int i, j, res;
2584 		xmlRegRangePtr r1, r2;
2585 
2586 		/*
2587 		 * need to check that none of the ranges eventually matches
2588 		 */
2589 		for (i = 0;i < atom1->nbRanges;i++) {
2590 		    for (j = 0;j < atom2->nbRanges;j++) {
2591 			r1 = atom1->ranges[i];
2592 			r2 = atom2->ranges[j];
2593 			res = xmlFACompareRanges(r1, r2);
2594 			if (res == 1) {
2595 			    ret = 1;
2596 			    goto done;
2597 			}
2598 		    }
2599 		}
2600 		ret = 0;
2601 	    }
2602 	    break;
2603 	default:
2604 	    goto not_determinist;
2605     }
2606 done:
2607     if (atom1->neg != atom2->neg) {
2608         ret = !ret;
2609     }
2610     if (ret == 0)
2611         return(0);
2612 not_determinist:
2613     return(1);
2614 }
2615 
2616 /**
2617  * xmlFARecurseDeterminism:
2618  * @ctxt:  a regexp parser context
2619  *
2620  * Check whether the associated regexp is determinist,
2621  * should be called after xmlFAEliminateEpsilonTransitions()
2622  *
2623  */
2624 static int
xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr state,int to,xmlRegAtomPtr atom)2625 xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
2626 	                 int to, xmlRegAtomPtr atom) {
2627     int ret = 1;
2628     int res;
2629     int transnr, nbTrans;
2630     xmlRegTransPtr t1;
2631     int deep = 1;
2632 
2633     if (state == NULL)
2634 	return(ret);
2635     if (state->markd == XML_REGEXP_MARK_VISITED)
2636 	return(ret);
2637 
2638     if (ctxt->flags & AM_AUTOMATA_RNG)
2639         deep = 0;
2640 
2641     /*
2642      * don't recurse on transitions potentially added in the course of
2643      * the elimination.
2644      */
2645     nbTrans = state->nbTrans;
2646     for (transnr = 0;transnr < nbTrans;transnr++) {
2647 	t1 = &(state->trans[transnr]);
2648 	/*
2649 	 * check transitions conflicting with the one looked at
2650 	 */
2651 	if (t1->atom == NULL) {
2652 	    if (t1->to < 0)
2653 		continue;
2654 	    state->markd = XML_REGEXP_MARK_VISITED;
2655 	    res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2656 		                           to, atom);
2657 	    if (res == 0) {
2658 	        ret = 0;
2659 		/* t1->nd = 1; */
2660 	    }
2661 	    continue;
2662 	}
2663 	if (t1->to != to)
2664 	    continue;
2665 	if (xmlFACompareAtoms(t1->atom, atom, deep)) {
2666 	    ret = 0;
2667 	    /* mark the transition as non-deterministic */
2668 	    t1->nd = 1;
2669 	}
2670     }
2671     return(ret);
2672 }
2673 
2674 /**
2675  * xmlFAFinishRecurseDeterminism:
2676  * @ctxt:  a regexp parser context
2677  *
2678  * Reset flags after checking determinism.
2679  */
2680 static void
xmlFAFinishRecurseDeterminism(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr state)2681 xmlFAFinishRecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
2682     int transnr, nbTrans;
2683 
2684     if (state == NULL)
2685 	return;
2686     if (state->markd != XML_REGEXP_MARK_VISITED)
2687 	return;
2688     state->markd = 0;
2689 
2690     nbTrans = state->nbTrans;
2691     for (transnr = 0; transnr < nbTrans; transnr++) {
2692 	xmlRegTransPtr t1 = &state->trans[transnr];
2693 	if ((t1->atom == NULL) && (t1->to >= 0))
2694 	    xmlFAFinishRecurseDeterminism(ctxt, ctxt->states[t1->to]);
2695     }
2696 }
2697 
2698 /**
2699  * xmlFAComputesDeterminism:
2700  * @ctxt:  a regexp parser context
2701  *
2702  * Check whether the associated regexp is determinist,
2703  * should be called after xmlFAEliminateEpsilonTransitions()
2704  *
2705  */
2706 static int
xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt)2707 xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
2708     int statenr, transnr;
2709     xmlRegStatePtr state;
2710     xmlRegTransPtr t1, t2, last;
2711     int i;
2712     int ret = 1;
2713     int deep = 1;
2714 
2715 #ifdef DEBUG_REGEXP_GRAPH
2716     printf("xmlFAComputesDeterminism\n");
2717     xmlRegPrintCtxt(stdout, ctxt);
2718 #endif
2719     if (ctxt->determinist != -1)
2720 	return(ctxt->determinist);
2721 
2722     if (ctxt->flags & AM_AUTOMATA_RNG)
2723         deep = 0;
2724 
2725     /*
2726      * First cleanup the automata removing cancelled transitions
2727      */
2728     for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2729 	state = ctxt->states[statenr];
2730 	if (state == NULL)
2731 	    continue;
2732 	if (state->nbTrans < 2)
2733 	    continue;
2734 	for (transnr = 0;transnr < state->nbTrans;transnr++) {
2735 	    t1 = &(state->trans[transnr]);
2736 	    /*
2737 	     * Determinism checks in case of counted or all transitions
2738 	     * will have to be handled separately
2739 	     */
2740 	    if (t1->atom == NULL) {
2741 		/* t1->nd = 1; */
2742 		continue;
2743 	    }
2744 	    if (t1->to == -1) /* eliminated */
2745 		continue;
2746 	    for (i = 0;i < transnr;i++) {
2747 		t2 = &(state->trans[i]);
2748 		if (t2->to == -1) /* eliminated */
2749 		    continue;
2750 		if (t2->atom != NULL) {
2751 		    if (t1->to == t2->to) {
2752                         /*
2753                          * Here we use deep because we want to keep the
2754                          * transitions which indicate a conflict
2755                          */
2756 			if (xmlFAEqualAtoms(t1->atom, t2->atom, deep) &&
2757                             (t1->counter == t2->counter) &&
2758                             (t1->count == t2->count))
2759 			    t2->to = -1; /* eliminated */
2760 		    }
2761 		}
2762 	    }
2763 	}
2764     }
2765 
2766     /*
2767      * Check for all states that there aren't 2 transitions
2768      * with the same atom and a different target.
2769      */
2770     for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2771 	state = ctxt->states[statenr];
2772 	if (state == NULL)
2773 	    continue;
2774 	if (state->nbTrans < 2)
2775 	    continue;
2776 	last = NULL;
2777 	for (transnr = 0;transnr < state->nbTrans;transnr++) {
2778 	    t1 = &(state->trans[transnr]);
2779 	    /*
2780 	     * Determinism checks in case of counted or all transitions
2781 	     * will have to be handled separately
2782 	     */
2783 	    if (t1->atom == NULL) {
2784 		continue;
2785 	    }
2786 	    if (t1->to == -1) /* eliminated */
2787 		continue;
2788 	    for (i = 0;i < transnr;i++) {
2789 		t2 = &(state->trans[i]);
2790 		if (t2->to == -1) /* eliminated */
2791 		    continue;
2792 		if (t2->atom != NULL) {
2793                     /*
2794                      * But here we don't use deep because we want to
2795                      * find transitions which indicate a conflict
2796                      */
2797 		    if (xmlFACompareAtoms(t1->atom, t2->atom, 1)) {
2798 			ret = 0;
2799 			/* mark the transitions as non-deterministic ones */
2800 			t1->nd = 1;
2801 			t2->nd = 1;
2802 			last = t1;
2803 		    }
2804 		} else if (t1->to != -1) {
2805 		    /*
2806 		     * do the closure in case of remaining specific
2807 		     * epsilon transitions like choices or all
2808 		     */
2809 		    ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2810 						   t2->to, t2->atom);
2811                     xmlFAFinishRecurseDeterminism(ctxt, ctxt->states[t1->to]);
2812 		    /* don't shortcut the computation so all non deterministic
2813 		       transition get marked down
2814 		    if (ret == 0)
2815 			return(0);
2816 		     */
2817 		    if (ret == 0) {
2818 			t1->nd = 1;
2819 			/* t2->nd = 1; */
2820 			last = t1;
2821 		    }
2822 		}
2823 	    }
2824 	    /* don't shortcut the computation so all non deterministic
2825 	       transition get marked down
2826 	    if (ret == 0)
2827 		break; */
2828 	}
2829 
2830 	/*
2831 	 * mark specifically the last non-deterministic transition
2832 	 * from a state since there is no need to set-up rollback
2833 	 * from it
2834 	 */
2835 	if (last != NULL) {
2836 	    last->nd = 2;
2837 	}
2838 
2839 	/* don't shortcut the computation so all non deterministic
2840 	   transition get marked down
2841 	if (ret == 0)
2842 	    break; */
2843     }
2844 
2845     ctxt->determinist = ret;
2846     return(ret);
2847 }
2848 
2849 /************************************************************************
2850  *									*
2851  *	Routines to check input against transition atoms		*
2852  *									*
2853  ************************************************************************/
2854 
2855 static int
xmlRegCheckCharacterRange(xmlRegAtomType type,int codepoint,int neg,int start,int end,const xmlChar * blockName)2856 xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
2857 	                  int start, int end, const xmlChar *blockName) {
2858     int ret = 0;
2859 
2860     switch (type) {
2861         case XML_REGEXP_STRING:
2862         case XML_REGEXP_SUBREG:
2863         case XML_REGEXP_RANGES:
2864         case XML_REGEXP_EPSILON:
2865 	    return(-1);
2866         case XML_REGEXP_ANYCHAR:
2867 	    ret = ((codepoint != '\n') && (codepoint != '\r'));
2868 	    break;
2869         case XML_REGEXP_CHARVAL:
2870 	    ret = ((codepoint >= start) && (codepoint <= end));
2871 	    break;
2872         case XML_REGEXP_NOTSPACE:
2873 	    neg = !neg;
2874             /* Falls through. */
2875         case XML_REGEXP_ANYSPACE:
2876 	    ret = ((codepoint == '\n') || (codepoint == '\r') ||
2877 		   (codepoint == '\t') || (codepoint == ' '));
2878 	    break;
2879         case XML_REGEXP_NOTINITNAME:
2880 	    neg = !neg;
2881             /* Falls through. */
2882         case XML_REGEXP_INITNAME:
2883 	    ret = (IS_LETTER(codepoint) ||
2884 		   (codepoint == '_') || (codepoint == ':'));
2885 	    break;
2886         case XML_REGEXP_NOTNAMECHAR:
2887 	    neg = !neg;
2888             /* Falls through. */
2889         case XML_REGEXP_NAMECHAR:
2890 	    ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) ||
2891 		   (codepoint == '.') || (codepoint == '-') ||
2892 		   (codepoint == '_') || (codepoint == ':') ||
2893 		   IS_COMBINING(codepoint) || IS_EXTENDER(codepoint));
2894 	    break;
2895         case XML_REGEXP_NOTDECIMAL:
2896 	    neg = !neg;
2897             /* Falls through. */
2898         case XML_REGEXP_DECIMAL:
2899 	    ret = xmlUCSIsCatNd(codepoint);
2900 	    break;
2901         case XML_REGEXP_REALCHAR:
2902 	    neg = !neg;
2903             /* Falls through. */
2904         case XML_REGEXP_NOTREALCHAR:
2905 	    ret = xmlUCSIsCatP(codepoint);
2906 	    if (ret == 0)
2907 		ret = xmlUCSIsCatZ(codepoint);
2908 	    if (ret == 0)
2909 		ret = xmlUCSIsCatC(codepoint);
2910 	    break;
2911         case XML_REGEXP_LETTER:
2912 	    ret = xmlUCSIsCatL(codepoint);
2913 	    break;
2914         case XML_REGEXP_LETTER_UPPERCASE:
2915 	    ret = xmlUCSIsCatLu(codepoint);
2916 	    break;
2917         case XML_REGEXP_LETTER_LOWERCASE:
2918 	    ret = xmlUCSIsCatLl(codepoint);
2919 	    break;
2920         case XML_REGEXP_LETTER_TITLECASE:
2921 	    ret = xmlUCSIsCatLt(codepoint);
2922 	    break;
2923         case XML_REGEXP_LETTER_MODIFIER:
2924 	    ret = xmlUCSIsCatLm(codepoint);
2925 	    break;
2926         case XML_REGEXP_LETTER_OTHERS:
2927 	    ret = xmlUCSIsCatLo(codepoint);
2928 	    break;
2929         case XML_REGEXP_MARK:
2930 	    ret = xmlUCSIsCatM(codepoint);
2931 	    break;
2932         case XML_REGEXP_MARK_NONSPACING:
2933 	    ret = xmlUCSIsCatMn(codepoint);
2934 	    break;
2935         case XML_REGEXP_MARK_SPACECOMBINING:
2936 	    ret = xmlUCSIsCatMc(codepoint);
2937 	    break;
2938         case XML_REGEXP_MARK_ENCLOSING:
2939 	    ret = xmlUCSIsCatMe(codepoint);
2940 	    break;
2941         case XML_REGEXP_NUMBER:
2942 	    ret = xmlUCSIsCatN(codepoint);
2943 	    break;
2944         case XML_REGEXP_NUMBER_DECIMAL:
2945 	    ret = xmlUCSIsCatNd(codepoint);
2946 	    break;
2947         case XML_REGEXP_NUMBER_LETTER:
2948 	    ret = xmlUCSIsCatNl(codepoint);
2949 	    break;
2950         case XML_REGEXP_NUMBER_OTHERS:
2951 	    ret = xmlUCSIsCatNo(codepoint);
2952 	    break;
2953         case XML_REGEXP_PUNCT:
2954 	    ret = xmlUCSIsCatP(codepoint);
2955 	    break;
2956         case XML_REGEXP_PUNCT_CONNECTOR:
2957 	    ret = xmlUCSIsCatPc(codepoint);
2958 	    break;
2959         case XML_REGEXP_PUNCT_DASH:
2960 	    ret = xmlUCSIsCatPd(codepoint);
2961 	    break;
2962         case XML_REGEXP_PUNCT_OPEN:
2963 	    ret = xmlUCSIsCatPs(codepoint);
2964 	    break;
2965         case XML_REGEXP_PUNCT_CLOSE:
2966 	    ret = xmlUCSIsCatPe(codepoint);
2967 	    break;
2968         case XML_REGEXP_PUNCT_INITQUOTE:
2969 	    ret = xmlUCSIsCatPi(codepoint);
2970 	    break;
2971         case XML_REGEXP_PUNCT_FINQUOTE:
2972 	    ret = xmlUCSIsCatPf(codepoint);
2973 	    break;
2974         case XML_REGEXP_PUNCT_OTHERS:
2975 	    ret = xmlUCSIsCatPo(codepoint);
2976 	    break;
2977         case XML_REGEXP_SEPAR:
2978 	    ret = xmlUCSIsCatZ(codepoint);
2979 	    break;
2980         case XML_REGEXP_SEPAR_SPACE:
2981 	    ret = xmlUCSIsCatZs(codepoint);
2982 	    break;
2983         case XML_REGEXP_SEPAR_LINE:
2984 	    ret = xmlUCSIsCatZl(codepoint);
2985 	    break;
2986         case XML_REGEXP_SEPAR_PARA:
2987 	    ret = xmlUCSIsCatZp(codepoint);
2988 	    break;
2989         case XML_REGEXP_SYMBOL:
2990 	    ret = xmlUCSIsCatS(codepoint);
2991 	    break;
2992         case XML_REGEXP_SYMBOL_MATH:
2993 	    ret = xmlUCSIsCatSm(codepoint);
2994 	    break;
2995         case XML_REGEXP_SYMBOL_CURRENCY:
2996 	    ret = xmlUCSIsCatSc(codepoint);
2997 	    break;
2998         case XML_REGEXP_SYMBOL_MODIFIER:
2999 	    ret = xmlUCSIsCatSk(codepoint);
3000 	    break;
3001         case XML_REGEXP_SYMBOL_OTHERS:
3002 	    ret = xmlUCSIsCatSo(codepoint);
3003 	    break;
3004         case XML_REGEXP_OTHER:
3005 	    ret = xmlUCSIsCatC(codepoint);
3006 	    break;
3007         case XML_REGEXP_OTHER_CONTROL:
3008 	    ret = xmlUCSIsCatCc(codepoint);
3009 	    break;
3010         case XML_REGEXP_OTHER_FORMAT:
3011 	    ret = xmlUCSIsCatCf(codepoint);
3012 	    break;
3013         case XML_REGEXP_OTHER_PRIVATE:
3014 	    ret = xmlUCSIsCatCo(codepoint);
3015 	    break;
3016         case XML_REGEXP_OTHER_NA:
3017 	    /* ret = xmlUCSIsCatCn(codepoint); */
3018 	    /* Seems it doesn't exist anymore in recent Unicode releases */
3019 	    ret = 0;
3020 	    break;
3021         case XML_REGEXP_BLOCK_NAME:
3022 	    ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
3023 	    break;
3024     }
3025     if (neg)
3026 	return(!ret);
3027     return(ret);
3028 }
3029 
3030 static int
xmlRegCheckCharacter(xmlRegAtomPtr atom,int codepoint)3031 xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
3032     int i, ret = 0;
3033     xmlRegRangePtr range;
3034 
3035     if ((atom == NULL) || (!IS_CHAR(codepoint)))
3036 	return(-1);
3037 
3038     switch (atom->type) {
3039         case XML_REGEXP_SUBREG:
3040         case XML_REGEXP_EPSILON:
3041 	    return(-1);
3042         case XML_REGEXP_CHARVAL:
3043             return(codepoint == atom->codepoint);
3044         case XML_REGEXP_RANGES: {
3045 	    int accept = 0;
3046 
3047 	    for (i = 0;i < atom->nbRanges;i++) {
3048 		range = atom->ranges[i];
3049 		if (range->neg == 2) {
3050 		    ret = xmlRegCheckCharacterRange(range->type, codepoint,
3051 						0, range->start, range->end,
3052 						range->blockName);
3053 		    if (ret != 0)
3054 			return(0); /* excluded char */
3055 		} else if (range->neg) {
3056 		    ret = xmlRegCheckCharacterRange(range->type, codepoint,
3057 						0, range->start, range->end,
3058 						range->blockName);
3059 		    if (ret == 0)
3060 		        accept = 1;
3061 		    else
3062 		        return(0);
3063 		} else {
3064 		    ret = xmlRegCheckCharacterRange(range->type, codepoint,
3065 						0, range->start, range->end,
3066 						range->blockName);
3067 		    if (ret != 0)
3068 			accept = 1; /* might still be excluded */
3069 		}
3070 	    }
3071 	    return(accept);
3072 	}
3073         case XML_REGEXP_STRING:
3074 	    printf("TODO: XML_REGEXP_STRING\n");
3075 	    return(-1);
3076         case XML_REGEXP_ANYCHAR:
3077         case XML_REGEXP_ANYSPACE:
3078         case XML_REGEXP_NOTSPACE:
3079         case XML_REGEXP_INITNAME:
3080         case XML_REGEXP_NOTINITNAME:
3081         case XML_REGEXP_NAMECHAR:
3082         case XML_REGEXP_NOTNAMECHAR:
3083         case XML_REGEXP_DECIMAL:
3084         case XML_REGEXP_NOTDECIMAL:
3085         case XML_REGEXP_REALCHAR:
3086         case XML_REGEXP_NOTREALCHAR:
3087         case XML_REGEXP_LETTER:
3088         case XML_REGEXP_LETTER_UPPERCASE:
3089         case XML_REGEXP_LETTER_LOWERCASE:
3090         case XML_REGEXP_LETTER_TITLECASE:
3091         case XML_REGEXP_LETTER_MODIFIER:
3092         case XML_REGEXP_LETTER_OTHERS:
3093         case XML_REGEXP_MARK:
3094         case XML_REGEXP_MARK_NONSPACING:
3095         case XML_REGEXP_MARK_SPACECOMBINING:
3096         case XML_REGEXP_MARK_ENCLOSING:
3097         case XML_REGEXP_NUMBER:
3098         case XML_REGEXP_NUMBER_DECIMAL:
3099         case XML_REGEXP_NUMBER_LETTER:
3100         case XML_REGEXP_NUMBER_OTHERS:
3101         case XML_REGEXP_PUNCT:
3102         case XML_REGEXP_PUNCT_CONNECTOR:
3103         case XML_REGEXP_PUNCT_DASH:
3104         case XML_REGEXP_PUNCT_OPEN:
3105         case XML_REGEXP_PUNCT_CLOSE:
3106         case XML_REGEXP_PUNCT_INITQUOTE:
3107         case XML_REGEXP_PUNCT_FINQUOTE:
3108         case XML_REGEXP_PUNCT_OTHERS:
3109         case XML_REGEXP_SEPAR:
3110         case XML_REGEXP_SEPAR_SPACE:
3111         case XML_REGEXP_SEPAR_LINE:
3112         case XML_REGEXP_SEPAR_PARA:
3113         case XML_REGEXP_SYMBOL:
3114         case XML_REGEXP_SYMBOL_MATH:
3115         case XML_REGEXP_SYMBOL_CURRENCY:
3116         case XML_REGEXP_SYMBOL_MODIFIER:
3117         case XML_REGEXP_SYMBOL_OTHERS:
3118         case XML_REGEXP_OTHER:
3119         case XML_REGEXP_OTHER_CONTROL:
3120         case XML_REGEXP_OTHER_FORMAT:
3121         case XML_REGEXP_OTHER_PRIVATE:
3122         case XML_REGEXP_OTHER_NA:
3123 	case XML_REGEXP_BLOCK_NAME:
3124 	    ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
3125 		                            (const xmlChar *)atom->valuep);
3126 	    if (atom->neg)
3127 		ret = !ret;
3128 	    break;
3129     }
3130     return(ret);
3131 }
3132 
3133 /************************************************************************
3134  *									*
3135  *	Saving and restoring state of an execution context		*
3136  *									*
3137  ************************************************************************/
3138 
3139 #ifdef DEBUG_REGEXP_EXEC
3140 static void
xmlFARegDebugExec(xmlRegExecCtxtPtr exec)3141 xmlFARegDebugExec(xmlRegExecCtxtPtr exec) {
3142     printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index);
3143     if (exec->inputStack != NULL) {
3144 	int i;
3145 	printf(": ");
3146 	for (i = 0;(i < 3) && (i < exec->inputStackNr);i++)
3147 	    printf("%s ", (const char *)
3148 	           exec->inputStack[exec->inputStackNr - (i + 1)].value);
3149     } else {
3150 	printf(": %s", &(exec->inputString[exec->index]));
3151     }
3152     printf("\n");
3153 }
3154 #endif
3155 
3156 static void
xmlFARegExecSave(xmlRegExecCtxtPtr exec)3157 xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
3158 #ifdef DEBUG_REGEXP_EXEC
3159     printf("saving ");
3160     exec->transno++;
3161     xmlFARegDebugExec(exec);
3162     exec->transno--;
3163 #endif
3164 #ifdef MAX_PUSH
3165     if (exec->nbPush > MAX_PUSH) {
3166         return;
3167     }
3168     exec->nbPush++;
3169 #endif
3170 
3171     if (exec->maxRollbacks == 0) {
3172 	exec->maxRollbacks = 4;
3173 	exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
3174 		                             sizeof(xmlRegExecRollback));
3175 	if (exec->rollbacks == NULL) {
3176 	    xmlRegexpErrMemory(NULL, "saving regexp");
3177 	    exec->maxRollbacks = 0;
3178 	    return;
3179 	}
3180 	memset(exec->rollbacks, 0,
3181 	       exec->maxRollbacks * sizeof(xmlRegExecRollback));
3182     } else if (exec->nbRollbacks >= exec->maxRollbacks) {
3183 	xmlRegExecRollback *tmp;
3184 	int len = exec->maxRollbacks;
3185 
3186 	exec->maxRollbacks *= 2;
3187 	tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
3188 			exec->maxRollbacks * sizeof(xmlRegExecRollback));
3189 	if (tmp == NULL) {
3190 	    xmlRegexpErrMemory(NULL, "saving regexp");
3191 	    exec->maxRollbacks /= 2;
3192 	    return;
3193 	}
3194 	exec->rollbacks = tmp;
3195 	tmp = &exec->rollbacks[len];
3196 	memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
3197     }
3198     exec->rollbacks[exec->nbRollbacks].state = exec->state;
3199     exec->rollbacks[exec->nbRollbacks].index = exec->index;
3200     exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
3201     if (exec->comp->nbCounters > 0) {
3202 	if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3203 	    exec->rollbacks[exec->nbRollbacks].counts = (int *)
3204 		xmlMalloc(exec->comp->nbCounters * sizeof(int));
3205 	    if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3206 		xmlRegexpErrMemory(NULL, "saving regexp");
3207 		exec->status = -5;
3208 		return;
3209 	    }
3210 	}
3211 	memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
3212 	       exec->comp->nbCounters * sizeof(int));
3213     }
3214     exec->nbRollbacks++;
3215 }
3216 
3217 static void
xmlFARegExecRollBack(xmlRegExecCtxtPtr exec)3218 xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
3219     if (exec->nbRollbacks <= 0) {
3220 	exec->status = -1;
3221 #ifdef DEBUG_REGEXP_EXEC
3222 	printf("rollback failed on empty stack\n");
3223 #endif
3224 	return;
3225     }
3226     exec->nbRollbacks--;
3227     exec->state = exec->rollbacks[exec->nbRollbacks].state;
3228     exec->index = exec->rollbacks[exec->nbRollbacks].index;
3229     exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
3230     if (exec->comp->nbCounters > 0) {
3231 	if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3232 	    fprintf(stderr, "exec save: allocation failed");
3233 	    exec->status = -6;
3234 	    return;
3235 	}
3236 	if (exec->counts) {
3237 	    memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
3238 	       exec->comp->nbCounters * sizeof(int));
3239 	}
3240     }
3241 
3242 #ifdef DEBUG_REGEXP_EXEC
3243     printf("restored ");
3244     xmlFARegDebugExec(exec);
3245 #endif
3246 }
3247 
3248 /************************************************************************
3249  *									*
3250  *	Verifier, running an input against a compiled regexp		*
3251  *									*
3252  ************************************************************************/
3253 
3254 static int
xmlFARegExec(xmlRegexpPtr comp,const xmlChar * content)3255 xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
3256     xmlRegExecCtxt execval;
3257     xmlRegExecCtxtPtr exec = &execval;
3258     int ret, codepoint = 0, len, deter;
3259 
3260     exec->inputString = content;
3261     exec->index = 0;
3262     exec->nbPush = 0;
3263     exec->determinist = 1;
3264     exec->maxRollbacks = 0;
3265     exec->nbRollbacks = 0;
3266     exec->rollbacks = NULL;
3267     exec->status = 0;
3268     exec->comp = comp;
3269     exec->state = comp->states[0];
3270     exec->transno = 0;
3271     exec->transcount = 0;
3272     exec->inputStack = NULL;
3273     exec->inputStackMax = 0;
3274     if (comp->nbCounters > 0) {
3275 	exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
3276 	if (exec->counts == NULL) {
3277 	    xmlRegexpErrMemory(NULL, "running regexp");
3278 	    return(-1);
3279 	}
3280         memset(exec->counts, 0, comp->nbCounters * sizeof(int));
3281     } else
3282 	exec->counts = NULL;
3283     while ((exec->status == 0) && (exec->state != NULL) &&
3284 	   ((exec->inputString[exec->index] != 0) ||
3285 	    ((exec->state != NULL) &&
3286 	     (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3287 	xmlRegTransPtr trans;
3288 	xmlRegAtomPtr atom;
3289 
3290 	/*
3291 	 * If end of input on non-terminal state, rollback, however we may
3292 	 * still have epsilon like transition for counted transitions
3293 	 * on counters, in that case don't break too early.  Additionally,
3294 	 * if we are working on a range like "AB{0,2}", where B is not present,
3295 	 * we don't want to break.
3296 	 */
3297 	len = 1;
3298 	if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) {
3299 	    /*
3300 	     * if there is a transition, we must check if
3301 	     *  atom allows minOccurs of 0
3302 	     */
3303 	    if (exec->transno < exec->state->nbTrans) {
3304 	        trans = &exec->state->trans[exec->transno];
3305 		if (trans->to >=0) {
3306 		    atom = trans->atom;
3307 		    if (!((atom->min == 0) && (atom->max > 0)))
3308 		        goto rollback;
3309 		}
3310 	    } else
3311 	        goto rollback;
3312 	}
3313 
3314 	exec->transcount = 0;
3315 	for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3316 	    trans = &exec->state->trans[exec->transno];
3317 	    if (trans->to < 0)
3318 		continue;
3319 	    atom = trans->atom;
3320 	    ret = 0;
3321 	    deter = 1;
3322 	    if (trans->count >= 0) {
3323 		int count;
3324 		xmlRegCounterPtr counter;
3325 
3326 		if (exec->counts == NULL) {
3327 		    exec->status = -1;
3328 		    goto error;
3329 		}
3330 		/*
3331 		 * A counted transition.
3332 		 */
3333 
3334 		count = exec->counts[trans->count];
3335 		counter = &exec->comp->counters[trans->count];
3336 #ifdef DEBUG_REGEXP_EXEC
3337 		printf("testing count %d: val %d, min %d, max %d\n",
3338 		       trans->count, count, counter->min,  counter->max);
3339 #endif
3340 		ret = ((count >= counter->min) && (count <= counter->max));
3341 		if ((ret) && (counter->min != counter->max))
3342 		    deter = 0;
3343 	    } else if (atom == NULL) {
3344 		fprintf(stderr, "epsilon transition left at runtime\n");
3345 		exec->status = -2;
3346 		break;
3347 	    } else if (exec->inputString[exec->index] != 0) {
3348                 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
3349 		ret = xmlRegCheckCharacter(atom, codepoint);
3350 		if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) {
3351 		    xmlRegStatePtr to = comp->states[trans->to];
3352 
3353 		    /*
3354 		     * this is a multiple input sequence
3355 		     * If there is a counter associated increment it now.
3356 		     * do not increment if the counter is already over the
3357 		     * maximum limit in which case get to next transition
3358 		     */
3359 		    if (trans->counter >= 0) {
3360 			xmlRegCounterPtr counter;
3361 
3362 			if ((exec->counts == NULL) ||
3363 			    (exec->comp == NULL) ||
3364 			    (exec->comp->counters == NULL)) {
3365 			    exec->status = -1;
3366 			    goto error;
3367 			}
3368 			counter = &exec->comp->counters[trans->counter];
3369 			if (exec->counts[trans->counter] >= counter->max)
3370 			    continue; /* for loop on transitions */
3371                     }
3372                     /* Save before incrementing */
3373 		    if (exec->state->nbTrans > exec->transno + 1) {
3374 			xmlFARegExecSave(exec);
3375 		    }
3376 		    if (trans->counter >= 0) {
3377 #ifdef DEBUG_REGEXP_EXEC
3378 			printf("Increasing count %d\n", trans->counter);
3379 #endif
3380 			exec->counts[trans->counter]++;
3381 		    }
3382 		    exec->transcount = 1;
3383 		    do {
3384 			/*
3385 			 * Try to progress as much as possible on the input
3386 			 */
3387 			if (exec->transcount == atom->max) {
3388 			    break;
3389 			}
3390 			exec->index += len;
3391 			/*
3392 			 * End of input: stop here
3393 			 */
3394 			if (exec->inputString[exec->index] == 0) {
3395 			    exec->index -= len;
3396 			    break;
3397 			}
3398 			if (exec->transcount >= atom->min) {
3399 			    int transno = exec->transno;
3400 			    xmlRegStatePtr state = exec->state;
3401 
3402 			    /*
3403 			     * The transition is acceptable save it
3404 			     */
3405 			    exec->transno = -1; /* trick */
3406 			    exec->state = to;
3407 			    xmlFARegExecSave(exec);
3408 			    exec->transno = transno;
3409 			    exec->state = state;
3410 			}
3411 			codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
3412 				              len);
3413 			ret = xmlRegCheckCharacter(atom, codepoint);
3414 			exec->transcount++;
3415 		    } while (ret == 1);
3416 		    if (exec->transcount < atom->min)
3417 			ret = 0;
3418 
3419 		    /*
3420 		     * If the last check failed but one transition was found
3421 		     * possible, rollback
3422 		     */
3423 		    if (ret < 0)
3424 			ret = 0;
3425 		    if (ret == 0) {
3426 			goto rollback;
3427 		    }
3428 		    if (trans->counter >= 0) {
3429 			if (exec->counts == NULL) {
3430 			    exec->status = -1;
3431 			    goto error;
3432 			}
3433 #ifdef DEBUG_REGEXP_EXEC
3434 			printf("Decreasing count %d\n", trans->counter);
3435 #endif
3436 			exec->counts[trans->counter]--;
3437 		    }
3438 		} else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) {
3439 		    /*
3440 		     * we don't match on the codepoint, but minOccurs of 0
3441 		     * says that's ok.  Setting len to 0 inhibits stepping
3442 		     * over the codepoint.
3443 		     */
3444 		    exec->transcount = 1;
3445 		    len = 0;
3446 		    ret = 1;
3447 		}
3448 	    } else if ((atom->min == 0) && (atom->max > 0)) {
3449 	        /* another spot to match when minOccurs is 0 */
3450 		exec->transcount = 1;
3451 		len = 0;
3452 		ret = 1;
3453 	    }
3454 	    if (ret == 1) {
3455 		if ((trans->nd == 1) ||
3456 		    ((trans->count >= 0) && (deter == 0) &&
3457 		     (exec->state->nbTrans > exec->transno + 1))) {
3458 #ifdef DEBUG_REGEXP_EXEC
3459 		    if (trans->nd == 1)
3460 		        printf("Saving on nd transition atom %d for %c at %d\n",
3461 			       trans->atom->no, codepoint, exec->index);
3462 		    else
3463 		        printf("Saving on counted transition count %d for %c at %d\n",
3464 			       trans->count, codepoint, exec->index);
3465 #endif
3466 		    xmlFARegExecSave(exec);
3467 		}
3468 		if (trans->counter >= 0) {
3469 		    xmlRegCounterPtr counter;
3470 
3471                     /* make sure we don't go over the counter maximum value */
3472 		    if ((exec->counts == NULL) ||
3473 			(exec->comp == NULL) ||
3474 			(exec->comp->counters == NULL)) {
3475 			exec->status = -1;
3476 			goto error;
3477 		    }
3478 		    counter = &exec->comp->counters[trans->counter];
3479 		    if (exec->counts[trans->counter] >= counter->max)
3480 			continue; /* for loop on transitions */
3481 #ifdef DEBUG_REGEXP_EXEC
3482 		    printf("Increasing count %d\n", trans->counter);
3483 #endif
3484 		    exec->counts[trans->counter]++;
3485 		}
3486 		if ((trans->count >= 0) &&
3487 		    (trans->count < REGEXP_ALL_COUNTER)) {
3488 		    if (exec->counts == NULL) {
3489 		        exec->status = -1;
3490 			goto error;
3491 		    }
3492 #ifdef DEBUG_REGEXP_EXEC
3493 		    printf("resetting count %d on transition\n",
3494 		           trans->count);
3495 #endif
3496 		    exec->counts[trans->count] = 0;
3497 		}
3498 #ifdef DEBUG_REGEXP_EXEC
3499 		printf("entering state %d\n", trans->to);
3500 #endif
3501 		exec->state = comp->states[trans->to];
3502 		exec->transno = 0;
3503 		if (trans->atom != NULL) {
3504 		    exec->index += len;
3505 		}
3506 		goto progress;
3507 	    } else if (ret < 0) {
3508 		exec->status = -4;
3509 		break;
3510 	    }
3511 	}
3512 	if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3513 rollback:
3514 	    /*
3515 	     * Failed to find a way out
3516 	     */
3517 	    exec->determinist = 0;
3518 #ifdef DEBUG_REGEXP_EXEC
3519 	    printf("rollback from state %d on %d:%c\n", exec->state->no,
3520 	           codepoint,codepoint);
3521 #endif
3522 	    xmlFARegExecRollBack(exec);
3523 	}
3524 progress:
3525 	continue;
3526     }
3527 error:
3528     if (exec->rollbacks != NULL) {
3529 	if (exec->counts != NULL) {
3530 	    int i;
3531 
3532 	    for (i = 0;i < exec->maxRollbacks;i++)
3533 		if (exec->rollbacks[i].counts != NULL)
3534 		    xmlFree(exec->rollbacks[i].counts);
3535 	}
3536 	xmlFree(exec->rollbacks);
3537     }
3538     if (exec->state == NULL)
3539         return(-1);
3540     if (exec->counts != NULL)
3541 	xmlFree(exec->counts);
3542     if (exec->status == 0)
3543 	return(1);
3544     if (exec->status == -1) {
3545 	if (exec->nbPush > MAX_PUSH)
3546 	    return(-1);
3547 	return(0);
3548     }
3549     return(exec->status);
3550 }
3551 
3552 /************************************************************************
3553  *									*
3554  *	Progressive interface to the verifier one atom at a time	*
3555  *									*
3556  ************************************************************************/
3557 #ifdef DEBUG_ERR
3558 static void testerr(xmlRegExecCtxtPtr exec);
3559 #endif
3560 
3561 /**
3562  * xmlRegNewExecCtxt:
3563  * @comp: a precompiled regular expression
3564  * @callback: a callback function used for handling progresses in the
3565  *            automata matching phase
3566  * @data: the context data associated to the callback in this context
3567  *
3568  * Build a context used for progressive evaluation of a regexp.
3569  *
3570  * Returns the new context
3571  */
3572 xmlRegExecCtxtPtr
xmlRegNewExecCtxt(xmlRegexpPtr comp,xmlRegExecCallbacks callback,void * data)3573 xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
3574     xmlRegExecCtxtPtr exec;
3575 
3576     if (comp == NULL)
3577 	return(NULL);
3578     if ((comp->compact == NULL) && (comp->states == NULL))
3579         return(NULL);
3580     exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
3581     if (exec == NULL) {
3582 	xmlRegexpErrMemory(NULL, "creating execution context");
3583 	return(NULL);
3584     }
3585     memset(exec, 0, sizeof(xmlRegExecCtxt));
3586     exec->inputString = NULL;
3587     exec->index = 0;
3588     exec->determinist = 1;
3589     exec->maxRollbacks = 0;
3590     exec->nbRollbacks = 0;
3591     exec->rollbacks = NULL;
3592     exec->status = 0;
3593     exec->comp = comp;
3594     if (comp->compact == NULL)
3595 	exec->state = comp->states[0];
3596     exec->transno = 0;
3597     exec->transcount = 0;
3598     exec->callback = callback;
3599     exec->data = data;
3600     if (comp->nbCounters > 0) {
3601         /*
3602 	 * For error handling, exec->counts is allocated twice the size
3603 	 * the second half is used to store the data in case of rollback
3604 	 */
3605 	exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)
3606 	                                 * 2);
3607 	if (exec->counts == NULL) {
3608 	    xmlRegexpErrMemory(NULL, "creating execution context");
3609 	    xmlFree(exec);
3610 	    return(NULL);
3611 	}
3612         memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2);
3613 	exec->errCounts = &exec->counts[comp->nbCounters];
3614     } else {
3615 	exec->counts = NULL;
3616 	exec->errCounts = NULL;
3617     }
3618     exec->inputStackMax = 0;
3619     exec->inputStackNr = 0;
3620     exec->inputStack = NULL;
3621     exec->errStateNo = -1;
3622     exec->errString = NULL;
3623     exec->nbPush = 0;
3624     return(exec);
3625 }
3626 
3627 /**
3628  * xmlRegFreeExecCtxt:
3629  * @exec: a regular expression evaluation context
3630  *
3631  * Free the structures associated to a regular expression evaluation context.
3632  */
3633 void
xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec)3634 xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
3635     if (exec == NULL)
3636 	return;
3637 
3638     if (exec->rollbacks != NULL) {
3639 	if (exec->counts != NULL) {
3640 	    int i;
3641 
3642 	    for (i = 0;i < exec->maxRollbacks;i++)
3643 		if (exec->rollbacks[i].counts != NULL)
3644 		    xmlFree(exec->rollbacks[i].counts);
3645 	}
3646 	xmlFree(exec->rollbacks);
3647     }
3648     if (exec->counts != NULL)
3649 	xmlFree(exec->counts);
3650     if (exec->inputStack != NULL) {
3651 	int i;
3652 
3653 	for (i = 0;i < exec->inputStackNr;i++) {
3654 	    if (exec->inputStack[i].value != NULL)
3655 		xmlFree(exec->inputStack[i].value);
3656 	}
3657 	xmlFree(exec->inputStack);
3658     }
3659     if (exec->errString != NULL)
3660         xmlFree(exec->errString);
3661     xmlFree(exec);
3662 }
3663 
3664 static void
xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec,const xmlChar * value,void * data)3665 xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
3666 	                    void *data) {
3667 #ifdef DEBUG_PUSH
3668     printf("saving value: %d:%s\n", exec->inputStackNr, value);
3669 #endif
3670     if (exec->inputStackMax == 0) {
3671 	exec->inputStackMax = 4;
3672 	exec->inputStack = (xmlRegInputTokenPtr)
3673 	    xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken));
3674 	if (exec->inputStack == NULL) {
3675 	    xmlRegexpErrMemory(NULL, "pushing input string");
3676 	    exec->inputStackMax = 0;
3677 	    return;
3678 	}
3679     } else if (exec->inputStackNr + 1 >= exec->inputStackMax) {
3680 	xmlRegInputTokenPtr tmp;
3681 
3682 	exec->inputStackMax *= 2;
3683 	tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack,
3684 			exec->inputStackMax * sizeof(xmlRegInputToken));
3685 	if (tmp == NULL) {
3686 	    xmlRegexpErrMemory(NULL, "pushing input string");
3687 	    exec->inputStackMax /= 2;
3688 	    return;
3689 	}
3690 	exec->inputStack = tmp;
3691     }
3692     exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
3693     exec->inputStack[exec->inputStackNr].data = data;
3694     exec->inputStackNr++;
3695     exec->inputStack[exec->inputStackNr].value = NULL;
3696     exec->inputStack[exec->inputStackNr].data = NULL;
3697 }
3698 
3699 /**
3700  * xmlRegStrEqualWildcard:
3701  * @expStr:  the string to be evaluated
3702  * @valStr:  the validation string
3703  *
3704  * Checks if both strings are equal or have the same content. "*"
3705  * can be used as a wildcard in @valStr; "|" is used as a separator of
3706  * substrings in both @expStr and @valStr.
3707  *
3708  * Returns 1 if the comparison is satisfied and the number of substrings
3709  * is equal, 0 otherwise.
3710  */
3711 
3712 static int
xmlRegStrEqualWildcard(const xmlChar * expStr,const xmlChar * valStr)3713 xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) {
3714     if (expStr == valStr) return(1);
3715     if (expStr == NULL) return(0);
3716     if (valStr == NULL) return(0);
3717     do {
3718 	/*
3719 	* Eval if we have a wildcard for the current item.
3720 	*/
3721         if (*expStr != *valStr) {
3722 	    /* if one of them starts with a wildcard make valStr be it */
3723 	    if (*valStr == '*') {
3724 	        const xmlChar *tmp;
3725 
3726 		tmp = valStr;
3727 		valStr = expStr;
3728 		expStr = tmp;
3729 	    }
3730 	    if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) {
3731 		do {
3732 		    if (*valStr == XML_REG_STRING_SEPARATOR)
3733 			break;
3734 		    valStr++;
3735 		} while (*valStr != 0);
3736 		continue;
3737 	    } else
3738 		return(0);
3739 	}
3740 	expStr++;
3741 	valStr++;
3742     } while (*valStr != 0);
3743     if (*expStr != 0)
3744 	return (0);
3745     else
3746 	return (1);
3747 }
3748 
3749 /**
3750  * xmlRegCompactPushString:
3751  * @exec: a regexp execution context
3752  * @comp:  the precompiled exec with a compact table
3753  * @value: a string token input
3754  * @data: data associated to the token to reuse in callbacks
3755  *
3756  * Push one input token in the execution context
3757  *
3758  * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3759  *     a negative value in case of error.
3760  */
3761 static int
xmlRegCompactPushString(xmlRegExecCtxtPtr exec,xmlRegexpPtr comp,const xmlChar * value,void * data)3762 xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
3763 	                xmlRegexpPtr comp,
3764 	                const xmlChar *value,
3765 	                void *data) {
3766     int state = exec->index;
3767     int i, target;
3768 
3769     if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
3770 	return(-1);
3771 
3772     if (value == NULL) {
3773 	/*
3774 	 * are we at a final state ?
3775 	 */
3776 	if (comp->compact[state * (comp->nbstrings + 1)] ==
3777             XML_REGEXP_FINAL_STATE)
3778 	    return(1);
3779 	return(0);
3780     }
3781 
3782 #ifdef DEBUG_PUSH
3783     printf("value pushed: %s\n", value);
3784 #endif
3785 
3786     /*
3787      * Examine all outside transitions from current state
3788      */
3789     for (i = 0;i < comp->nbstrings;i++) {
3790 	target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
3791 	if ((target > 0) && (target <= comp->nbstates)) {
3792 	    target--; /* to avoid 0 */
3793 	    if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) {
3794 		exec->index = target;
3795 		if ((exec->callback != NULL) && (comp->transdata != NULL)) {
3796 		    exec->callback(exec->data, value,
3797 			  comp->transdata[state * comp->nbstrings + i], data);
3798 		}
3799 #ifdef DEBUG_PUSH
3800 		printf("entering state %d\n", target);
3801 #endif
3802 		if (comp->compact[target * (comp->nbstrings + 1)] ==
3803 		    XML_REGEXP_SINK_STATE)
3804 		    goto error;
3805 
3806 		if (comp->compact[target * (comp->nbstrings + 1)] ==
3807 		    XML_REGEXP_FINAL_STATE)
3808 		    return(1);
3809 		return(0);
3810 	    }
3811 	}
3812     }
3813     /*
3814      * Failed to find an exit transition out from current state for the
3815      * current token
3816      */
3817 #ifdef DEBUG_PUSH
3818     printf("failed to find a transition for %s on state %d\n", value, state);
3819 #endif
3820 error:
3821     if (exec->errString != NULL)
3822         xmlFree(exec->errString);
3823     exec->errString = xmlStrdup(value);
3824     exec->errStateNo = state;
3825     exec->status = -1;
3826 #ifdef DEBUG_ERR
3827     testerr(exec);
3828 #endif
3829     return(-1);
3830 }
3831 
3832 /**
3833  * xmlRegExecPushStringInternal:
3834  * @exec: a regexp execution context or NULL to indicate the end
3835  * @value: a string token input
3836  * @data: data associated to the token to reuse in callbacks
3837  * @compound: value was assembled from 2 strings
3838  *
3839  * Push one input token in the execution context
3840  *
3841  * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3842  *     a negative value in case of error.
3843  */
3844 static int
xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec,const xmlChar * value,void * data,int compound)3845 xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value,
3846 	                     void *data, int compound) {
3847     xmlRegTransPtr trans;
3848     xmlRegAtomPtr atom;
3849     int ret;
3850     int final = 0;
3851     int progress = 1;
3852 
3853     if (exec == NULL)
3854 	return(-1);
3855     if (exec->comp == NULL)
3856 	return(-1);
3857     if (exec->status != 0)
3858 	return(exec->status);
3859 
3860     if (exec->comp->compact != NULL)
3861 	return(xmlRegCompactPushString(exec, exec->comp, value, data));
3862 
3863     if (value == NULL) {
3864         if (exec->state->type == XML_REGEXP_FINAL_STATE)
3865 	    return(1);
3866 	final = 1;
3867     }
3868 
3869 #ifdef DEBUG_PUSH
3870     printf("value pushed: %s\n", value);
3871 #endif
3872     /*
3873      * If we have an active rollback stack push the new value there
3874      * and get back to where we were left
3875      */
3876     if ((value != NULL) && (exec->inputStackNr > 0)) {
3877 	xmlFARegExecSaveInputString(exec, value, data);
3878 	value = exec->inputStack[exec->index].value;
3879 	data = exec->inputStack[exec->index].data;
3880 #ifdef DEBUG_PUSH
3881 	printf("value loaded: %s\n", value);
3882 #endif
3883     }
3884 
3885     while ((exec->status == 0) &&
3886 	   ((value != NULL) ||
3887 	    ((final == 1) &&
3888 	     (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3889 
3890 	/*
3891 	 * End of input on non-terminal state, rollback, however we may
3892 	 * still have epsilon like transition for counted transitions
3893 	 * on counters, in that case don't break too early.
3894 	 */
3895 	if ((value == NULL) && (exec->counts == NULL))
3896 	    goto rollback;
3897 
3898 	exec->transcount = 0;
3899 	for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3900 	    trans = &exec->state->trans[exec->transno];
3901 	    if (trans->to < 0)
3902 		continue;
3903 	    atom = trans->atom;
3904 	    ret = 0;
3905 	    if (trans->count == REGEXP_ALL_LAX_COUNTER) {
3906 		int i;
3907 		int count;
3908 		xmlRegTransPtr t;
3909 		xmlRegCounterPtr counter;
3910 
3911 		ret = 0;
3912 
3913 #ifdef DEBUG_PUSH
3914 		printf("testing all lax %d\n", trans->count);
3915 #endif
3916 		/*
3917 		 * Check all counted transitions from the current state
3918 		 */
3919 		if ((value == NULL) && (final)) {
3920 		    ret = 1;
3921 		} else if (value != NULL) {
3922 		    for (i = 0;i < exec->state->nbTrans;i++) {
3923 			t = &exec->state->trans[i];
3924 			if ((t->counter < 0) || (t == trans))
3925 			    continue;
3926 			counter = &exec->comp->counters[t->counter];
3927 			count = exec->counts[t->counter];
3928 			if ((count < counter->max) &&
3929 		            (t->atom != NULL) &&
3930 			    (xmlStrEqual(value, t->atom->valuep))) {
3931 			    ret = 0;
3932 			    break;
3933 			}
3934 			if ((count >= counter->min) &&
3935 			    (count < counter->max) &&
3936 			    (t->atom != NULL) &&
3937 			    (xmlStrEqual(value, t->atom->valuep))) {
3938 			    ret = 1;
3939 			    break;
3940 			}
3941 		    }
3942 		}
3943 	    } else if (trans->count == REGEXP_ALL_COUNTER) {
3944 		int i;
3945 		int count;
3946 		xmlRegTransPtr t;
3947 		xmlRegCounterPtr counter;
3948 
3949 		ret = 1;
3950 
3951 #ifdef DEBUG_PUSH
3952 		printf("testing all %d\n", trans->count);
3953 #endif
3954 		/*
3955 		 * Check all counted transitions from the current state
3956 		 */
3957 		for (i = 0;i < exec->state->nbTrans;i++) {
3958                     t = &exec->state->trans[i];
3959 		    if ((t->counter < 0) || (t == trans))
3960 			continue;
3961                     counter = &exec->comp->counters[t->counter];
3962 		    count = exec->counts[t->counter];
3963 		    if ((count < counter->min) || (count > counter->max)) {
3964 			ret = 0;
3965 			break;
3966 		    }
3967 		}
3968 	    } else if (trans->count >= 0) {
3969 		int count;
3970 		xmlRegCounterPtr counter;
3971 
3972 		/*
3973 		 * A counted transition.
3974 		 */
3975 
3976 		count = exec->counts[trans->count];
3977 		counter = &exec->comp->counters[trans->count];
3978 #ifdef DEBUG_PUSH
3979 		printf("testing count %d: val %d, min %d, max %d\n",
3980 		       trans->count, count, counter->min,  counter->max);
3981 #endif
3982 		ret = ((count >= counter->min) && (count <= counter->max));
3983 	    } else if (atom == NULL) {
3984 		fprintf(stderr, "epsilon transition left at runtime\n");
3985 		exec->status = -2;
3986 		break;
3987 	    } else if (value != NULL) {
3988 		ret = xmlRegStrEqualWildcard(atom->valuep, value);
3989 		if (atom->neg) {
3990 		    ret = !ret;
3991 		    if (!compound)
3992 		        ret = 0;
3993 		}
3994 		if ((ret == 1) && (trans->counter >= 0)) {
3995 		    xmlRegCounterPtr counter;
3996 		    int count;
3997 
3998 		    count = exec->counts[trans->counter];
3999 		    counter = &exec->comp->counters[trans->counter];
4000 		    if (count >= counter->max)
4001 			ret = 0;
4002 		}
4003 
4004 		if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
4005 		    xmlRegStatePtr to = exec->comp->states[trans->to];
4006 
4007 		    /*
4008 		     * this is a multiple input sequence
4009 		     */
4010 		    if (exec->state->nbTrans > exec->transno + 1) {
4011 			if (exec->inputStackNr <= 0) {
4012 			    xmlFARegExecSaveInputString(exec, value, data);
4013 			}
4014 			xmlFARegExecSave(exec);
4015 		    }
4016 		    exec->transcount = 1;
4017 		    do {
4018 			/*
4019 			 * Try to progress as much as possible on the input
4020 			 */
4021 			if (exec->transcount == atom->max) {
4022 			    break;
4023 			}
4024 			exec->index++;
4025 			value = exec->inputStack[exec->index].value;
4026 			data = exec->inputStack[exec->index].data;
4027 #ifdef DEBUG_PUSH
4028 			printf("value loaded: %s\n", value);
4029 #endif
4030 
4031 			/*
4032 			 * End of input: stop here
4033 			 */
4034 			if (value == NULL) {
4035 			    exec->index --;
4036 			    break;
4037 			}
4038 			if (exec->transcount >= atom->min) {
4039 			    int transno = exec->transno;
4040 			    xmlRegStatePtr state = exec->state;
4041 
4042 			    /*
4043 			     * The transition is acceptable save it
4044 			     */
4045 			    exec->transno = -1; /* trick */
4046 			    exec->state = to;
4047 			    if (exec->inputStackNr <= 0) {
4048 				xmlFARegExecSaveInputString(exec, value, data);
4049 			    }
4050 			    xmlFARegExecSave(exec);
4051 			    exec->transno = transno;
4052 			    exec->state = state;
4053 			}
4054 			ret = xmlStrEqual(value, atom->valuep);
4055 			exec->transcount++;
4056 		    } while (ret == 1);
4057 		    if (exec->transcount < atom->min)
4058 			ret = 0;
4059 
4060 		    /*
4061 		     * If the last check failed but one transition was found
4062 		     * possible, rollback
4063 		     */
4064 		    if (ret < 0)
4065 			ret = 0;
4066 		    if (ret == 0) {
4067 			goto rollback;
4068 		    }
4069 		}
4070 	    }
4071 	    if (ret == 1) {
4072 		if ((exec->callback != NULL) && (atom != NULL) &&
4073 			(data != NULL)) {
4074 		    exec->callback(exec->data, atom->valuep,
4075 			           atom->data, data);
4076 		}
4077 		if (exec->state->nbTrans > exec->transno + 1) {
4078 		    if (exec->inputStackNr <= 0) {
4079 			xmlFARegExecSaveInputString(exec, value, data);
4080 		    }
4081 		    xmlFARegExecSave(exec);
4082 		}
4083 		if (trans->counter >= 0) {
4084 #ifdef DEBUG_PUSH
4085 		    printf("Increasing count %d\n", trans->counter);
4086 #endif
4087 		    exec->counts[trans->counter]++;
4088 		}
4089 		if ((trans->count >= 0) &&
4090 		    (trans->count < REGEXP_ALL_COUNTER)) {
4091 #ifdef DEBUG_REGEXP_EXEC
4092 		    printf("resetting count %d on transition\n",
4093 		           trans->count);
4094 #endif
4095 		    exec->counts[trans->count] = 0;
4096 		}
4097 #ifdef DEBUG_PUSH
4098 		printf("entering state %d\n", trans->to);
4099 #endif
4100                 if ((exec->comp->states[trans->to] != NULL) &&
4101 		    (exec->comp->states[trans->to]->type ==
4102 		     XML_REGEXP_SINK_STATE)) {
4103 		    /*
4104 		     * entering a sink state, save the current state as error
4105 		     * state.
4106 		     */
4107 		    if (exec->errString != NULL)
4108 			xmlFree(exec->errString);
4109 		    exec->errString = xmlStrdup(value);
4110 		    exec->errState = exec->state;
4111 		    memcpy(exec->errCounts, exec->counts,
4112 			   exec->comp->nbCounters * sizeof(int));
4113 		}
4114 		exec->state = exec->comp->states[trans->to];
4115 		exec->transno = 0;
4116 		if (trans->atom != NULL) {
4117 		    if (exec->inputStack != NULL) {
4118 			exec->index++;
4119 			if (exec->index < exec->inputStackNr) {
4120 			    value = exec->inputStack[exec->index].value;
4121 			    data = exec->inputStack[exec->index].data;
4122 #ifdef DEBUG_PUSH
4123 			    printf("value loaded: %s\n", value);
4124 #endif
4125 			} else {
4126 			    value = NULL;
4127 			    data = NULL;
4128 #ifdef DEBUG_PUSH
4129 			    printf("end of input\n");
4130 #endif
4131 			}
4132 		    } else {
4133 			value = NULL;
4134 			data = NULL;
4135 #ifdef DEBUG_PUSH
4136 			printf("end of input\n");
4137 #endif
4138 		    }
4139 		}
4140 		goto progress;
4141 	    } else if (ret < 0) {
4142 		exec->status = -4;
4143 		break;
4144 	    }
4145 	}
4146 	if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
4147 rollback:
4148             /*
4149 	     * if we didn't yet rollback on the current input
4150 	     * store the current state as the error state.
4151 	     */
4152 	    if ((progress) && (exec->state != NULL) &&
4153 	        (exec->state->type != XML_REGEXP_SINK_STATE)) {
4154 	        progress = 0;
4155 		if (exec->errString != NULL)
4156 		    xmlFree(exec->errString);
4157 		exec->errString = xmlStrdup(value);
4158 		exec->errState = exec->state;
4159                 if (exec->comp->nbCounters)
4160                     memcpy(exec->errCounts, exec->counts,
4161                            exec->comp->nbCounters * sizeof(int));
4162 	    }
4163 
4164 	    /*
4165 	     * Failed to find a way out
4166 	     */
4167 	    exec->determinist = 0;
4168 	    xmlFARegExecRollBack(exec);
4169 	    if ((exec->inputStack != NULL ) && (exec->status == 0)) {
4170 		value = exec->inputStack[exec->index].value;
4171 		data = exec->inputStack[exec->index].data;
4172 #ifdef DEBUG_PUSH
4173 		printf("value loaded: %s\n", value);
4174 #endif
4175 	    }
4176 	}
4177 	continue;
4178 progress:
4179         progress = 1;
4180 	continue;
4181     }
4182     if (exec->status == 0) {
4183         return(exec->state->type == XML_REGEXP_FINAL_STATE);
4184     }
4185 #ifdef DEBUG_ERR
4186     if (exec->status < 0) {
4187 	testerr(exec);
4188     }
4189 #endif
4190     return(exec->status);
4191 }
4192 
4193 /**
4194  * xmlRegExecPushString:
4195  * @exec: a regexp execution context or NULL to indicate the end
4196  * @value: a string token input
4197  * @data: data associated to the token to reuse in callbacks
4198  *
4199  * Push one input token in the execution context
4200  *
4201  * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4202  *     a negative value in case of error.
4203  */
4204 int
xmlRegExecPushString(xmlRegExecCtxtPtr exec,const xmlChar * value,void * data)4205 xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
4206 	             void *data) {
4207     return(xmlRegExecPushStringInternal(exec, value, data, 0));
4208 }
4209 
4210 /**
4211  * xmlRegExecPushString2:
4212  * @exec: a regexp execution context or NULL to indicate the end
4213  * @value: the first string token input
4214  * @value2: the second string token input
4215  * @data: data associated to the token to reuse in callbacks
4216  *
4217  * Push one input token in the execution context
4218  *
4219  * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4220  *     a negative value in case of error.
4221  */
4222 int
xmlRegExecPushString2(xmlRegExecCtxtPtr exec,const xmlChar * value,const xmlChar * value2,void * data)4223 xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
4224                       const xmlChar *value2, void *data) {
4225     xmlChar buf[150];
4226     int lenn, lenp, ret;
4227     xmlChar *str;
4228 
4229     if (exec == NULL)
4230 	return(-1);
4231     if (exec->comp == NULL)
4232 	return(-1);
4233     if (exec->status != 0)
4234 	return(exec->status);
4235 
4236     if (value2 == NULL)
4237         return(xmlRegExecPushString(exec, value, data));
4238 
4239     lenn = strlen((char *) value2);
4240     lenp = strlen((char *) value);
4241 
4242     if (150 < lenn + lenp + 2) {
4243 	str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
4244 	if (str == NULL) {
4245 	    exec->status = -1;
4246 	    return(-1);
4247 	}
4248     } else {
4249 	str = buf;
4250     }
4251     memcpy(&str[0], value, lenp);
4252     str[lenp] = XML_REG_STRING_SEPARATOR;
4253     memcpy(&str[lenp + 1], value2, lenn);
4254     str[lenn + lenp + 1] = 0;
4255 
4256     if (exec->comp->compact != NULL)
4257 	ret = xmlRegCompactPushString(exec, exec->comp, str, data);
4258     else
4259         ret = xmlRegExecPushStringInternal(exec, str, data, 1);
4260 
4261     if (str != buf)
4262         xmlFree(str);
4263     return(ret);
4264 }
4265 
4266 /**
4267  * xmlRegExecGetValues:
4268  * @exec: a regexp execution context
4269  * @err: error extraction or normal one
4270  * @nbval: pointer to the number of accepted values IN/OUT
4271  * @nbneg: return number of negative transitions
4272  * @values: pointer to the array of acceptable values
4273  * @terminal: return value if this was a terminal state
4274  *
4275  * Extract information from the regexp execution, internal routine to
4276  * implement xmlRegExecNextValues() and xmlRegExecErrInfo()
4277  *
4278  * Returns: 0 in case of success or -1 in case of error.
4279  */
4280 static int
xmlRegExecGetValues(xmlRegExecCtxtPtr exec,int err,int * nbval,int * nbneg,xmlChar ** values,int * terminal)4281 xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err,
4282                     int *nbval, int *nbneg,
4283 		    xmlChar **values, int *terminal) {
4284     int maxval;
4285     int nb = 0;
4286 
4287     if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) ||
4288         (values == NULL) || (*nbval <= 0))
4289         return(-1);
4290 
4291     maxval = *nbval;
4292     *nbval = 0;
4293     *nbneg = 0;
4294     if ((exec->comp != NULL) && (exec->comp->compact != NULL)) {
4295         xmlRegexpPtr comp;
4296 	int target, i, state;
4297 
4298         comp = exec->comp;
4299 
4300 	if (err) {
4301 	    if (exec->errStateNo == -1) return(-1);
4302 	    state = exec->errStateNo;
4303 	} else {
4304 	    state = exec->index;
4305 	}
4306 	if (terminal != NULL) {
4307 	    if (comp->compact[state * (comp->nbstrings + 1)] ==
4308 	        XML_REGEXP_FINAL_STATE)
4309 		*terminal = 1;
4310 	    else
4311 		*terminal = 0;
4312 	}
4313 	for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4314 	    target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4315 	    if ((target > 0) && (target <= comp->nbstates) &&
4316 	        (comp->compact[(target - 1) * (comp->nbstrings + 1)] !=
4317 		 XML_REGEXP_SINK_STATE)) {
4318 	        values[nb++] = comp->stringMap[i];
4319 		(*nbval)++;
4320 	    }
4321 	}
4322 	for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4323 	    target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4324 	    if ((target > 0) && (target <= comp->nbstates) &&
4325 	        (comp->compact[(target - 1) * (comp->nbstrings + 1)] ==
4326 		 XML_REGEXP_SINK_STATE)) {
4327 	        values[nb++] = comp->stringMap[i];
4328 		(*nbneg)++;
4329 	    }
4330 	}
4331     } else {
4332         int transno;
4333 	xmlRegTransPtr trans;
4334 	xmlRegAtomPtr atom;
4335 	xmlRegStatePtr state;
4336 
4337 	if (terminal != NULL) {
4338 	    if (exec->state->type == XML_REGEXP_FINAL_STATE)
4339 		*terminal = 1;
4340 	    else
4341 		*terminal = 0;
4342 	}
4343 
4344 	if (err) {
4345 	    if (exec->errState == NULL) return(-1);
4346 	    state = exec->errState;
4347 	} else {
4348 	    if (exec->state == NULL) return(-1);
4349 	    state = exec->state;
4350 	}
4351 	for (transno = 0;
4352 	     (transno < state->nbTrans) && (nb < maxval);
4353 	     transno++) {
4354 	    trans = &state->trans[transno];
4355 	    if (trans->to < 0)
4356 		continue;
4357 	    atom = trans->atom;
4358 	    if ((atom == NULL) || (atom->valuep == NULL))
4359 		continue;
4360 	    if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4361 	        /* this should not be reached but ... */
4362 	        TODO;
4363 	    } else if (trans->count == REGEXP_ALL_COUNTER) {
4364 	        /* this should not be reached but ... */
4365 	        TODO;
4366 	    } else if (trans->counter >= 0) {
4367 		xmlRegCounterPtr counter = NULL;
4368 		int count;
4369 
4370 		if (err)
4371 		    count = exec->errCounts[trans->counter];
4372 		else
4373 		    count = exec->counts[trans->counter];
4374 		if (exec->comp != NULL)
4375 		    counter = &exec->comp->counters[trans->counter];
4376 		if ((counter == NULL) || (count < counter->max)) {
4377 		    if (atom->neg)
4378 			values[nb++] = (xmlChar *) atom->valuep2;
4379 		    else
4380 			values[nb++] = (xmlChar *) atom->valuep;
4381 		    (*nbval)++;
4382 		}
4383 	    } else {
4384                 if ((exec->comp != NULL) && (exec->comp->states[trans->to] != NULL) &&
4385 		    (exec->comp->states[trans->to]->type !=
4386 		     XML_REGEXP_SINK_STATE)) {
4387 		    if (atom->neg)
4388 			values[nb++] = (xmlChar *) atom->valuep2;
4389 		    else
4390 			values[nb++] = (xmlChar *) atom->valuep;
4391 		    (*nbval)++;
4392 		}
4393 	    }
4394 	}
4395 	for (transno = 0;
4396 	     (transno < state->nbTrans) && (nb < maxval);
4397 	     transno++) {
4398 	    trans = &state->trans[transno];
4399 	    if (trans->to < 0)
4400 		continue;
4401 	    atom = trans->atom;
4402 	    if ((atom == NULL) || (atom->valuep == NULL))
4403 		continue;
4404 	    if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4405 	        continue;
4406 	    } else if (trans->count == REGEXP_ALL_COUNTER) {
4407 	        continue;
4408 	    } else if (trans->counter >= 0) {
4409 	        continue;
4410 	    } else {
4411                 if ((exec->comp->states[trans->to] != NULL) &&
4412 		    (exec->comp->states[trans->to]->type ==
4413 		     XML_REGEXP_SINK_STATE)) {
4414 		    if (atom->neg)
4415 			values[nb++] = (xmlChar *) atom->valuep2;
4416 		    else
4417 			values[nb++] = (xmlChar *) atom->valuep;
4418 		    (*nbneg)++;
4419 		}
4420 	    }
4421 	}
4422     }
4423     return(0);
4424 }
4425 
4426 /**
4427  * xmlRegExecNextValues:
4428  * @exec: a regexp execution context
4429  * @nbval: pointer to the number of accepted values IN/OUT
4430  * @nbneg: return number of negative transitions
4431  * @values: pointer to the array of acceptable values
4432  * @terminal: return value if this was a terminal state
4433  *
4434  * Extract information from the regexp execution,
4435  * the parameter @values must point to an array of @nbval string pointers
4436  * on return nbval will contain the number of possible strings in that
4437  * state and the @values array will be updated with them. The string values
4438  * returned will be freed with the @exec context and don't need to be
4439  * deallocated.
4440  *
4441  * Returns: 0 in case of success or -1 in case of error.
4442  */
4443 int
xmlRegExecNextValues(xmlRegExecCtxtPtr exec,int * nbval,int * nbneg,xmlChar ** values,int * terminal)4444 xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg,
4445                      xmlChar **values, int *terminal) {
4446     return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal));
4447 }
4448 
4449 /**
4450  * xmlRegExecErrInfo:
4451  * @exec: a regexp execution context generating an error
4452  * @string: return value for the error string
4453  * @nbval: pointer to the number of accepted values IN/OUT
4454  * @nbneg: return number of negative transitions
4455  * @values: pointer to the array of acceptable values
4456  * @terminal: return value if this was a terminal state
4457  *
4458  * Extract error information from the regexp execution, the parameter
4459  * @string will be updated with the value pushed and not accepted,
4460  * the parameter @values must point to an array of @nbval string pointers
4461  * on return nbval will contain the number of possible strings in that
4462  * state and the @values array will be updated with them. The string values
4463  * returned will be freed with the @exec context and don't need to be
4464  * deallocated.
4465  *
4466  * Returns: 0 in case of success or -1 in case of error.
4467  */
4468 int
xmlRegExecErrInfo(xmlRegExecCtxtPtr exec,const xmlChar ** string,int * nbval,int * nbneg,xmlChar ** values,int * terminal)4469 xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string,
4470                   int *nbval, int *nbneg, xmlChar **values, int *terminal) {
4471     if (exec == NULL)
4472         return(-1);
4473     if (string != NULL) {
4474         if (exec->status != 0)
4475 	    *string = exec->errString;
4476 	else
4477 	    *string = NULL;
4478     }
4479     return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal));
4480 }
4481 
#ifdef DEBUG_ERR
/*
 * Debug-only helper: queries the error information of @exec with room
 * for five values and discards the result.
 */
static void testerr(xmlRegExecCtxtPtr exec) {
    xmlChar *expected[5];
    const xmlChar *rejected;
    int capacity = 5;
    int negCount;
    int isTerminal;

    xmlRegExecErrInfo(exec, &rejected, &capacity, &negCount, expected,
                      &isTerminal);
}
#endif
4492 
#if 0
/*
 * xmlRegExecPushChar:
 * @exec: a regexp execution context
 * @UCS: unused by the current body
 *
 * Disabled character based counterpart of xmlRegExecPushStringInternal():
 * it walks exec->inputString one decoded character at a time instead of
 * consuming string tokens.
 *
 * NOTE(review): the original body had no return statement although the
 * function is declared to return int; the value returned below mirrors
 * xmlRegExecPushStringInternal() and should be confirmed before this
 * code is ever enabled.
 *
 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
 *     a negative value in case of error.
 */
static int
xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
    xmlRegTransPtr trans;
    xmlRegAtomPtr atom;
    int ret;
    int codepoint;
    /*
     * Initialized because a counted transition carrying an atom would
     * otherwise reach "exec->index += len" without len ever being set.
     */
    int len = 0;

    if (exec == NULL)
        return(-1);
    if (exec->status != 0)
        return(exec->status);

    while ((exec->status == 0) &&
           ((exec->inputString[exec->index] != 0) ||
            (exec->state->type != XML_REGEXP_FINAL_STATE))) {

        /*
         * End of input on non-terminal state, rollback, however we may
         * still have epsilon like transition for counted transitions
         * on counters, in that case don't break too early.
         */
        if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
            goto rollback;

        exec->transcount = 0;
        for (;exec->transno < exec->state->nbTrans;exec->transno++) {
            trans = &exec->state->trans[exec->transno];
            if (trans->to < 0)
                continue;
            atom = trans->atom;
            ret = 0;
            if (trans->count >= 0) {
                int count;
                xmlRegCounterPtr counter;

                /*
                 * A counted transition.
                 */

                count = exec->counts[trans->count];
                counter = &exec->comp->counters[trans->count];
#ifdef DEBUG_REGEXP_EXEC
                printf("testing count %d: val %d, min %d, max %d\n",
                       trans->count, count, counter->min,  counter->max);
#endif
                ret = ((count >= counter->min) && (count <= counter->max));
            } else if (atom == NULL) {
                fprintf(stderr, "epsilon transition left at runtime\n");
                exec->status = -2;
                break;
            } else if (exec->inputString[exec->index] != 0) {
                codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
                ret = xmlRegCheckCharacter(atom, codepoint);
                if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
                    xmlRegStatePtr to = exec->comp->states[trans->to];

                    /*
                     * this is a multiple input sequence
                     */
                    if (exec->state->nbTrans > exec->transno + 1) {
                        xmlFARegExecSave(exec);
                    }
                    exec->transcount = 1;
                    do {
                        /*
                         * Try to progress as much as possible on the input
                         */
                        if (exec->transcount == atom->max) {
                            break;
                        }
                        exec->index += len;
                        /*
                         * End of input: stop here
                         */
                        if (exec->inputString[exec->index] == 0) {
                            exec->index -= len;
                            break;
                        }
                        if (exec->transcount >= atom->min) {
                            int transno = exec->transno;
                            xmlRegStatePtr state = exec->state;

                            /*
                             * The transition is acceptable save it
                             */
                            exec->transno = -1; /* trick */
                            exec->state = to;
                            xmlFARegExecSave(exec);
                            exec->transno = transno;
                            exec->state = state;
                        }
                        codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
                                              len);
                        ret = xmlRegCheckCharacter(atom, codepoint);
                        exec->transcount++;
                    } while (ret == 1);
                    if (exec->transcount < atom->min)
                        ret = 0;

                    /*
                     * If the last check failed but one transition was found
                     * possible, rollback
                     */
                    if (ret < 0)
                        ret = 0;
                    if (ret == 0) {
                        goto rollback;
                    }
                }
            }
            if (ret == 1) {
                if (exec->state->nbTrans > exec->transno + 1) {
                    xmlFARegExecSave(exec);
                }
                /*
                 * restart count for expressions like this ((abc){2})*
                 */
                if (trans->count >= 0) {
#ifdef DEBUG_REGEXP_EXEC
                    printf("Reset count %d\n", trans->count);
#endif
                    exec->counts[trans->count] = 0;
                }
                if (trans->counter >= 0) {
#ifdef DEBUG_REGEXP_EXEC
                    printf("Increasing count %d\n", trans->counter);
#endif
                    exec->counts[trans->counter]++;
                }
#ifdef DEBUG_REGEXP_EXEC
                printf("entering state %d\n", trans->to);
#endif
                exec->state = exec->comp->states[trans->to];
                exec->transno = 0;
                if (trans->atom != NULL) {
                    exec->index += len;
                }
                goto progress;
            } else if (ret < 0) {
                exec->status = -4;
                break;
            }
        }
        if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
rollback:
            /*
             * Failed to find a way out
             */
            exec->determinist = 0;
            xmlFARegExecRollBack(exec);
        }
progress:
        continue;
    }
    /* report the outcome the same way the string based variant does */
    if (exec->status == 0)
        return(exec->state->type == XML_REGEXP_FINAL_STATE);
    return(exec->status);
}
#endif
4650 /************************************************************************
4651  *									*
4652  *	Parser for the Schemas Datatype Regular Expressions		*
4653  *	http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs	*
4654  *									*
4655  ************************************************************************/
4656 
4657 /**
4658  * xmlFAIsChar:
4659  * @ctxt:  a regexp parser context
4660  *
4661  * [10]   Char   ::=   [^.\?*+()|#x5B#x5D]
4662  */
4663 static int
xmlFAIsChar(xmlRegParserCtxtPtr ctxt)4664 xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
4665     int cur;
4666     int len;
4667 
4668     cur = CUR_SCHAR(ctxt->cur, len);
4669     if ((cur == '.') || (cur == '\\') || (cur == '?') ||
4670 	(cur == '*') || (cur == '+') || (cur == '(') ||
4671 	(cur == ')') || (cur == '|') || (cur == 0x5B) ||
4672 	(cur == 0x5D) || (cur == 0))
4673 	return(-1);
4674     return(cur);
4675 }
4676 
4677 /**
4678  * xmlFAParseCharProp:
4679  * @ctxt:  a regexp parser context
4680  *
4681  * [27]   charProp   ::=   IsCategory | IsBlock
4682  * [28]   IsCategory ::= Letters | Marks | Numbers | Punctuation |
4683  *                       Separators | Symbols | Others
4684  * [29]   Letters   ::=   'L' [ultmo]?
4685  * [30]   Marks   ::=   'M' [nce]?
4686  * [31]   Numbers   ::=   'N' [dlo]?
4687  * [32]   Punctuation   ::=   'P' [cdseifo]?
4688  * [33]   Separators   ::=   'Z' [slp]?
4689  * [34]   Symbols   ::=   'S' [mcko]?
4690  * [35]   Others   ::=   'C' [cfon]?
4691  * [36]   IsBlock   ::=   'Is' [a-zA-Z0-9#x2D]+
4692  */
4693 static void
xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt)4694 xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
4695     int cur;
4696     xmlRegAtomType type = (xmlRegAtomType) 0;
4697     xmlChar *blockName = NULL;
4698 
4699     cur = CUR;
4700     if (cur == 'L') {
4701 	NEXT;
4702 	cur = CUR;
4703 	if (cur == 'u') {
4704 	    NEXT;
4705 	    type = XML_REGEXP_LETTER_UPPERCASE;
4706 	} else if (cur == 'l') {
4707 	    NEXT;
4708 	    type = XML_REGEXP_LETTER_LOWERCASE;
4709 	} else if (cur == 't') {
4710 	    NEXT;
4711 	    type = XML_REGEXP_LETTER_TITLECASE;
4712 	} else if (cur == 'm') {
4713 	    NEXT;
4714 	    type = XML_REGEXP_LETTER_MODIFIER;
4715 	} else if (cur == 'o') {
4716 	    NEXT;
4717 	    type = XML_REGEXP_LETTER_OTHERS;
4718 	} else {
4719 	    type = XML_REGEXP_LETTER;
4720 	}
4721     } else if (cur == 'M') {
4722 	NEXT;
4723 	cur = CUR;
4724 	if (cur == 'n') {
4725 	    NEXT;
4726 	    /* nonspacing */
4727 	    type = XML_REGEXP_MARK_NONSPACING;
4728 	} else if (cur == 'c') {
4729 	    NEXT;
4730 	    /* spacing combining */
4731 	    type = XML_REGEXP_MARK_SPACECOMBINING;
4732 	} else if (cur == 'e') {
4733 	    NEXT;
4734 	    /* enclosing */
4735 	    type = XML_REGEXP_MARK_ENCLOSING;
4736 	} else {
4737 	    /* all marks */
4738 	    type = XML_REGEXP_MARK;
4739 	}
4740     } else if (cur == 'N') {
4741 	NEXT;
4742 	cur = CUR;
4743 	if (cur == 'd') {
4744 	    NEXT;
4745 	    /* digital */
4746 	    type = XML_REGEXP_NUMBER_DECIMAL;
4747 	} else if (cur == 'l') {
4748 	    NEXT;
4749 	    /* letter */
4750 	    type = XML_REGEXP_NUMBER_LETTER;
4751 	} else if (cur == 'o') {
4752 	    NEXT;
4753 	    /* other */
4754 	    type = XML_REGEXP_NUMBER_OTHERS;
4755 	} else {
4756 	    /* all numbers */
4757 	    type = XML_REGEXP_NUMBER;
4758 	}
4759     } else if (cur == 'P') {
4760 	NEXT;
4761 	cur = CUR;
4762 	if (cur == 'c') {
4763 	    NEXT;
4764 	    /* connector */
4765 	    type = XML_REGEXP_PUNCT_CONNECTOR;
4766 	} else if (cur == 'd') {
4767 	    NEXT;
4768 	    /* dash */
4769 	    type = XML_REGEXP_PUNCT_DASH;
4770 	} else if (cur == 's') {
4771 	    NEXT;
4772 	    /* open */
4773 	    type = XML_REGEXP_PUNCT_OPEN;
4774 	} else if (cur == 'e') {
4775 	    NEXT;
4776 	    /* close */
4777 	    type = XML_REGEXP_PUNCT_CLOSE;
4778 	} else if (cur == 'i') {
4779 	    NEXT;
4780 	    /* initial quote */
4781 	    type = XML_REGEXP_PUNCT_INITQUOTE;
4782 	} else if (cur == 'f') {
4783 	    NEXT;
4784 	    /* final quote */
4785 	    type = XML_REGEXP_PUNCT_FINQUOTE;
4786 	} else if (cur == 'o') {
4787 	    NEXT;
4788 	    /* other */
4789 	    type = XML_REGEXP_PUNCT_OTHERS;
4790 	} else {
4791 	    /* all punctuation */
4792 	    type = XML_REGEXP_PUNCT;
4793 	}
4794     } else if (cur == 'Z') {
4795 	NEXT;
4796 	cur = CUR;
4797 	if (cur == 's') {
4798 	    NEXT;
4799 	    /* space */
4800 	    type = XML_REGEXP_SEPAR_SPACE;
4801 	} else if (cur == 'l') {
4802 	    NEXT;
4803 	    /* line */
4804 	    type = XML_REGEXP_SEPAR_LINE;
4805 	} else if (cur == 'p') {
4806 	    NEXT;
4807 	    /* paragraph */
4808 	    type = XML_REGEXP_SEPAR_PARA;
4809 	} else {
4810 	    /* all separators */
4811 	    type = XML_REGEXP_SEPAR;
4812 	}
4813     } else if (cur == 'S') {
4814 	NEXT;
4815 	cur = CUR;
4816 	if (cur == 'm') {
4817 	    NEXT;
4818 	    type = XML_REGEXP_SYMBOL_MATH;
4819 	    /* math */
4820 	} else if (cur == 'c') {
4821 	    NEXT;
4822 	    type = XML_REGEXP_SYMBOL_CURRENCY;
4823 	    /* currency */
4824 	} else if (cur == 'k') {
4825 	    NEXT;
4826 	    type = XML_REGEXP_SYMBOL_MODIFIER;
4827 	    /* modifiers */
4828 	} else if (cur == 'o') {
4829 	    NEXT;
4830 	    type = XML_REGEXP_SYMBOL_OTHERS;
4831 	    /* other */
4832 	} else {
4833 	    /* all symbols */
4834 	    type = XML_REGEXP_SYMBOL;
4835 	}
4836     } else if (cur == 'C') {
4837 	NEXT;
4838 	cur = CUR;
4839 	if (cur == 'c') {
4840 	    NEXT;
4841 	    /* control */
4842 	    type = XML_REGEXP_OTHER_CONTROL;
4843 	} else if (cur == 'f') {
4844 	    NEXT;
4845 	    /* format */
4846 	    type = XML_REGEXP_OTHER_FORMAT;
4847 	} else if (cur == 'o') {
4848 	    NEXT;
4849 	    /* private use */
4850 	    type = XML_REGEXP_OTHER_PRIVATE;
4851 	} else if (cur == 'n') {
4852 	    NEXT;
4853 	    /* not assigned */
4854 	    type = XML_REGEXP_OTHER_NA;
4855 	} else {
4856 	    /* all others */
4857 	    type = XML_REGEXP_OTHER;
4858 	}
4859     } else if (cur == 'I') {
4860 	const xmlChar *start;
4861 	NEXT;
4862 	cur = CUR;
4863 	if (cur != 's') {
4864 	    ERROR("IsXXXX expected");
4865 	    return;
4866 	}
4867 	NEXT;
4868 	start = ctxt->cur;
4869 	cur = CUR;
4870 	if (((cur >= 'a') && (cur <= 'z')) ||
4871 	    ((cur >= 'A') && (cur <= 'Z')) ||
4872 	    ((cur >= '0') && (cur <= '9')) ||
4873 	    (cur == 0x2D)) {
4874 	    NEXT;
4875 	    cur = CUR;
4876 	    while (((cur >= 'a') && (cur <= 'z')) ||
4877 		((cur >= 'A') && (cur <= 'Z')) ||
4878 		((cur >= '0') && (cur <= '9')) ||
4879 		(cur == 0x2D)) {
4880 		NEXT;
4881 		cur = CUR;
4882 	    }
4883 	}
4884 	type = XML_REGEXP_BLOCK_NAME;
4885 	blockName = xmlStrndup(start, ctxt->cur - start);
4886     } else {
4887 	ERROR("Unknown char property");
4888 	return;
4889     }
4890     if (ctxt->atom == NULL) {
4891 	ctxt->atom = xmlRegNewAtom(ctxt, type);
4892         if (ctxt->atom == NULL) {
4893             xmlFree(blockName);
4894             return;
4895         }
4896 	ctxt->atom->valuep = blockName;
4897     } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4898         if (xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4899                                type, 0, 0, blockName) == NULL) {
4900             xmlFree(blockName);
4901         }
4902     }
4903 }
4904 
/*
 * Reads the four hexadecimal digits that follow the current character
 * and returns the 16-bit value they encode. On return ctxt->cur is left
 * on the last digit read. If one of the characters is not a hex digit,
 * a compile error is raised and -1 is returned.
 */
static int parse_escaped_codeunit(xmlRegParserCtxtPtr ctxt)
{
    int unit = 0;
    int remaining;

    for (remaining = 4; remaining > 0; remaining--) {
        int c;
        int nibble;

        NEXT;
        c = CUR;
        if ((c >= '0') && (c <= '9')) {
            nibble = c - '0';
        } else if ((c >= 'A') && (c <= 'F')) {
            nibble = c - 'A' + 10;
        } else if ((c >= 'a') && (c <= 'f')) {
            nibble = c - 'a' + 10;
        } else {
            ERROR("Expecting hex digit");
            return -1;
        }
        unit = (unit * 16) + nibble;
    }
    return unit;
}
4925 
/*
 * Decodes the hex digits of a \uXXXX escape (ctxt->cur is on the 'u')
 * into a code point. A high surrogate (0xD800-0xDBFF) must be followed
 * by a second \uXXXX escape holding a low surrogate (0xDC00-0xDFFF); the
 * pair is then combined into a single supplementary code point. Any
 * other value, including -1 from a failed hex parse, is returned as is.
 * Returns -1 after raising a compile error when the pair is incomplete.
 */
static int parse_escaped_codepoint(xmlRegParserCtxtPtr ctxt)
{
    int high = parse_escaped_codeunit(ctxt);
    int low;

    if ((high < 0xD800) || (high > 0xDBFF))
        return high;

    /* a second escape must supply the low half of the pair */
    NEXT;
    if (CUR == '\\') {
        NEXT;
        if (CUR == 'u') {
            low = parse_escaped_codeunit(ctxt);
            if ((low >= 0xDC00) && (low <= 0xDFFF))
                return 0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00);
        }
    }
    ERROR("Invalid low surrogate pair code unit");
    return -1;
}
4945 
4946 /**
4947  * xmlFAParseCharClassEsc:
4948  * @ctxt:  a regexp parser context
4949  *
4950  * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
4951  * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
4952  * [25] catEsc   ::=   '\p{' charProp '}'
4953  * [26] complEsc ::=   '\P{' charProp '}'
4954  * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
4955  */
4956 static void
xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt)4957 xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
4958     int cur;
4959 
4960     if (CUR == '.') {
4961 	if (ctxt->atom == NULL) {
4962 	    ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
4963 	} else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4964 	    xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4965 			       XML_REGEXP_ANYCHAR, 0, 0, NULL);
4966 	}
4967 	NEXT;
4968 	return;
4969     }
4970     if (CUR != '\\') {
4971 	ERROR("Escaped sequence: expecting \\");
4972 	return;
4973     }
4974     NEXT;
4975     cur = CUR;
4976     if (cur == 'p') {
4977 	NEXT;
4978 	if (CUR != '{') {
4979 	    ERROR("Expecting '{'");
4980 	    return;
4981 	}
4982 	NEXT;
4983 	xmlFAParseCharProp(ctxt);
4984 	if (CUR != '}') {
4985 	    ERROR("Expecting '}'");
4986 	    return;
4987 	}
4988 	NEXT;
4989     } else if (cur == 'P') {
4990 	NEXT;
4991 	if (CUR != '{') {
4992 	    ERROR("Expecting '{'");
4993 	    return;
4994 	}
4995 	NEXT;
4996 	xmlFAParseCharProp(ctxt);
4997         if (ctxt->atom != NULL)
4998 	    ctxt->atom->neg = 1;
4999 	if (CUR != '}') {
5000 	    ERROR("Expecting '}'");
5001 	    return;
5002 	}
5003 	NEXT;
5004     } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
5005 	(cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
5006 	(cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
5007 	(cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
5008 	(cur == 0x5E) ||
5009 
5010 	/* Non-standard escape sequences:
5011 	 *                  Java 1.8|.NET Core 3.1|MSXML 6 */
5012 	(cur == '!') ||     /*   +  |     +       |    +   */
5013 	(cur == '"') ||     /*   +  |     +       |    +   */
5014 	(cur == '#') ||     /*   +  |     +       |    +   */
5015 	(cur == '$') ||     /*   +  |     +       |    +   */
5016 	(cur == '%') ||     /*   +  |     +       |    +   */
5017 	(cur == ',') ||     /*   +  |     +       |    +   */
5018 	(cur == '/') ||     /*   +  |     +       |    +   */
5019 	(cur == ':') ||     /*   +  |     +       |    +   */
5020 	(cur == ';') ||     /*   +  |     +       |    +   */
5021 	(cur == '=') ||     /*   +  |     +       |    +   */
5022 	(cur == '>') ||     /*      |     +       |    +   */
5023 	(cur == '@') ||     /*   +  |     +       |    +   */
5024 	(cur == '`') ||     /*   +  |     +       |    +   */
5025 	(cur == '~') ||     /*   +  |     +       |    +   */
5026 	(cur == 'u')) {     /*      |     +       |    +   */
5027 	if (ctxt->atom == NULL) {
5028 	    ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
5029 	    if (ctxt->atom != NULL) {
5030 	        switch (cur) {
5031 		    case 'n':
5032 		        ctxt->atom->codepoint = '\n';
5033 			break;
5034 		    case 'r':
5035 		        ctxt->atom->codepoint = '\r';
5036 			break;
5037 		    case 't':
5038 		        ctxt->atom->codepoint = '\t';
5039 			break;
5040 		    case 'u':
5041 			cur = parse_escaped_codepoint(ctxt);
5042 			if (cur < 0) {
5043 			    return;
5044 			}
5045 			ctxt->atom->codepoint = cur;
5046 			break;
5047 		    default:
5048 			ctxt->atom->codepoint = cur;
5049 		}
5050 	    }
5051 	} else if (ctxt->atom->type == XML_REGEXP_RANGES) {
5052             switch (cur) {
5053                 case 'n':
5054                     cur = '\n';
5055                     break;
5056                 case 'r':
5057                     cur = '\r';
5058                     break;
5059                 case 't':
5060                     cur = '\t';
5061                     break;
5062             }
5063 	    xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5064 			       XML_REGEXP_CHARVAL, cur, cur, NULL);
5065 	}
5066 	NEXT;
5067     } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
5068 	(cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
5069 	(cur == 'w') || (cur == 'W')) {
5070 	xmlRegAtomType type = XML_REGEXP_ANYSPACE;
5071 
5072 	switch (cur) {
5073 	    case 's':
5074 		type = XML_REGEXP_ANYSPACE;
5075 		break;
5076 	    case 'S':
5077 		type = XML_REGEXP_NOTSPACE;
5078 		break;
5079 	    case 'i':
5080 		type = XML_REGEXP_INITNAME;
5081 		break;
5082 	    case 'I':
5083 		type = XML_REGEXP_NOTINITNAME;
5084 		break;
5085 	    case 'c':
5086 		type = XML_REGEXP_NAMECHAR;
5087 		break;
5088 	    case 'C':
5089 		type = XML_REGEXP_NOTNAMECHAR;
5090 		break;
5091 	    case 'd':
5092 		type = XML_REGEXP_DECIMAL;
5093 		break;
5094 	    case 'D':
5095 		type = XML_REGEXP_NOTDECIMAL;
5096 		break;
5097 	    case 'w':
5098 		type = XML_REGEXP_REALCHAR;
5099 		break;
5100 	    case 'W':
5101 		type = XML_REGEXP_NOTREALCHAR;
5102 		break;
5103 	}
5104 	NEXT;
5105 	if (ctxt->atom == NULL) {
5106 	    ctxt->atom = xmlRegNewAtom(ctxt, type);
5107 	} else if (ctxt->atom->type == XML_REGEXP_RANGES) {
5108 	    xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5109 			       type, 0, 0, NULL);
5110 	}
5111     } else {
5112 	ERROR("Wrong escape sequence, misuse of character '\\'");
5113     }
5114 }
5115 
5116 /**
5117  * xmlFAParseCharRange:
5118  * @ctxt:  a regexp parser context
5119  *
5120  * [17]   charRange   ::=     seRange | XmlCharRef | XmlCharIncDash
5121  * [18]   seRange   ::=   charOrEsc '-' charOrEsc
5122  * [20]   charOrEsc   ::=   XmlChar | SingleCharEsc
5123  * [21]   XmlChar   ::=   [^\#x2D#x5B#x5D]
5124  * [22]   XmlCharIncDash   ::=   [^\#x5B#x5D]
5125  */
5126 static void
xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt)5127 xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
5128     int cur, len;
5129     int start = -1;
5130     int end = -1;
5131 
5132     if (CUR == '\0') {
5133         ERROR("Expecting ']'");
5134 	return;
5135     }
5136 
5137     cur = CUR;
5138     if (cur == '\\') {
5139 	NEXT;
5140 	cur = CUR;
5141 	switch (cur) {
5142 	    case 'n': start = 0xA; break;
5143 	    case 'r': start = 0xD; break;
5144 	    case 't': start = 0x9; break;
5145 	    case '\\': case '|': case '.': case '-': case '^': case '?':
5146 	    case '*': case '+': case '{': case '}': case '(': case ')':
5147 	    case '[': case ']':
5148 		start = cur; break;
5149 	    default:
5150 		ERROR("Invalid escape value");
5151 		return;
5152 	}
5153 	end = start;
5154         len = 1;
5155     } else if ((cur != 0x5B) && (cur != 0x5D)) {
5156         end = start = CUR_SCHAR(ctxt->cur, len);
5157     } else {
5158 	ERROR("Expecting a char range");
5159 	return;
5160     }
5161     /*
5162      * Since we are "inside" a range, we can assume ctxt->cur is past
5163      * the start of ctxt->string, and PREV should be safe
5164      */
5165     if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) {
5166 	NEXTL(len);
5167 	return;
5168     }
5169     NEXTL(len);
5170     cur = CUR;
5171     if ((cur != '-') || (NXT(1) == '[') || (NXT(1) == ']')) {
5172         xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5173 		              XML_REGEXP_CHARVAL, start, end, NULL);
5174 	return;
5175     }
5176     NEXT;
5177     cur = CUR;
5178     if (cur == '\\') {
5179 	NEXT;
5180 	cur = CUR;
5181 	switch (cur) {
5182 	    case 'n': end = 0xA; break;
5183 	    case 'r': end = 0xD; break;
5184 	    case 't': end = 0x9; break;
5185 	    case '\\': case '|': case '.': case '-': case '^': case '?':
5186 	    case '*': case '+': case '{': case '}': case '(': case ')':
5187 	    case '[': case ']':
5188 		end = cur; break;
5189 	    default:
5190 		ERROR("Invalid escape value");
5191 		return;
5192 	}
5193         len = 1;
5194     } else if ((cur != '\0') && (cur != 0x5B) && (cur != 0x5D)) {
5195         end = CUR_SCHAR(ctxt->cur, len);
5196     } else {
5197 	ERROR("Expecting the end of a char range");
5198 	return;
5199     }
5200 
5201     /* TODO check that the values are acceptable character ranges for XML */
5202     if (end < start) {
5203 	ERROR("End of range is before start of range");
5204     } else {
5205         NEXTL(len);
5206         xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5207 		           XML_REGEXP_CHARVAL, start, end, NULL);
5208     }
5209     return;
5210 }
5211 
5212 /**
5213  * xmlFAParsePosCharGroup:
5214  * @ctxt:  a regexp parser context
5215  *
5216  * [14]   posCharGroup ::= ( charRange | charClassEsc  )+
5217  */
5218 static void
xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt)5219 xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
5220     do {
5221 	if (CUR == '\\') {
5222 	    xmlFAParseCharClassEsc(ctxt);
5223 	} else {
5224 	    xmlFAParseCharRange(ctxt);
5225 	}
5226     } while ((CUR != ']') && (CUR != '-') &&
5227              (CUR != 0) && (ctxt->error == 0));
5228 }
5229 
5230 /**
5231  * xmlFAParseCharGroup:
5232  * @ctxt:  a regexp parser context
5233  *
5234  * [13]   charGroup    ::= posCharGroup | negCharGroup | charClassSub
5235  * [15]   negCharGroup ::= '^' posCharGroup
5236  * [16]   charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
5237  * [12]   charClassExpr ::= '[' charGroup ']'
5238  */
5239 static void
xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt)5240 xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
5241     int neg = ctxt->neg;
5242 
5243     if (CUR == '^') {
5244 	NEXT;
5245 	ctxt->neg = !ctxt->neg;
5246 	xmlFAParsePosCharGroup(ctxt);
5247 	ctxt->neg = neg;
5248     }
5249     while ((CUR != ']') && (ctxt->error == 0)) {
5250 	if ((CUR == '-') && (NXT(1) == '[')) {
5251 	    NEXT;	/* eat the '-' */
5252 	    NEXT;	/* eat the '[' */
5253 	    ctxt->neg = 2;
5254 	    xmlFAParseCharGroup(ctxt);
5255 	    ctxt->neg = neg;
5256 	    if (CUR == ']') {
5257 		NEXT;
5258 	    } else {
5259 		ERROR("charClassExpr: ']' expected");
5260 	    }
5261 	    break;
5262 	} else {
5263 	    xmlFAParsePosCharGroup(ctxt);
5264 	}
5265     }
5266 }
5267 
5268 /**
5269  * xmlFAParseCharClass:
5270  * @ctxt:  a regexp parser context
5271  *
5272  * [11]   charClass   ::=     charClassEsc | charClassExpr
5273  * [12]   charClassExpr   ::=   '[' charGroup ']'
5274  */
5275 static void
xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt)5276 xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
5277     if (CUR == '[') {
5278 	NEXT;
5279 	ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
5280 	if (ctxt->atom == NULL)
5281 	    return;
5282 	xmlFAParseCharGroup(ctxt);
5283 	if (CUR == ']') {
5284 	    NEXT;
5285 	} else {
5286 	    ERROR("xmlFAParseCharClass: ']' expected");
5287 	}
5288     } else {
5289 	xmlFAParseCharClassEsc(ctxt);
5290     }
5291 }
5292 
5293 /**
5294  * xmlFAParseQuantExact:
5295  * @ctxt:  a regexp parser context
5296  *
5297  * [8]   QuantExact   ::=   [0-9]+
5298  *
5299  * Returns 0 if success or -1 in case of error
5300  */
5301 static int
xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt)5302 xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
5303     int ret = 0;
5304     int ok = 0;
5305     int overflow = 0;
5306 
5307     while ((CUR >= '0') && (CUR <= '9')) {
5308         if (ret > INT_MAX / 10) {
5309             overflow = 1;
5310         } else {
5311             int digit = CUR - '0';
5312 
5313             ret *= 10;
5314             if (ret > INT_MAX - digit)
5315                 overflow = 1;
5316             else
5317                 ret += digit;
5318         }
5319 	ok = 1;
5320 	NEXT;
5321     }
5322     if ((ok != 1) || (overflow == 1)) {
5323 	return(-1);
5324     }
5325     return(ret);
5326 }
5327 
5328 /**
5329  * xmlFAParseQuantifier:
5330  * @ctxt:  a regexp parser context
5331  *
5332  * [4]   quantifier   ::=   [?*+] | ( '{' quantity '}' )
5333  * [5]   quantity   ::=   quantRange | quantMin | QuantExact
5334  * [6]   quantRange   ::=   QuantExact ',' QuantExact
5335  * [7]   quantMin   ::=   QuantExact ','
5336  * [8]   QuantExact   ::=   [0-9]+
5337  */
5338 static int
xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt)5339 xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
5340     int cur;
5341 
5342     cur = CUR;
5343     if ((cur == '?') || (cur == '*') || (cur == '+')) {
5344 	if (ctxt->atom != NULL) {
5345 	    if (cur == '?')
5346 		ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
5347 	    else if (cur == '*')
5348 		ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
5349 	    else if (cur == '+')
5350 		ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
5351 	}
5352 	NEXT;
5353 	return(1);
5354     }
5355     if (cur == '{') {
5356 	int min = 0, max = 0;
5357 
5358 	NEXT;
5359 	cur = xmlFAParseQuantExact(ctxt);
5360 	if (cur >= 0)
5361 	    min = cur;
5362         else {
5363             ERROR("Improper quantifier");
5364         }
5365 	if (CUR == ',') {
5366 	    NEXT;
5367 	    if (CUR == '}')
5368 	        max = INT_MAX;
5369 	    else {
5370 	        cur = xmlFAParseQuantExact(ctxt);
5371 	        if (cur >= 0)
5372 		    max = cur;
5373 		else {
5374 		    ERROR("Improper quantifier");
5375 		}
5376 	    }
5377 	}
5378 	if (CUR == '}') {
5379 	    NEXT;
5380 	} else {
5381 	    ERROR("Unterminated quantifier");
5382 	}
5383 	if (max == 0)
5384 	    max = min;
5385 	if (ctxt->atom != NULL) {
5386 	    ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
5387 	    ctxt->atom->min = min;
5388 	    ctxt->atom->max = max;
5389 	}
5390 	return(1);
5391     }
5392     return(0);
5393 }
5394 
5395 /**
5396  * xmlFAParseAtom:
5397  * @ctxt:  a regexp parser context
5398  *
5399  * [9]   atom   ::=   Char | charClass | ( '(' regExp ')' )
5400  */
5401 static int
xmlFAParseAtom(xmlRegParserCtxtPtr ctxt)5402 xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
5403     int codepoint, len;
5404 
5405     codepoint = xmlFAIsChar(ctxt);
5406     if (codepoint > 0) {
5407 	ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
5408 	if (ctxt->atom == NULL)
5409 	    return(-1);
5410 	codepoint = CUR_SCHAR(ctxt->cur, len);
5411 	ctxt->atom->codepoint = codepoint;
5412 	NEXTL(len);
5413 	return(1);
5414     } else if (CUR == '|') {
5415 	return(0);
5416     } else if (CUR == 0) {
5417 	return(0);
5418     } else if (CUR == ')') {
5419 	return(0);
5420     } else if (CUR == '(') {
5421 	xmlRegStatePtr start, oldend, start0;
5422 
5423 	NEXT;
5424         if (ctxt->depth >= 50) {
5425 	    ERROR("xmlFAParseAtom: maximum nesting depth exceeded");
5426             return(-1);
5427         }
5428 	/*
5429 	 * this extra Epsilon transition is needed if we count with 0 allowed
5430 	 * unfortunately this can't be known at that point
5431 	 */
5432 	xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5433 	start0 = ctxt->state;
5434 	xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5435 	start = ctxt->state;
5436 	oldend = ctxt->end;
5437 	ctxt->end = NULL;
5438 	ctxt->atom = NULL;
5439         ctxt->depth++;
5440 	xmlFAParseRegExp(ctxt, 0);
5441         ctxt->depth--;
5442 	if (CUR == ')') {
5443 	    NEXT;
5444 	} else {
5445 	    ERROR("xmlFAParseAtom: expecting ')'");
5446 	}
5447 	ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
5448 	if (ctxt->atom == NULL)
5449 	    return(-1);
5450 	ctxt->atom->start = start;
5451 	ctxt->atom->start0 = start0;
5452 	ctxt->atom->stop = ctxt->state;
5453 	ctxt->end = oldend;
5454 	return(1);
5455     } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
5456 	xmlFAParseCharClass(ctxt);
5457 	return(1);
5458     }
5459     return(0);
5460 }
5461 
5462 /**
5463  * xmlFAParsePiece:
5464  * @ctxt:  a regexp parser context
5465  *
5466  * [3]   piece   ::=   atom quantifier?
5467  */
5468 static int
xmlFAParsePiece(xmlRegParserCtxtPtr ctxt)5469 xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
5470     int ret;
5471 
5472     ctxt->atom = NULL;
5473     ret = xmlFAParseAtom(ctxt);
5474     if (ret == 0)
5475 	return(0);
5476     if (ctxt->atom == NULL) {
5477 	ERROR("internal: no atom generated");
5478     }
5479     xmlFAParseQuantifier(ctxt);
5480     return(1);
5481 }
5482 
5483 /**
5484  * xmlFAParseBranch:
5485  * @ctxt:  a regexp parser context
5486  * @to: optional target to the end of the branch
5487  *
5488  * @to is used to optimize by removing duplicate path in automata
5489  * in expressions like (a|b)(c|d)
5490  *
5491  * [2]   branch   ::=   piece*
5492  */
5493 static int
xmlFAParseBranch(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr to)5494 xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) {
5495     xmlRegStatePtr previous;
5496     int ret;
5497 
5498     previous = ctxt->state;
5499     ret = xmlFAParsePiece(ctxt);
5500     if (ret == 0) {
5501         /* Empty branch */
5502 	xmlFAGenerateEpsilonTransition(ctxt, previous, to);
5503     } else {
5504 	if (xmlFAGenerateTransitions(ctxt, previous,
5505 	        (CUR=='|' || CUR==')' || CUR==0) ? to : NULL,
5506                 ctxt->atom) < 0) {
5507             xmlRegFreeAtom(ctxt->atom);
5508             ctxt->atom = NULL;
5509 	    return(-1);
5510         }
5511 	previous = ctxt->state;
5512 	ctxt->atom = NULL;
5513     }
5514     while ((ret != 0) && (ctxt->error == 0)) {
5515 	ret = xmlFAParsePiece(ctxt);
5516 	if (ret != 0) {
5517 	    if (xmlFAGenerateTransitions(ctxt, previous,
5518 	            (CUR=='|' || CUR==')' || CUR==0) ? to : NULL,
5519                     ctxt->atom) < 0) {
5520                 xmlRegFreeAtom(ctxt->atom);
5521                 ctxt->atom = NULL;
5522                 return(-1);
5523             }
5524 	    previous = ctxt->state;
5525 	    ctxt->atom = NULL;
5526 	}
5527     }
5528     return(0);
5529 }
5530 
5531 /**
5532  * xmlFAParseRegExp:
5533  * @ctxt:  a regexp parser context
5534  * @top:  is this the top-level expression ?
5535  *
5536  * [1]   regExp   ::=     branch  ( '|' branch )*
5537  */
5538 static void
xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt,int top)5539 xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
5540     xmlRegStatePtr start, end;
5541 
5542     /* if not top start should have been generated by an epsilon trans */
5543     start = ctxt->state;
5544     ctxt->end = NULL;
5545     xmlFAParseBranch(ctxt, NULL);
5546     if (top) {
5547 #ifdef DEBUG_REGEXP_GRAPH
5548 	printf("State %d is final\n", ctxt->state->no);
5549 #endif
5550 	ctxt->state->type = XML_REGEXP_FINAL_STATE;
5551     }
5552     if (CUR != '|') {
5553 	ctxt->end = ctxt->state;
5554 	return;
5555     }
5556     end = ctxt->state;
5557     while ((CUR == '|') && (ctxt->error == 0)) {
5558 	NEXT;
5559 	ctxt->state = start;
5560 	ctxt->end = NULL;
5561 	xmlFAParseBranch(ctxt, end);
5562     }
5563     if (!top) {
5564 	ctxt->state = end;
5565 	ctxt->end = end;
5566     }
5567 }
5568 
5569 /************************************************************************
5570  *									*
5571  *			The basic API					*
5572  *									*
5573  ************************************************************************/
5574 
5575 /**
5576  * xmlRegexpPrint:
5577  * @output: the file for the output debug
5578  * @regexp: the compiled regexp
5579  *
5580  * Print the content of the compiled regular expression
5581  */
5582 void
xmlRegexpPrint(FILE * output,xmlRegexpPtr regexp)5583 xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
5584     int i;
5585 
5586     if (output == NULL)
5587         return;
5588     fprintf(output, " regexp: ");
5589     if (regexp == NULL) {
5590 	fprintf(output, "NULL\n");
5591 	return;
5592     }
5593     fprintf(output, "'%s' ", regexp->string);
5594     fprintf(output, "\n");
5595     fprintf(output, "%d atoms:\n", regexp->nbAtoms);
5596     for (i = 0;i < regexp->nbAtoms; i++) {
5597 	fprintf(output, " %02d ", i);
5598 	xmlRegPrintAtom(output, regexp->atoms[i]);
5599     }
5600     fprintf(output, "%d states:", regexp->nbStates);
5601     fprintf(output, "\n");
5602     for (i = 0;i < regexp->nbStates; i++) {
5603 	xmlRegPrintState(output, regexp->states[i]);
5604     }
5605     fprintf(output, "%d counters:\n", regexp->nbCounters);
5606     for (i = 0;i < regexp->nbCounters; i++) {
5607 	fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
5608 		                                regexp->counters[i].max);
5609     }
5610 }
5611 
5612 /**
5613  * xmlRegexpCompile:
5614  * @regexp:  a regular expression string
5615  *
5616  * Parses a regular expression conforming to XML Schemas Part 2 Datatype
5617  * Appendix F and builds an automata suitable for testing strings against
5618  * that regular expression
5619  *
5620  * Returns the compiled expression or NULL in case of error
5621  */
5622 xmlRegexpPtr
xmlRegexpCompile(const xmlChar * regexp)5623 xmlRegexpCompile(const xmlChar *regexp) {
5624     xmlRegexpPtr ret = NULL;
5625     xmlRegParserCtxtPtr ctxt;
5626 
5627     ctxt = xmlRegNewParserCtxt(regexp);
5628     if (ctxt == NULL)
5629 	return(NULL);
5630 
5631     /* initialize the parser */
5632     ctxt->state = xmlRegStatePush(ctxt);
5633     if (ctxt->state == NULL)
5634         goto error;
5635     ctxt->start = ctxt->state;
5636     ctxt->end = NULL;
5637 
5638     /* parse the expression building an automata */
5639     xmlFAParseRegExp(ctxt, 1);
5640     if (CUR != 0) {
5641 	ERROR("xmlFAParseRegExp: extra characters");
5642     }
5643     if (ctxt->error != 0)
5644         goto error;
5645     ctxt->end = ctxt->state;
5646     ctxt->start->type = XML_REGEXP_START_STATE;
5647     ctxt->end->type = XML_REGEXP_FINAL_STATE;
5648 
5649     /* remove the Epsilon except for counted transitions */
5650     xmlFAEliminateEpsilonTransitions(ctxt);
5651 
5652 
5653     if (ctxt->error != 0)
5654         goto error;
5655     ret = xmlRegEpxFromParse(ctxt);
5656 
5657 error:
5658     xmlRegFreeParserCtxt(ctxt);
5659     return(ret);
5660 }
5661 
5662 /**
5663  * xmlRegexpExec:
5664  * @comp:  the compiled regular expression
5665  * @content:  the value to check against the regular expression
5666  *
5667  * Check if the regular expression generates the value
5668  *
5669  * Returns 1 if it matches, 0 if not and a negative value in case of error
5670  */
5671 int
xmlRegexpExec(xmlRegexpPtr comp,const xmlChar * content)5672 xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
5673     if ((comp == NULL) || (content == NULL))
5674 	return(-1);
5675     return(xmlFARegExec(comp, content));
5676 }
5677 
5678 /**
5679  * xmlRegexpIsDeterminist:
5680  * @comp:  the compiled regular expression
5681  *
5682  * Check if the regular expression is determinist
5683  *
5684  * Returns 1 if it yes, 0 if not and a negative value in case of error
5685  */
5686 int
xmlRegexpIsDeterminist(xmlRegexpPtr comp)5687 xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
5688     xmlAutomataPtr am;
5689     int ret;
5690 
5691     if (comp == NULL)
5692 	return(-1);
5693     if (comp->determinist != -1)
5694 	return(comp->determinist);
5695 
5696     am = xmlNewAutomata();
5697     if (am == NULL)
5698         return(-1);
5699     if (am->states != NULL) {
5700 	int i;
5701 
5702 	for (i = 0;i < am->nbStates;i++)
5703 	    xmlRegFreeState(am->states[i]);
5704 	xmlFree(am->states);
5705     }
5706     am->nbAtoms = comp->nbAtoms;
5707     am->atoms = comp->atoms;
5708     am->nbStates = comp->nbStates;
5709     am->states = comp->states;
5710     am->determinist = -1;
5711     am->flags = comp->flags;
5712     ret = xmlFAComputesDeterminism(am);
5713     am->atoms = NULL;
5714     am->states = NULL;
5715     xmlFreeAutomata(am);
5716     comp->determinist = ret;
5717     return(ret);
5718 }
5719 
5720 /**
5721  * xmlRegFreeRegexp:
5722  * @regexp:  the regexp
5723  *
5724  * Free a regexp
5725  */
5726 void
xmlRegFreeRegexp(xmlRegexpPtr regexp)5727 xmlRegFreeRegexp(xmlRegexpPtr regexp) {
5728     int i;
5729     if (regexp == NULL)
5730 	return;
5731 
5732     if (regexp->string != NULL)
5733 	xmlFree(regexp->string);
5734     if (regexp->states != NULL) {
5735 	for (i = 0;i < regexp->nbStates;i++)
5736 	    xmlRegFreeState(regexp->states[i]);
5737 	xmlFree(regexp->states);
5738     }
5739     if (regexp->atoms != NULL) {
5740 	for (i = 0;i < regexp->nbAtoms;i++)
5741 	    xmlRegFreeAtom(regexp->atoms[i]);
5742 	xmlFree(regexp->atoms);
5743     }
5744     if (regexp->counters != NULL)
5745 	xmlFree(regexp->counters);
5746     if (regexp->compact != NULL)
5747 	xmlFree(regexp->compact);
5748     if (regexp->transdata != NULL)
5749 	xmlFree(regexp->transdata);
5750     if (regexp->stringMap != NULL) {
5751 	for (i = 0; i < regexp->nbstrings;i++)
5752 	    xmlFree(regexp->stringMap[i]);
5753 	xmlFree(regexp->stringMap);
5754     }
5755 
5756     xmlFree(regexp);
5757 }
5758 
5759 #ifdef LIBXML_AUTOMATA_ENABLED
5760 /************************************************************************
5761  *									*
5762  *			The Automata interface				*
5763  *									*
5764  ************************************************************************/
5765 
5766 /**
5767  * xmlNewAutomata:
5768  *
5769  * Create a new automata
5770  *
5771  * Returns the new object or NULL in case of failure
5772  */
5773 xmlAutomataPtr
xmlNewAutomata(void)5774 xmlNewAutomata(void) {
5775     xmlAutomataPtr ctxt;
5776 
5777     ctxt = xmlRegNewParserCtxt(NULL);
5778     if (ctxt == NULL)
5779 	return(NULL);
5780 
5781     /* initialize the parser */
5782     ctxt->state = xmlRegStatePush(ctxt);
5783     if (ctxt->state == NULL) {
5784 	xmlFreeAutomata(ctxt);
5785 	return(NULL);
5786     }
5787     ctxt->start = ctxt->state;
5788     ctxt->end = NULL;
5789 
5790     ctxt->start->type = XML_REGEXP_START_STATE;
5791     ctxt->flags = 0;
5792 
5793     return(ctxt);
5794 }
5795 
5796 /**
5797  * xmlFreeAutomata:
5798  * @am: an automata
5799  *
5800  * Free an automata
5801  */
5802 void
xmlFreeAutomata(xmlAutomataPtr am)5803 xmlFreeAutomata(xmlAutomataPtr am) {
5804     if (am == NULL)
5805 	return;
5806     xmlRegFreeParserCtxt(am);
5807 }
5808 
5809 /**
5810  * xmlAutomataSetFlags:
5811  * @am: an automata
5812  * @flags:  a set of internal flags
5813  *
5814  * Set some flags on the automata
5815  */
5816 void
xmlAutomataSetFlags(xmlAutomataPtr am,int flags)5817 xmlAutomataSetFlags(xmlAutomataPtr am, int flags) {
5818     if (am == NULL)
5819 	return;
5820     am->flags |= flags;
5821 }
5822 
5823 /**
5824  * xmlAutomataGetInitState:
5825  * @am: an automata
5826  *
5827  * Initial state lookup
5828  *
5829  * Returns the initial state of the automata
5830  */
5831 xmlAutomataStatePtr
xmlAutomataGetInitState(xmlAutomataPtr am)5832 xmlAutomataGetInitState(xmlAutomataPtr am) {
5833     if (am == NULL)
5834 	return(NULL);
5835     return(am->start);
5836 }
5837 
5838 /**
5839  * xmlAutomataSetFinalState:
5840  * @am: an automata
5841  * @state: a state in this automata
5842  *
5843  * Makes that state a final state
5844  *
5845  * Returns 0 or -1 in case of error
5846  */
5847 int
xmlAutomataSetFinalState(xmlAutomataPtr am,xmlAutomataStatePtr state)5848 xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
5849     if ((am == NULL) || (state == NULL))
5850 	return(-1);
5851     state->type = XML_REGEXP_FINAL_STATE;
5852     return(0);
5853 }
5854 
5855 /**
5856  * xmlAutomataNewTransition:
5857  * @am: an automata
5858  * @from: the starting point of the transition
5859  * @to: the target point of the transition or NULL
5860  * @token: the input string associated to that transition
5861  * @data: data passed to the callback function if the transition is activated
5862  *
5863  * If @to is NULL, this creates first a new target state in the automata
5864  * and then adds a transition from the @from state to the target state
5865  * activated by the value of @token
5866  *
5867  * Returns the target state or NULL in case of error
5868  */
5869 xmlAutomataStatePtr
xmlAutomataNewTransition(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,void * data)5870 xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
5871 			 xmlAutomataStatePtr to, const xmlChar *token,
5872 			 void *data) {
5873     xmlRegAtomPtr atom;
5874 
5875     if ((am == NULL) || (from == NULL) || (token == NULL))
5876 	return(NULL);
5877     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5878     if (atom == NULL)
5879         return(NULL);
5880     atom->data = data;
5881     atom->valuep = xmlStrdup(token);
5882 
5883     if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5884         xmlRegFreeAtom(atom);
5885 	return(NULL);
5886     }
5887     if (to == NULL)
5888 	return(am->state);
5889     return(to);
5890 }
5891 
5892 /**
5893  * xmlAutomataNewTransition2:
5894  * @am: an automata
5895  * @from: the starting point of the transition
5896  * @to: the target point of the transition or NULL
5897  * @token: the first input string associated to that transition
5898  * @token2: the second input string associated to that transition
5899  * @data: data passed to the callback function if the transition is activated
5900  *
5901  * If @to is NULL, this creates first a new target state in the automata
5902  * and then adds a transition from the @from state to the target state
5903  * activated by the value of @token
5904  *
5905  * Returns the target state or NULL in case of error
5906  */
5907 xmlAutomataStatePtr
xmlAutomataNewTransition2(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,void * data)5908 xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5909 			  xmlAutomataStatePtr to, const xmlChar *token,
5910 			  const xmlChar *token2, void *data) {
5911     xmlRegAtomPtr atom;
5912 
5913     if ((am == NULL) || (from == NULL) || (token == NULL))
5914 	return(NULL);
5915     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5916     if (atom == NULL)
5917 	return(NULL);
5918     atom->data = data;
5919     if ((token2 == NULL) || (*token2 == 0)) {
5920 	atom->valuep = xmlStrdup(token);
5921     } else {
5922 	int lenn, lenp;
5923 	xmlChar *str;
5924 
5925 	lenn = strlen((char *) token2);
5926 	lenp = strlen((char *) token);
5927 
5928 	str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5929 	if (str == NULL) {
5930 	    xmlRegFreeAtom(atom);
5931 	    return(NULL);
5932 	}
5933 	memcpy(&str[0], token, lenp);
5934 	str[lenp] = '|';
5935 	memcpy(&str[lenp + 1], token2, lenn);
5936 	str[lenn + lenp + 1] = 0;
5937 
5938 	atom->valuep = str;
5939     }
5940 
5941     if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5942         xmlRegFreeAtom(atom);
5943 	return(NULL);
5944     }
5945     if (to == NULL)
5946 	return(am->state);
5947     return(to);
5948 }
5949 
5950 /**
5951  * xmlAutomataNewNegTrans:
5952  * @am: an automata
5953  * @from: the starting point of the transition
5954  * @to: the target point of the transition or NULL
5955  * @token: the first input string associated to that transition
5956  * @token2: the second input string associated to that transition
5957  * @data: data passed to the callback function if the transition is activated
5958  *
5959  * If @to is NULL, this creates first a new target state in the automata
5960  * and then adds a transition from the @from state to the target state
5961  * activated by any value except (@token,@token2)
5962  * Note that if @token2 is not NULL, then (X, NULL) won't match to follow
5963  # the semantic of XSD ##other
5964  *
5965  * Returns the target state or NULL in case of error
5966  */
5967 xmlAutomataStatePtr
xmlAutomataNewNegTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,void * data)5968 xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5969 		       xmlAutomataStatePtr to, const xmlChar *token,
5970 		       const xmlChar *token2, void *data) {
5971     xmlRegAtomPtr atom;
5972     xmlChar err_msg[200];
5973 
5974     if ((am == NULL) || (from == NULL) || (token == NULL))
5975 	return(NULL);
5976     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5977     if (atom == NULL)
5978 	return(NULL);
5979     atom->data = data;
5980     atom->neg = 1;
5981     if ((token2 == NULL) || (*token2 == 0)) {
5982 	atom->valuep = xmlStrdup(token);
5983     } else {
5984 	int lenn, lenp;
5985 	xmlChar *str;
5986 
5987 	lenn = strlen((char *) token2);
5988 	lenp = strlen((char *) token);
5989 
5990 	str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5991 	if (str == NULL) {
5992 	    xmlRegFreeAtom(atom);
5993 	    return(NULL);
5994 	}
5995 	memcpy(&str[0], token, lenp);
5996 	str[lenp] = '|';
5997 	memcpy(&str[lenp + 1], token2, lenn);
5998 	str[lenn + lenp + 1] = 0;
5999 
6000 	atom->valuep = str;
6001     }
6002     snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep);
6003     err_msg[199] = 0;
6004     atom->valuep2 = xmlStrdup(err_msg);
6005 
6006     if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
6007         xmlRegFreeAtom(atom);
6008 	return(NULL);
6009     }
6010     am->negs++;
6011     if (to == NULL)
6012 	return(am->state);
6013     return(to);
6014 }
6015 
6016 /**
6017  * xmlAutomataNewCountTrans2:
6018  * @am: an automata
6019  * @from: the starting point of the transition
6020  * @to: the target point of the transition or NULL
6021  * @token: the input string associated to that transition
6022  * @token2: the second input string associated to that transition
6023  * @min:  the minimum successive occurrences of token
6024  * @max:  the maximum successive occurrences of token
6025  * @data:  data associated to the transition
6026  *
6027  * If @to is NULL, this creates first a new target state in the automata
6028  * and then adds a transition from the @from state to the target state
6029  * activated by a succession of input of value @token and @token2 and
6030  * whose number is between @min and @max
6031  *
6032  * Returns the target state or NULL in case of error
6033  */
6034 xmlAutomataStatePtr
xmlAutomataNewCountTrans2(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,int min,int max,void * data)6035 xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
6036 			 xmlAutomataStatePtr to, const xmlChar *token,
6037 			 const xmlChar *token2,
6038 			 int min, int max, void *data) {
6039     xmlRegAtomPtr atom;
6040     int counter;
6041 
6042     if ((am == NULL) || (from == NULL) || (token == NULL))
6043 	return(NULL);
6044     if (min < 0)
6045 	return(NULL);
6046     if ((max < min) || (max < 1))
6047 	return(NULL);
6048     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6049     if (atom == NULL)
6050 	return(NULL);
6051     if ((token2 == NULL) || (*token2 == 0)) {
6052 	atom->valuep = xmlStrdup(token);
6053         if (atom->valuep == NULL)
6054             goto error;
6055     } else {
6056 	int lenn, lenp;
6057 	xmlChar *str;
6058 
6059 	lenn = strlen((char *) token2);
6060 	lenp = strlen((char *) token);
6061 
6062 	str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
6063 	if (str == NULL)
6064 	    goto error;
6065 	memcpy(&str[0], token, lenp);
6066 	str[lenp] = '|';
6067 	memcpy(&str[lenp + 1], token2, lenn);
6068 	str[lenn + lenp + 1] = 0;
6069 
6070 	atom->valuep = str;
6071     }
6072     atom->data = data;
6073     if (min == 0)
6074 	atom->min = 1;
6075     else
6076 	atom->min = min;
6077     atom->max = max;
6078 
6079     /*
6080      * associate a counter to the transition.
6081      */
6082     counter = xmlRegGetCounter(am);
6083     if (counter < 0)
6084         goto error;
6085     am->counters[counter].min = min;
6086     am->counters[counter].max = max;
6087 
6088     /* xmlFAGenerateTransitions(am, from, to, atom); */
6089     if (to == NULL) {
6090 	to = xmlRegStatePush(am);
6091         if (to == NULL)
6092             goto error;
6093     }
6094     xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6095     if (xmlRegAtomPush(am, atom) < 0)
6096         goto error;
6097     am->state = to;
6098 
6099     if (to == NULL)
6100 	to = am->state;
6101     if (to == NULL)
6102 	return(NULL);
6103     if (min == 0)
6104 	xmlFAGenerateEpsilonTransition(am, from, to);
6105     return(to);
6106 
6107 error:
6108     xmlRegFreeAtom(atom);
6109     return(NULL);
6110 }
6111 
6112 /**
6113  * xmlAutomataNewCountTrans:
6114  * @am: an automata
6115  * @from: the starting point of the transition
6116  * @to: the target point of the transition or NULL
6117  * @token: the input string associated to that transition
6118  * @min:  the minimum successive occurrences of token
6119  * @max:  the maximum successive occurrences of token
6120  * @data:  data associated to the transition
6121  *
6122  * If @to is NULL, this creates first a new target state in the automata
6123  * and then adds a transition from the @from state to the target state
6124  * activated by a succession of input of value @token and whose number
6125  * is between @min and @max
6126  *
6127  * Returns the target state or NULL in case of error
6128  */
6129 xmlAutomataStatePtr
xmlAutomataNewCountTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,int min,int max,void * data)6130 xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6131 			 xmlAutomataStatePtr to, const xmlChar *token,
6132 			 int min, int max, void *data) {
6133     xmlRegAtomPtr atom;
6134     int counter;
6135 
6136     if ((am == NULL) || (from == NULL) || (token == NULL))
6137 	return(NULL);
6138     if (min < 0)
6139 	return(NULL);
6140     if ((max < min) || (max < 1))
6141 	return(NULL);
6142     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6143     if (atom == NULL)
6144 	return(NULL);
6145     atom->valuep = xmlStrdup(token);
6146     if (atom->valuep == NULL)
6147         goto error;
6148     atom->data = data;
6149     if (min == 0)
6150 	atom->min = 1;
6151     else
6152 	atom->min = min;
6153     atom->max = max;
6154 
6155     /*
6156      * associate a counter to the transition.
6157      */
6158     counter = xmlRegGetCounter(am);
6159     if (counter < 0)
6160         goto error;
6161     am->counters[counter].min = min;
6162     am->counters[counter].max = max;
6163 
6164     /* xmlFAGenerateTransitions(am, from, to, atom); */
6165     if (to == NULL) {
6166 	to = xmlRegStatePush(am);
6167         if (to == NULL)
6168             goto error;
6169     }
6170     xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6171     if (xmlRegAtomPush(am, atom) < 0)
6172         goto error;
6173     am->state = to;
6174 
6175     if (to == NULL)
6176 	to = am->state;
6177     if (to == NULL)
6178 	return(NULL);
6179     if (min == 0)
6180 	xmlFAGenerateEpsilonTransition(am, from, to);
6181     return(to);
6182 
6183 error:
6184     xmlRegFreeAtom(atom);
6185     return(NULL);
6186 }
6187 
6188 /**
6189  * xmlAutomataNewOnceTrans2:
6190  * @am: an automata
6191  * @from: the starting point of the transition
6192  * @to: the target point of the transition or NULL
6193  * @token: the input string associated to that transition
6194  * @token2: the second input string associated to that transition
6195  * @min:  the minimum successive occurrences of token
6196  * @max:  the maximum successive occurrences of token
6197  * @data:  data associated to the transition
6198  *
6199  * If @to is NULL, this creates first a new target state in the automata
6200  * and then adds a transition from the @from state to the target state
6201  * activated by a succession of input of value @token and @token2 and whose
6202  * number is between @min and @max, moreover that transition can only be
6203  * crossed once.
6204  *
6205  * Returns the target state or NULL in case of error
6206  */
6207 xmlAutomataStatePtr
xmlAutomataNewOnceTrans2(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,int min,int max,void * data)6208 xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
6209 			 xmlAutomataStatePtr to, const xmlChar *token,
6210 			 const xmlChar *token2,
6211 			 int min, int max, void *data) {
6212     xmlRegAtomPtr atom;
6213     int counter;
6214 
6215     if ((am == NULL) || (from == NULL) || (token == NULL))
6216 	return(NULL);
6217     if (min < 1)
6218 	return(NULL);
6219     if (max < min)
6220 	return(NULL);
6221     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6222     if (atom == NULL)
6223 	return(NULL);
6224     if ((token2 == NULL) || (*token2 == 0)) {
6225 	atom->valuep = xmlStrdup(token);
6226         if (atom->valuep == NULL)
6227             goto error;
6228     } else {
6229 	int lenn, lenp;
6230 	xmlChar *str;
6231 
6232 	lenn = strlen((char *) token2);
6233 	lenp = strlen((char *) token);
6234 
6235 	str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
6236 	if (str == NULL)
6237 	    goto error;
6238 	memcpy(&str[0], token, lenp);
6239 	str[lenp] = '|';
6240 	memcpy(&str[lenp + 1], token2, lenn);
6241 	str[lenn + lenp + 1] = 0;
6242 
6243 	atom->valuep = str;
6244     }
6245     atom->data = data;
6246     atom->quant = XML_REGEXP_QUANT_ONCEONLY;
6247     atom->min = min;
6248     atom->max = max;
6249     /*
6250      * associate a counter to the transition.
6251      */
6252     counter = xmlRegGetCounter(am);
6253     if (counter < 0)
6254         goto error;
6255     am->counters[counter].min = 1;
6256     am->counters[counter].max = 1;
6257 
6258     /* xmlFAGenerateTransitions(am, from, to, atom); */
6259     if (to == NULL) {
6260 	to = xmlRegStatePush(am);
6261         if (to == NULL)
6262             goto error;
6263     }
6264     xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6265     if (xmlRegAtomPush(am, atom) < 0)
6266         goto error;
6267     am->state = to;
6268     return(to);
6269 
6270 error:
6271     xmlRegFreeAtom(atom);
6272     return(NULL);
6273 }
6274 
6275 
6276 
6277 /**
6278  * xmlAutomataNewOnceTrans:
6279  * @am: an automata
6280  * @from: the starting point of the transition
6281  * @to: the target point of the transition or NULL
6282  * @token: the input string associated to that transition
6283  * @min:  the minimum successive occurrences of token
6284  * @max:  the maximum successive occurrences of token
6285  * @data:  data associated to the transition
6286  *
6287  * If @to is NULL, this creates first a new target state in the automata
6288  * and then adds a transition from the @from state to the target state
6289  * activated by a succession of input of value @token and whose number
6290  * is between @min and @max, moreover that transition can only be crossed
6291  * once.
6292  *
6293  * Returns the target state or NULL in case of error
6294  */
6295 xmlAutomataStatePtr
xmlAutomataNewOnceTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,int min,int max,void * data)6296 xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6297 			 xmlAutomataStatePtr to, const xmlChar *token,
6298 			 int min, int max, void *data) {
6299     xmlRegAtomPtr atom;
6300     int counter;
6301 
6302     if ((am == NULL) || (from == NULL) || (token == NULL))
6303 	return(NULL);
6304     if (min < 1)
6305 	return(NULL);
6306     if (max < min)
6307 	return(NULL);
6308     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6309     if (atom == NULL)
6310 	return(NULL);
6311     atom->valuep = xmlStrdup(token);
6312     atom->data = data;
6313     atom->quant = XML_REGEXP_QUANT_ONCEONLY;
6314     atom->min = min;
6315     atom->max = max;
6316     /*
6317      * associate a counter to the transition.
6318      */
6319     counter = xmlRegGetCounter(am);
6320     if (counter < 0)
6321         goto error;
6322     am->counters[counter].min = 1;
6323     am->counters[counter].max = 1;
6324 
6325     /* xmlFAGenerateTransitions(am, from, to, atom); */
6326     if (to == NULL) {
6327 	to = xmlRegStatePush(am);
6328         if (to == NULL)
6329             goto error;
6330     }
6331     xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6332     if (xmlRegAtomPush(am, atom) < 0)
6333         goto error;
6334     am->state = to;
6335     return(to);
6336 
6337 error:
6338     xmlRegFreeAtom(atom);
6339     return(NULL);
6340 }
6341 
6342 /**
6343  * xmlAutomataNewState:
6344  * @am: an automata
6345  *
6346  * Create a new disconnected state in the automata
6347  *
6348  * Returns the new state or NULL in case of error
6349  */
6350 xmlAutomataStatePtr
xmlAutomataNewState(xmlAutomataPtr am)6351 xmlAutomataNewState(xmlAutomataPtr am) {
6352     if (am == NULL)
6353 	return(NULL);
6354     return(xmlRegStatePush(am));
6355 }
6356 
6357 /**
6358  * xmlAutomataNewEpsilon:
6359  * @am: an automata
6360  * @from: the starting point of the transition
6361  * @to: the target point of the transition or NULL
6362  *
6363  * If @to is NULL, this creates first a new target state in the automata
6364  * and then adds an epsilon transition from the @from state to the
6365  * target state
6366  *
6367  * Returns the target state or NULL in case of error
6368  */
6369 xmlAutomataStatePtr
xmlAutomataNewEpsilon(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to)6370 xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from,
6371 		      xmlAutomataStatePtr to) {
6372     if ((am == NULL) || (from == NULL))
6373 	return(NULL);
6374     xmlFAGenerateEpsilonTransition(am, from, to);
6375     if (to == NULL)
6376 	return(am->state);
6377     return(to);
6378 }
6379 
6380 /**
6381  * xmlAutomataNewAllTrans:
6382  * @am: an automata
6383  * @from: the starting point of the transition
6384  * @to: the target point of the transition or NULL
6385  * @lax: allow to transition if not all all transitions have been activated
6386  *
6387  * If @to is NULL, this creates first a new target state in the automata
6388  * and then adds a an ALL transition from the @from state to the
6389  * target state. That transition is an epsilon transition allowed only when
6390  * all transitions from the @from node have been activated.
6391  *
6392  * Returns the target state or NULL in case of error
6393  */
6394 xmlAutomataStatePtr
xmlAutomataNewAllTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,int lax)6395 xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6396 		       xmlAutomataStatePtr to, int lax) {
6397     if ((am == NULL) || (from == NULL))
6398 	return(NULL);
6399     xmlFAGenerateAllTransition(am, from, to, lax);
6400     if (to == NULL)
6401 	return(am->state);
6402     return(to);
6403 }
6404 
6405 /**
6406  * xmlAutomataNewCounter:
6407  * @am: an automata
6408  * @min:  the minimal value on the counter
6409  * @max:  the maximal value on the counter
6410  *
6411  * Create a new counter
6412  *
6413  * Returns the counter number or -1 in case of error
6414  */
6415 int
xmlAutomataNewCounter(xmlAutomataPtr am,int min,int max)6416 xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) {
6417     int ret;
6418 
6419     if (am == NULL)
6420 	return(-1);
6421 
6422     ret = xmlRegGetCounter(am);
6423     if (ret < 0)
6424 	return(-1);
6425     am->counters[ret].min = min;
6426     am->counters[ret].max = max;
6427     return(ret);
6428 }
6429 
6430 /**
6431  * xmlAutomataNewCountedTrans:
6432  * @am: an automata
6433  * @from: the starting point of the transition
6434  * @to: the target point of the transition or NULL
6435  * @counter: the counter associated to that transition
6436  *
6437  * If @to is NULL, this creates first a new target state in the automata
6438  * and then adds an epsilon transition from the @from state to the target state
6439  * which will increment the counter provided
6440  *
6441  * Returns the target state or NULL in case of error
6442  */
6443 xmlAutomataStatePtr
xmlAutomataNewCountedTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,int counter)6444 xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6445 		xmlAutomataStatePtr to, int counter) {
6446     if ((am == NULL) || (from == NULL) || (counter < 0))
6447 	return(NULL);
6448     xmlFAGenerateCountedEpsilonTransition(am, from, to, counter);
6449     if (to == NULL)
6450 	return(am->state);
6451     return(to);
6452 }
6453 
6454 /**
6455  * xmlAutomataNewCounterTrans:
6456  * @am: an automata
6457  * @from: the starting point of the transition
6458  * @to: the target point of the transition or NULL
6459  * @counter: the counter associated to that transition
6460  *
6461  * If @to is NULL, this creates first a new target state in the automata
6462  * and then adds an epsilon transition from the @from state to the target state
6463  * which will be allowed only if the counter is within the right range.
6464  *
6465  * Returns the target state or NULL in case of error
6466  */
6467 xmlAutomataStatePtr
xmlAutomataNewCounterTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,int counter)6468 xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6469 		xmlAutomataStatePtr to, int counter) {
6470     if ((am == NULL) || (from == NULL) || (counter < 0))
6471 	return(NULL);
6472     xmlFAGenerateCountedTransition(am, from, to, counter);
6473     if (to == NULL)
6474 	return(am->state);
6475     return(to);
6476 }
6477 
6478 /**
6479  * xmlAutomataCompile:
6480  * @am: an automata
6481  *
6482  * Compile the automata into a Reg Exp ready for being executed.
6483  * The automata should be free after this point.
6484  *
6485  * Returns the compiled regexp or NULL in case of error
6486  */
6487 xmlRegexpPtr
xmlAutomataCompile(xmlAutomataPtr am)6488 xmlAutomataCompile(xmlAutomataPtr am) {
6489     xmlRegexpPtr ret;
6490 
6491     if ((am == NULL) || (am->error != 0)) return(NULL);
6492     xmlFAEliminateEpsilonTransitions(am);
6493     /* xmlFAComputesDeterminism(am); */
6494     ret = xmlRegEpxFromParse(am);
6495 
6496     return(ret);
6497 }
6498 
6499 /**
6500  * xmlAutomataIsDeterminist:
6501  * @am: an automata
6502  *
6503  * Checks if an automata is determinist.
6504  *
6505  * Returns 1 if true, 0 if not, and -1 in case of error
6506  */
6507 int
xmlAutomataIsDeterminist(xmlAutomataPtr am)6508 xmlAutomataIsDeterminist(xmlAutomataPtr am) {
6509     int ret;
6510 
6511     if (am == NULL)
6512 	return(-1);
6513 
6514     ret = xmlFAComputesDeterminism(am);
6515     return(ret);
6516 }
6517 #endif /* LIBXML_AUTOMATA_ENABLED */
6518 
6519 #ifdef LIBXML_EXPR_ENABLED
6520 /************************************************************************
6521  *									*
6522  *		Formal Expression handling code				*
6523  *									*
6524  ************************************************************************/
6525 /************************************************************************
6526  *									*
6527  *		Expression handling context				*
6528  *									*
6529  ************************************************************************/
6530 
/*
 * Context shared by all expression operations: it owns the dictionary
 * reference and the hash table of expression nodes.
 */
struct _xmlExpCtxt {
    xmlDictPtr dict;		/* dictionary, created or referenced at creation */
    xmlExpNodePtr *table;	/* array of @size node pointers (hash table) */
    int size;			/* number of slots allocated in @table */
    int nbElems;		/* starts at 0; presumably entries in @table */
    int nb_nodes;		/* bumped by xmlExpNewNode, capped by MAX_NODES */
    int maxNodes;		/* value given at creation, raised to 4096 if lower */
    const char *expr;		/* presumably the expression being parsed - TODO confirm */
    const char *cur;		/* presumably the parsing cursor in @expr - TODO confirm */
    int nb_cons;		/* bumped on every node construction */
    int tabSize;		/* NOTE(review): not used in the code visible here */
};
6543 
6544 /**
6545  * xmlExpNewCtxt:
6546  * @maxNodes:  the maximum number of nodes
6547  * @dict:  optional dictionary to use internally
6548  *
6549  * Creates a new context for manipulating expressions
6550  *
6551  * Returns the context or NULL in case of error
6552  */
6553 xmlExpCtxtPtr
xmlExpNewCtxt(int maxNodes,xmlDictPtr dict)6554 xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) {
6555     xmlExpCtxtPtr ret;
6556     int size = 256;
6557 
6558     if (maxNodes <= 4096)
6559         maxNodes = 4096;
6560 
6561     ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt));
6562     if (ret == NULL)
6563         return(NULL);
6564     memset(ret, 0, sizeof(xmlExpCtxt));
6565     ret->size = size;
6566     ret->nbElems = 0;
6567     ret->maxNodes = maxNodes;
6568     ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr));
6569     if (ret->table == NULL) {
6570         xmlFree(ret);
6571 	return(NULL);
6572     }
6573     memset(ret->table, 0, size * sizeof(xmlExpNodePtr));
6574     if (dict == NULL) {
6575         ret->dict = xmlDictCreate();
6576 	if (ret->dict == NULL) {
6577 	    xmlFree(ret->table);
6578 	    xmlFree(ret);
6579 	    return(NULL);
6580 	}
6581     } else {
6582         ret->dict = dict;
6583 	xmlDictReference(ret->dict);
6584     }
6585     return(ret);
6586 }
6587 
6588 /**
6589  * xmlExpFreeCtxt:
6590  * @ctxt:  an expression context
6591  *
6592  * Free an expression context
6593  */
6594 void
xmlExpFreeCtxt(xmlExpCtxtPtr ctxt)6595 xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) {
6596     if (ctxt == NULL)
6597         return;
6598     xmlDictFree(ctxt->dict);
6599     if (ctxt->table != NULL)
6600 	xmlFree(ctxt->table);
6601     xmlFree(ctxt);
6602 }
6603 
6604 /************************************************************************
6605  *									*
6606  *		Structure associated to an expression node		*
6607  *									*
6608  ************************************************************************/
/* Upper bound on ctxt->nb_nodes enforced by xmlExpNewNode() */
#define MAX_NODES 10000
6610 
6611 /* #define DEBUG_DERIV */
6612 
6613 /*
6614  * TODO:
6615  * - Wildcards
6616  * - public API for creation
6617  *
6618  * Started
6619  * - regression testing
6620  *
6621  * Done
6622  * - split into module and test tool
6623  * - memleaks
6624  */
6625 
/* Bit flags stored in the info field of an expression node */
typedef enum {
    XML_EXP_NILABLE = (1 << 0)
} xmlExpNodeInfo;

/* Non-zero when the XML_EXP_NILABLE bit is set on @node */
#define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE)
6631 
/*
 * An expression node. Which member of the field union is meaningful
 * presumably depends on the node type - TODO confirm against users.
 * The static nodes defined in this file are initialized positionally,
 * so the member order must be kept.
 */
struct _xmlExpNode {
    unsigned char type;/* xmlExpNodeType */
    unsigned char info;/* OR of xmlExpNodeInfo */
    unsigned short key;	/* the hash key */
    unsigned int ref;	/* The number of references */
    int c_max;		/* the maximum length it can consume */
    xmlExpNodePtr exp_left;
    xmlExpNodePtr next;/* the next node in the hash table or free list */
    union {
	struct {
	    int f_min;
	    int f_max;
	} count;
	struct {
	    xmlExpNodePtr f_right;
	} children;
        const xmlChar *f_str;
    } field;
};

/* Shorthands to reach the members of the field union */
#define exp_min field.count.f_min
#define exp_max field.count.f_max
/* #define exp_left field.children.f_left */
#define exp_right field.children.f_right
#define exp_str field.f_str
6657 
static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type);
/* Statically allocated node of type XML_EXP_FORBID, info flags cleared */
static xmlExpNode forbiddenExpNode = {
    XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}}
};
xmlExpNodePtr forbiddenExp = &forbiddenExpNode;
/* Statically allocated node of type XML_EXP_EMPTY; info is 1, i.e. XML_EXP_NILABLE */
static xmlExpNode emptyExpNode = {
    XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}}
};
xmlExpNodePtr emptyExp = &emptyExpNode;
6667 
6668 /************************************************************************
6669  *									*
6670  *  The custom hash table for unicity and canonicalization		*
6671  *  of sub-expressions pointers						*
6672  *									*
6673  ************************************************************************/
6674 /*
6675  * xmlExpHashNameComputeKey:
6676  * Calculate the hash key for a token
6677  */
6678 static unsigned short
xmlExpHashNameComputeKey(const xmlChar * name)6679 xmlExpHashNameComputeKey(const xmlChar *name) {
6680     unsigned short value = 0L;
6681     char ch;
6682 
6683     if (name != NULL) {
6684 	value += 30 * (*name);
6685 	while ((ch = *name++) != 0) {
6686 	    value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch);
6687 	}
6688     }
6689     return (value);
6690 }
6691 
6692 /*
6693  * xmlExpHashComputeKey:
6694  * Calculate the hash key for a compound expression
6695  */
6696 static unsigned short
xmlExpHashComputeKey(xmlExpNodeType type,xmlExpNodePtr left,xmlExpNodePtr right)6697 xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left,
6698                      xmlExpNodePtr right) {
6699     unsigned long value;
6700     unsigned short ret;
6701 
6702     switch (type) {
6703         case XML_EXP_SEQ:
6704 	    value = left->key;
6705 	    value += right->key;
6706 	    value *= 3;
6707 	    ret = (unsigned short) value;
6708 	    break;
6709         case XML_EXP_OR:
6710 	    value = left->key;
6711 	    value += right->key;
6712 	    value *= 7;
6713 	    ret = (unsigned short) value;
6714 	    break;
6715         case XML_EXP_COUNT:
6716 	    value = left->key;
6717 	    value += right->key;
6718 	    ret = (unsigned short) value;
6719 	    break;
6720 	default:
6721 	    ret = 0;
6722     }
6723     return(ret);
6724 }
6725 
6726 
6727 static xmlExpNodePtr
xmlExpNewNode(xmlExpCtxtPtr ctxt,xmlExpNodeType type)6728 xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) {
6729     xmlExpNodePtr ret;
6730 
6731     if (ctxt->nb_nodes >= MAX_NODES)
6732         return(NULL);
6733     ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode));
6734     if (ret == NULL)
6735         return(NULL);
6736     memset(ret, 0, sizeof(xmlExpNode));
6737     ret->type = type;
6738     ret->next = NULL;
6739     ctxt->nb_nodes++;
6740     ctxt->nb_cons++;
6741     return(ret);
6742 }
6743 
6744 /**
6745  * xmlExpHashGetEntry:
6746  * @table: the hash table
6747  *
6748  * Get the unique entry from the hash table. The entry is created if
6749  * needed. @left and @right are consumed, i.e. their ref count will
6750  * be decremented by the operation.
6751  *
6752  * Returns the pointer or NULL in case of error
6753  */
6754 static xmlExpNodePtr
xmlExpHashGetEntry(xmlExpCtxtPtr ctxt,xmlExpNodeType type,xmlExpNodePtr left,xmlExpNodePtr right,const xmlChar * name,int min,int max)6755 xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type,
6756                    xmlExpNodePtr left, xmlExpNodePtr right,
6757 		   const xmlChar *name, int min, int max) {
6758     unsigned short kbase, key;
6759     xmlExpNodePtr entry;
6760     xmlExpNodePtr insert;
6761 
6762     if (ctxt == NULL)
6763 	return(NULL);
6764 
6765     /*
6766      * Check for duplicate and insertion location.
6767      */
6768     if (type == XML_EXP_ATOM) {
6769 	kbase = xmlExpHashNameComputeKey(name);
6770     } else if (type == XML_EXP_COUNT) {
6771         /* COUNT reduction rule 1 */
6772 	/* a{1} -> a */
6773 	if (min == max) {
6774 	    if (min == 1) {
6775 		return(left);
6776 	    }
6777 	    if (min == 0) {
6778 		xmlExpFree(ctxt, left);
6779 	        return(emptyExp);
6780 	    }
6781 	}
6782 	if (min < 0) {
6783 	    xmlExpFree(ctxt, left);
6784 	    return(forbiddenExp);
6785 	}
6786         if (max == -1)
6787 	    kbase = min + 79;
6788 	else
6789 	    kbase = max - min;
6790 	kbase += left->key;
6791     } else if (type == XML_EXP_OR) {
6792         /* Forbid reduction rules */
6793         if (left->type == XML_EXP_FORBID) {
6794 	    xmlExpFree(ctxt, left);
6795 	    return(right);
6796 	}
6797         if (right->type == XML_EXP_FORBID) {
6798 	    xmlExpFree(ctxt, right);
6799 	    return(left);
6800 	}
6801 
6802         /* OR reduction rule 1 */
6803 	/* a | a reduced to a */
6804         if (left == right) {
6805 	    xmlExpFree(ctxt, right);
6806 	    return(left);
6807 	}
6808         /* OR canonicalization rule 1 */
6809 	/* linearize (a | b) | c into a | (b | c) */
6810         if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) {
6811 	    xmlExpNodePtr tmp = left;
6812             left = right;
6813 	    right = tmp;
6814 	}
6815         /* OR reduction rule 2 */
6816 	/* a | (a | b) and b | (a | b) are reduced to a | b */
6817         if (right->type == XML_EXP_OR) {
6818 	    if ((left == right->exp_left) ||
6819 	        (left == right->exp_right)) {
6820 		xmlExpFree(ctxt, left);
6821 		return(right);
6822 	    }
6823 	}
6824         /* OR canonicalization rule 2 */
6825 	/* linearize (a | b) | c into a | (b | c) */
6826         if (left->type == XML_EXP_OR) {
6827 	    xmlExpNodePtr tmp;
6828 
6829 	    /* OR canonicalization rule 2 */
6830 	    if ((left->exp_right->type != XML_EXP_OR) &&
6831 	        (left->exp_right->key < left->exp_left->key)) {
6832 	        tmp = left->exp_right;
6833 		left->exp_right = left->exp_left;
6834 		left->exp_left = tmp;
6835 	    }
6836 	    left->exp_right->ref++;
6837 	    tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right,
6838 	                             NULL, 0, 0);
6839 	    left->exp_left->ref++;
6840 	    tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp,
6841 	                             NULL, 0, 0);
6842 
6843 	    xmlExpFree(ctxt, left);
6844 	    return(tmp);
6845 	}
6846 	if (right->type == XML_EXP_OR) {
6847 	    /* Ordering in the tree */
6848 	    /* C | (A | B) -> A | (B | C) */
6849 	    if (left->key > right->exp_right->key) {
6850 		xmlExpNodePtr tmp;
6851 		right->exp_right->ref++;
6852 		tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right,
6853 		                         left, NULL, 0, 0);
6854 		right->exp_left->ref++;
6855 		tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6856 		                         tmp, NULL, 0, 0);
6857 		xmlExpFree(ctxt, right);
6858 		return(tmp);
6859 	    }
6860 	    /* Ordering in the tree */
6861 	    /* B | (A | C) -> A | (B | C) */
6862 	    if (left->key > right->exp_left->key) {
6863 		xmlExpNodePtr tmp;
6864 		right->exp_right->ref++;
6865 		tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left,
6866 		                         right->exp_right, NULL, 0, 0);
6867 		right->exp_left->ref++;
6868 		tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6869 		                         tmp, NULL, 0, 0);
6870 		xmlExpFree(ctxt, right);
6871 		return(tmp);
6872 	    }
6873 	}
6874 	/* we know both types are != XML_EXP_OR here */
6875         else if (left->key > right->key) {
6876 	    xmlExpNodePtr tmp = left;
6877             left = right;
6878 	    right = tmp;
6879 	}
6880 	kbase = xmlExpHashComputeKey(type, left, right);
6881     } else if (type == XML_EXP_SEQ) {
6882         /* Forbid reduction rules */
6883         if (left->type == XML_EXP_FORBID) {
6884 	    xmlExpFree(ctxt, right);
6885 	    return(left);
6886 	}
6887         if (right->type == XML_EXP_FORBID) {
6888 	    xmlExpFree(ctxt, left);
6889 	    return(right);
6890 	}
6891         /* Empty reduction rules */
6892         if (right->type == XML_EXP_EMPTY) {
6893 	    return(left);
6894 	}
6895         if (left->type == XML_EXP_EMPTY) {
6896 	    return(right);
6897 	}
6898 	kbase = xmlExpHashComputeKey(type, left, right);
6899     } else
6900         return(NULL);
6901 
6902     key = kbase % ctxt->size;
6903     if (ctxt->table[key] != NULL) {
6904 	for (insert = ctxt->table[key]; insert != NULL;
6905 	     insert = insert->next) {
6906 	    if ((insert->key == kbase) &&
6907 	        (insert->type == type)) {
6908 		if (type == XML_EXP_ATOM) {
6909 		    if (name == insert->exp_str) {
6910 			insert->ref++;
6911 			return(insert);
6912 		    }
6913 		} else if (type == XML_EXP_COUNT) {
6914 		    if ((insert->exp_min == min) && (insert->exp_max == max) &&
6915 		        (insert->exp_left == left)) {
6916 			insert->ref++;
6917 			left->ref--;
6918 			return(insert);
6919 		    }
6920 		} else if ((insert->exp_left == left) &&
6921 			   (insert->exp_right == right)) {
6922 		    insert->ref++;
6923 		    left->ref--;
6924 		    right->ref--;
6925 		    return(insert);
6926 		}
6927 	    }
6928 	}
6929     }
6930 
6931     entry = xmlExpNewNode(ctxt, type);
6932     if (entry == NULL)
6933         return(NULL);
6934     entry->key = kbase;
6935     if (type == XML_EXP_ATOM) {
6936 	entry->exp_str = name;
6937 	entry->c_max = 1;
6938     } else if (type == XML_EXP_COUNT) {
6939         entry->exp_min = min;
6940         entry->exp_max = max;
6941 	entry->exp_left = left;
6942 	if ((min == 0) || (IS_NILLABLE(left)))
6943 	    entry->info |= XML_EXP_NILABLE;
6944 	if (max < 0)
6945 	    entry->c_max = -1;
6946 	else
6947 	    entry->c_max = max * entry->exp_left->c_max;
6948     } else {
6949 	entry->exp_left = left;
6950 	entry->exp_right = right;
6951 	if (type == XML_EXP_OR) {
6952 	    if ((IS_NILLABLE(left)) || (IS_NILLABLE(right)))
6953 		entry->info |= XML_EXP_NILABLE;
6954 	    if ((entry->exp_left->c_max == -1) ||
6955 	        (entry->exp_right->c_max == -1))
6956 		entry->c_max = -1;
6957 	    else if (entry->exp_left->c_max > entry->exp_right->c_max)
6958 	        entry->c_max = entry->exp_left->c_max;
6959 	    else
6960 	        entry->c_max = entry->exp_right->c_max;
6961 	} else {
6962 	    if ((IS_NILLABLE(left)) && (IS_NILLABLE(right)))
6963 		entry->info |= XML_EXP_NILABLE;
6964 	    if ((entry->exp_left->c_max == -1) ||
6965 	        (entry->exp_right->c_max == -1))
6966 		entry->c_max = -1;
6967 	    else
6968 	        entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max;
6969 	}
6970     }
6971     entry->ref = 1;
6972     if (ctxt->table[key] != NULL)
6973         entry->next = ctxt->table[key];
6974 
6975     ctxt->table[key] = entry;
6976     ctxt->nbElems++;
6977 
6978     return(entry);
6979 }
6980 
6981 /**
6982  * xmlExpFree:
6983  * @ctxt: the expression context
6984  * @exp: the expression
6985  *
6986  * Dereference the expression
6987  */
6988 void
xmlExpFree(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp)6989 xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) {
6990     if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp))
6991         return;
6992     exp->ref--;
6993     if (exp->ref == 0) {
6994         unsigned short key;
6995 
6996         /* Unlink it first from the hash table */
6997 	key = exp->key % ctxt->size;
6998 	if (ctxt->table[key] == exp) {
6999 	    ctxt->table[key] = exp->next;
7000 	} else {
7001 	    xmlExpNodePtr tmp;
7002 
7003 	    tmp = ctxt->table[key];
7004 	    while (tmp != NULL) {
7005 	        if (tmp->next == exp) {
7006 		    tmp->next = exp->next;
7007 		    break;
7008 		}
7009 	        tmp = tmp->next;
7010 	    }
7011 	}
7012 
7013         if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) {
7014 	    xmlExpFree(ctxt, exp->exp_left);
7015 	    xmlExpFree(ctxt, exp->exp_right);
7016 	} else if (exp->type == XML_EXP_COUNT) {
7017 	    xmlExpFree(ctxt, exp->exp_left);
7018 	}
7019         xmlFree(exp);
7020 	ctxt->nb_nodes--;
7021     }
7022 }
7023 
7024 /**
7025  * xmlExpRef:
7026  * @exp: the expression
7027  *
7028  * Increase the reference count of the expression
7029  */
7030 void
xmlExpRef(xmlExpNodePtr exp)7031 xmlExpRef(xmlExpNodePtr exp) {
7032     if (exp != NULL)
7033         exp->ref++;
7034 }
7035 
7036 /**
7037  * xmlExpNewAtom:
7038  * @ctxt: the expression context
7039  * @name: the atom name
7040  * @len: the atom name length in byte (or -1);
7041  *
7042  * Get the atom associated to this name from that context
7043  *
7044  * Returns the node or NULL in case of error
7045  */
7046 xmlExpNodePtr
xmlExpNewAtom(xmlExpCtxtPtr ctxt,const xmlChar * name,int len)7047 xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) {
7048     if ((ctxt == NULL) || (name == NULL))
7049         return(NULL);
7050     name = xmlDictLookup(ctxt->dict, name, len);
7051     if (name == NULL)
7052         return(NULL);
7053     return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0));
7054 }
7055 
7056 /**
7057  * xmlExpNewOr:
7058  * @ctxt: the expression context
7059  * @left: left expression
7060  * @right: right expression
7061  *
7062  * Get the atom associated to the choice @left | @right
7063  * Note that @left and @right are consumed in the operation, to keep
7064  * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
7065  * this is true even in case of failure (unless ctxt == NULL).
7066  *
7067  * Returns the node or NULL in case of error
7068  */
7069 xmlExpNodePtr
xmlExpNewOr(xmlExpCtxtPtr ctxt,xmlExpNodePtr left,xmlExpNodePtr right)7070 xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
7071     if (ctxt == NULL)
7072         return(NULL);
7073     if ((left == NULL) || (right == NULL)) {
7074         xmlExpFree(ctxt, left);
7075         xmlExpFree(ctxt, right);
7076         return(NULL);
7077     }
7078     return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0));
7079 }
7080 
7081 /**
7082  * xmlExpNewSeq:
7083  * @ctxt: the expression context
7084  * @left: left expression
7085  * @right: right expression
7086  *
7087  * Get the atom associated to the sequence @left , @right
7088  * Note that @left and @right are consumed in the operation, to keep
7089  * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
7090  * this is true even in case of failure (unless ctxt == NULL).
7091  *
7092  * Returns the node or NULL in case of error
7093  */
7094 xmlExpNodePtr
xmlExpNewSeq(xmlExpCtxtPtr ctxt,xmlExpNodePtr left,xmlExpNodePtr right)7095 xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
7096     if (ctxt == NULL)
7097         return(NULL);
7098     if ((left == NULL) || (right == NULL)) {
7099         xmlExpFree(ctxt, left);
7100         xmlExpFree(ctxt, right);
7101         return(NULL);
7102     }
7103     return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0));
7104 }
7105 
7106 /**
7107  * xmlExpNewRange:
7108  * @ctxt: the expression context
7109  * @subset: the expression to be repeated
7110  * @min: the lower bound for the repetition
7111  * @max: the upper bound for the repetition, -1 means infinite
7112  *
7113  * Get the atom associated to the range (@subset){@min, @max}
7114  * Note that @subset is consumed in the operation, to keep
7115  * an handle on it use xmlExpRef() and use xmlExpFree() to release it,
7116  * this is true even in case of failure (unless ctxt == NULL).
7117  *
7118  * Returns the node or NULL in case of error
7119  */
7120 xmlExpNodePtr
xmlExpNewRange(xmlExpCtxtPtr ctxt,xmlExpNodePtr subset,int min,int max)7121 xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) {
7122     if (ctxt == NULL)
7123         return(NULL);
7124     if ((subset == NULL) || (min < 0) || (max < -1) ||
7125         ((max >= 0) && (min > max))) {
7126 	xmlExpFree(ctxt, subset);
7127         return(NULL);
7128     }
7129     return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset,
7130                               NULL, NULL, min, max));
7131 }
7132 
7133 /************************************************************************
7134  *									*
7135  *		Public API for operations on expressions		*
7136  *									*
7137  ************************************************************************/
7138 
7139 static int
xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** list,int len,int nb)7140 xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7141                      const xmlChar**list, int len, int nb) {
7142     int tmp, tmp2;
7143 tail:
7144     switch (exp->type) {
7145         case XML_EXP_EMPTY:
7146 	    return(0);
7147         case XML_EXP_ATOM:
7148 	    for (tmp = 0;tmp < nb;tmp++)
7149 	        if (list[tmp] == exp->exp_str)
7150 		    return(0);
7151             if (nb >= len)
7152 	        return(-2);
7153 	    list[nb] = exp->exp_str;
7154 	    return(1);
7155         case XML_EXP_COUNT:
7156 	    exp = exp->exp_left;
7157 	    goto tail;
7158         case XML_EXP_SEQ:
7159         case XML_EXP_OR:
7160 	    tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb);
7161 	    if (tmp < 0)
7162 	        return(tmp);
7163 	    tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len,
7164 	                                nb + tmp);
7165 	    if (tmp2 < 0)
7166 	        return(tmp2);
7167             return(tmp + tmp2);
7168     }
7169     return(-1);
7170 }
7171 
7172 /**
7173  * xmlExpGetLanguage:
7174  * @ctxt: the expression context
7175  * @exp: the expression
7176  * @langList: where to store the tokens
7177  * @len: the allocated length of @list
7178  *
7179  * Find all the strings used in @exp and store them in @list
7180  *
7181  * Returns the number of unique strings found, -1 in case of errors and
7182  *         -2 if there is more than @len strings
7183  */
7184 int
xmlExpGetLanguage(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** langList,int len)7185 xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7186                   const xmlChar**langList, int len) {
7187     if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0))
7188         return(-1);
7189     return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0));
7190 }
7191 
7192 static int
xmlExpGetStartInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** list,int len,int nb)7193 xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7194                   const xmlChar**list, int len, int nb) {
7195     int tmp, tmp2;
7196 tail:
7197     switch (exp->type) {
7198         case XML_EXP_FORBID:
7199 	    return(0);
7200         case XML_EXP_EMPTY:
7201 	    return(0);
7202         case XML_EXP_ATOM:
7203 	    for (tmp = 0;tmp < nb;tmp++)
7204 	        if (list[tmp] == exp->exp_str)
7205 		    return(0);
7206             if (nb >= len)
7207 	        return(-2);
7208 	    list[nb] = exp->exp_str;
7209 	    return(1);
7210         case XML_EXP_COUNT:
7211 	    exp = exp->exp_left;
7212 	    goto tail;
7213         case XML_EXP_SEQ:
7214 	    tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
7215 	    if (tmp < 0)
7216 	        return(tmp);
7217 	    if (IS_NILLABLE(exp->exp_left)) {
7218 		tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
7219 					    nb + tmp);
7220 		if (tmp2 < 0)
7221 		    return(tmp2);
7222 		tmp += tmp2;
7223 	    }
7224             return(tmp);
7225         case XML_EXP_OR:
7226 	    tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
7227 	    if (tmp < 0)
7228 	        return(tmp);
7229 	    tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
7230 	                                nb + tmp);
7231 	    if (tmp2 < 0)
7232 	        return(tmp2);
7233             return(tmp + tmp2);
7234     }
7235     return(-1);
7236 }
7237 
7238 /**
7239  * xmlExpGetStart:
7240  * @ctxt: the expression context
7241  * @exp: the expression
7242  * @tokList: where to store the tokens
7243  * @len: the allocated length of @list
7244  *
7245  * Find all the strings that appears at the start of the languages
7246  * accepted by @exp and store them in @list. E.g. for (a, b) | c
7247  * it will return the list [a, c]
7248  *
7249  * Returns the number of unique strings found, -1 in case of errors and
7250  *         -2 if there is more than @len strings
7251  */
7252 int
xmlExpGetStart(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** tokList,int len)7253 xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7254                const xmlChar**tokList, int len) {
7255     if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0))
7256         return(-1);
7257     return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0));
7258 }
7259 
7260 /**
7261  * xmlExpIsNillable:
7262  * @exp: the expression
7263  *
7264  * Finds if the expression is nillable, i.e. if it accepts the empty sequence
7265  *
7266  * Returns 1 if nillable, 0 if not and -1 in case of error
7267  */
7268 int
xmlExpIsNillable(xmlExpNodePtr exp)7269 xmlExpIsNillable(xmlExpNodePtr exp) {
7270     if (exp == NULL)
7271         return(-1);
7272     return(IS_NILLABLE(exp) != 0);
7273 }
7274 
7275 static xmlExpNodePtr
xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar * str)7276 xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str)
7277 {
7278     xmlExpNodePtr ret;
7279 
7280     switch (exp->type) {
7281 	case XML_EXP_EMPTY:
7282 	    return(forbiddenExp);
7283 	case XML_EXP_FORBID:
7284 	    return(forbiddenExp);
7285 	case XML_EXP_ATOM:
7286 	    if (exp->exp_str == str) {
7287 #ifdef DEBUG_DERIV
7288 		printf("deriv atom: equal => Empty\n");
7289 #endif
7290 	        ret = emptyExp;
7291 	    } else {
7292 #ifdef DEBUG_DERIV
7293 		printf("deriv atom: mismatch => forbid\n");
7294 #endif
7295 	        /* TODO wildcards here */
7296 		ret = forbiddenExp;
7297 	    }
7298 	    return(ret);
7299 	case XML_EXP_OR: {
7300 	    xmlExpNodePtr tmp;
7301 
7302 #ifdef DEBUG_DERIV
7303 	    printf("deriv or: => or(derivs)\n");
7304 #endif
7305 	    tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7306 	    if (tmp == NULL) {
7307 		return(NULL);
7308 	    }
7309 	    ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7310 	    if (ret == NULL) {
7311 	        xmlExpFree(ctxt, tmp);
7312 		return(NULL);
7313 	    }
7314             ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret,
7315 			     NULL, 0, 0);
7316 	    return(ret);
7317 	}
7318 	case XML_EXP_SEQ:
7319 #ifdef DEBUG_DERIV
7320 	    printf("deriv seq: starting with left\n");
7321 #endif
7322 	    ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7323 	    if (ret == NULL) {
7324 	        return(NULL);
7325 	    } else if (ret == forbiddenExp) {
7326 	        if (IS_NILLABLE(exp->exp_left)) {
7327 #ifdef DEBUG_DERIV
7328 		    printf("deriv seq: left failed but nillable\n");
7329 #endif
7330 		    ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7331 		}
7332 	    } else {
7333 #ifdef DEBUG_DERIV
7334 		printf("deriv seq: left match => sequence\n");
7335 #endif
7336 	        exp->exp_right->ref++;
7337 	        ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right,
7338 		                         NULL, 0, 0);
7339 	    }
7340 	    return(ret);
7341 	case XML_EXP_COUNT: {
7342 	    int min, max;
7343 	    xmlExpNodePtr tmp;
7344 
7345 	    if (exp->exp_max == 0)
7346 		return(forbiddenExp);
7347 	    ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7348 	    if (ret == NULL)
7349 	        return(NULL);
7350 	    if (ret == forbiddenExp) {
7351 #ifdef DEBUG_DERIV
7352 		printf("deriv count: pattern mismatch => forbid\n");
7353 #endif
7354 	        return(ret);
7355 	    }
7356 	    if (exp->exp_max == 1)
7357 		return(ret);
7358 	    if (exp->exp_max < 0) /* unbounded */
7359 		max = -1;
7360 	    else
7361 		max = exp->exp_max - 1;
7362 	    if (exp->exp_min > 0)
7363 		min = exp->exp_min - 1;
7364 	    else
7365 		min = 0;
7366 	    exp->exp_left->ref++;
7367 	    tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL,
7368 				     NULL, min, max);
7369 	    if (ret == emptyExp) {
7370 #ifdef DEBUG_DERIV
7371 		printf("deriv count: match to empty => new count\n");
7372 #endif
7373 	        return(tmp);
7374 	    }
7375 #ifdef DEBUG_DERIV
7376 	    printf("deriv count: match => sequence with new count\n");
7377 #endif
7378 	    return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp,
7379 	                              NULL, 0, 0));
7380 	}
7381     }
7382     return(NULL);
7383 }
7384 
7385 /**
7386  * xmlExpStringDerive:
7387  * @ctxt: the expression context
7388  * @exp: the expression
7389  * @str: the string
7390  * @len: the string len in bytes if available
7391  *
7392  * Do one step of Brzozowski derivation of the expression @exp with
7393  * respect to the input string
7394  *
7395  * Returns the resulting expression or NULL in case of internal error
7396  */
7397 xmlExpNodePtr
xmlExpStringDerive(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar * str,int len)7398 xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7399                    const xmlChar *str, int len) {
7400     const xmlChar *input;
7401 
7402     if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) {
7403         return(NULL);
7404     }
7405     /*
7406      * check the string is in the dictionary, if yes use an interned
7407      * copy, otherwise we know it's not an acceptable input
7408      */
7409     input = xmlDictExists(ctxt->dict, str, len);
7410     if (input == NULL) {
7411         return(forbiddenExp);
7412     }
7413     return(xmlExpStringDeriveInt(ctxt, exp, input));
7414 }
7415 
7416 static int
xmlExpCheckCard(xmlExpNodePtr exp,xmlExpNodePtr sub)7417 xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) {
7418     int ret = 1;
7419 
7420     if (sub->c_max == -1) {
7421         if (exp->c_max != -1)
7422 	    ret = 0;
7423     } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) {
7424         ret = 0;
7425     }
7426 #if 0
7427     if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp)))
7428         ret = 0;
7429 #endif
7430     return(ret);
7431 }
7432 
7433 static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7434                                         xmlExpNodePtr sub);
7435 /**
7436  * xmlExpDivide:
7437  * @ctxt: the expressions context
7438  * @exp: the englobing expression
7439  * @sub: the subexpression
7440  * @mult: the multiple expression
7441  * @remain: the remain from the derivation of the multiple
7442  *
7443  * Check if exp is a multiple of sub, i.e. if there is a finite number n
7444  * so that sub{n} subsume exp
7445  *
7446  * Returns the multiple value if successful, 0 if it is not a multiple
7447  *         and -1 in case of internal error.
7448  */
7449 
7450 static int
xmlExpDivide(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub,xmlExpNodePtr * mult,xmlExpNodePtr * remain)7451 xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub,
7452              xmlExpNodePtr *mult, xmlExpNodePtr *remain) {
7453     int i;
7454     xmlExpNodePtr tmp, tmp2;
7455 
7456     if (mult != NULL) *mult = NULL;
7457     if (remain != NULL) *remain = NULL;
7458     if (exp->c_max == -1) return(0);
7459     if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0);
7460 
7461     for (i = 1;i <= exp->c_max;i++) {
7462         sub->ref++;
7463         tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7464 				 sub, NULL, NULL, i, i);
7465 	if (tmp == NULL) {
7466 	    return(-1);
7467 	}
7468 	if (!xmlExpCheckCard(tmp, exp)) {
7469 	    xmlExpFree(ctxt, tmp);
7470 	    continue;
7471 	}
7472 	tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp);
7473 	if (tmp2 == NULL) {
7474 	    xmlExpFree(ctxt, tmp);
7475 	    return(-1);
7476 	}
7477 	if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) {
7478 	    if (remain != NULL)
7479 	        *remain = tmp2;
7480 	    else
7481 	        xmlExpFree(ctxt, tmp2);
7482 	    if (mult != NULL)
7483 	        *mult = tmp;
7484 	    else
7485 	        xmlExpFree(ctxt, tmp);
7486 #ifdef DEBUG_DERIV
7487 	    printf("Divide succeeded %d\n", i);
7488 #endif
7489 	    return(i);
7490 	}
7491 	xmlExpFree(ctxt, tmp);
7492 	xmlExpFree(ctxt, tmp2);
7493     }
7494 #ifdef DEBUG_DERIV
7495     printf("Divide failed\n");
7496 #endif
7497     return(0);
7498 }
7499 
7500 /**
7501  * xmlExpExpDeriveInt:
7502  * @ctxt: the expressions context
7503  * @exp: the englobing expression
7504  * @sub: the subexpression
7505  *
7506  * Try to do a step of Brzozowski derivation but at a higher level
7507  * the input being a subexpression.
7508  *
7509  * Returns the resulting expression or NULL in case of internal error
7510  */
7511 static xmlExpNodePtr
xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub)7512 xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7513     xmlExpNodePtr ret, tmp, tmp2, tmp3;
7514     const xmlChar **tab;
7515     int len, i;
7516 
7517     /*
7518      * In case of equality and if the expression can only consume a finite
7519      * amount, then the derivation is empty
7520      */
7521     if ((exp == sub) && (exp->c_max >= 0)) {
7522 #ifdef DEBUG_DERIV
7523         printf("Equal(exp, sub) and finite -> Empty\n");
7524 #endif
7525         return(emptyExp);
7526     }
7527     /*
7528      * decompose sub sequence first
7529      */
7530     if (sub->type == XML_EXP_EMPTY) {
7531 #ifdef DEBUG_DERIV
7532         printf("Empty(sub) -> Empty\n");
7533 #endif
7534 	exp->ref++;
7535         return(exp);
7536     }
7537     if (sub->type == XML_EXP_SEQ) {
7538 #ifdef DEBUG_DERIV
7539         printf("Seq(sub) -> decompose\n");
7540 #endif
7541         tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7542 	if (tmp == NULL)
7543 	    return(NULL);
7544 	if (tmp == forbiddenExp)
7545 	    return(tmp);
7546 	ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right);
7547 	xmlExpFree(ctxt, tmp);
7548 	return(ret);
7549     }
7550     if (sub->type == XML_EXP_OR) {
7551 #ifdef DEBUG_DERIV
7552         printf("Or(sub) -> decompose\n");
7553 #endif
7554         tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7555 	if (tmp == forbiddenExp)
7556 	    return(tmp);
7557 	if (tmp == NULL)
7558 	    return(NULL);
7559 	ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right);
7560 	if ((ret == NULL) || (ret == forbiddenExp)) {
7561 	    xmlExpFree(ctxt, tmp);
7562 	    return(ret);
7563 	}
7564 	return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0));
7565     }
7566     if (!xmlExpCheckCard(exp, sub)) {
7567 #ifdef DEBUG_DERIV
7568         printf("CheckCard(exp, sub) failed -> Forbid\n");
7569 #endif
7570         return(forbiddenExp);
7571     }
7572     switch (exp->type) {
7573         case XML_EXP_EMPTY:
7574 	    if (sub == emptyExp)
7575 	        return(emptyExp);
7576 #ifdef DEBUG_DERIV
7577 	    printf("Empty(exp) -> Forbid\n");
7578 #endif
7579 	    return(forbiddenExp);
7580         case XML_EXP_FORBID:
7581 #ifdef DEBUG_DERIV
7582 	    printf("Forbid(exp) -> Forbid\n");
7583 #endif
7584 	    return(forbiddenExp);
7585         case XML_EXP_ATOM:
7586 	    if (sub->type == XML_EXP_ATOM) {
7587 	        /* TODO: handle wildcards */
7588 	        if (exp->exp_str == sub->exp_str) {
7589 #ifdef DEBUG_DERIV
7590 		    printf("Atom match -> Empty\n");
7591 #endif
7592 		    return(emptyExp);
7593                 }
7594 #ifdef DEBUG_DERIV
7595 		printf("Atom mismatch -> Forbid\n");
7596 #endif
7597 	        return(forbiddenExp);
7598 	    }
7599 	    if ((sub->type == XML_EXP_COUNT) &&
7600 	        (sub->exp_max == 1) &&
7601 	        (sub->exp_left->type == XML_EXP_ATOM)) {
7602 	        /* TODO: handle wildcards */
7603 	        if (exp->exp_str == sub->exp_left->exp_str) {
7604 #ifdef DEBUG_DERIV
7605 		    printf("Atom match -> Empty\n");
7606 #endif
7607 		    return(emptyExp);
7608 		}
7609 #ifdef DEBUG_DERIV
7610 		printf("Atom mismatch -> Forbid\n");
7611 #endif
7612 	        return(forbiddenExp);
7613 	    }
7614 #ifdef DEBUG_DERIV
7615 	    printf("Complex exp vs Atom -> Forbid\n");
7616 #endif
7617 	    return(forbiddenExp);
7618         case XML_EXP_SEQ:
7619 	    /* try to get the sequence consumed only if possible */
7620 	    if (xmlExpCheckCard(exp->exp_left, sub)) {
7621 		/* See if the sequence can be consumed directly */
7622 #ifdef DEBUG_DERIV
7623 		printf("Seq trying left only\n");
7624 #endif
7625 		ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7626 		if ((ret != forbiddenExp) && (ret != NULL)) {
7627 #ifdef DEBUG_DERIV
7628 		    printf("Seq trying left only worked\n");
7629 #endif
7630 		    /*
7631 		     * TODO: assumption here that we are determinist
7632 		     *       i.e. we won't get to a nillable exp left
7633 		     *       subset which could be matched by the right
7634 		     *       part too.
7635 		     * e.g.: (a | b)+,(a | c) and 'a+,a'
7636 		     */
7637 		    exp->exp_right->ref++;
7638 		    return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7639 					      exp->exp_right, NULL, 0, 0));
7640 		}
7641 #ifdef DEBUG_DERIV
7642 	    } else {
7643 		printf("Seq: left too short\n");
7644 #endif
7645 	    }
7646 	    /* Try instead to decompose */
7647 	    if (sub->type == XML_EXP_COUNT) {
7648 		int min, max;
7649 
7650 #ifdef DEBUG_DERIV
7651 		printf("Seq: sub is a count\n");
7652 #endif
7653 	        ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7654 		if (ret == NULL)
7655 		    return(NULL);
7656 		if (ret != forbiddenExp) {
7657 #ifdef DEBUG_DERIV
7658 		    printf("Seq , Count match on left\n");
7659 #endif
7660 		    if (sub->exp_max < 0)
7661 		        max = -1;
7662 	            else
7663 		        max = sub->exp_max -1;
7664 		    if (sub->exp_min > 0)
7665 		        min = sub->exp_min -1;
7666 		    else
7667 		        min = 0;
7668 		    exp->exp_right->ref++;
7669 		    tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7670 		                             exp->exp_right, NULL, 0, 0);
7671 		    if (tmp == NULL)
7672 		        return(NULL);
7673 
7674 		    sub->exp_left->ref++;
7675 		    tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7676 				      sub->exp_left, NULL, NULL, min, max);
7677 		    if (tmp2 == NULL) {
7678 		        xmlExpFree(ctxt, tmp);
7679 			return(NULL);
7680 		    }
7681 		    ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7682 		    xmlExpFree(ctxt, tmp);
7683 		    xmlExpFree(ctxt, tmp2);
7684 		    return(ret);
7685 		}
7686 	    }
7687 	    /* we made no progress on structured operations */
7688 	    break;
7689         case XML_EXP_OR:
7690 #ifdef DEBUG_DERIV
7691 	    printf("Or , trying both side\n");
7692 #endif
7693 	    ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7694 	    if (ret == NULL)
7695 	        return(NULL);
7696 	    tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub);
7697 	    if (tmp == NULL) {
7698 		xmlExpFree(ctxt, ret);
7699 	        return(NULL);
7700 	    }
7701 	    return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0));
7702         case XML_EXP_COUNT: {
7703 	    int min, max;
7704 
7705 	    if (sub->type == XML_EXP_COUNT) {
7706 	        /*
7707 		 * Try to see if the loop is completely subsumed
7708 		 */
7709 	        tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7710 		if (tmp == NULL)
7711 		    return(NULL);
7712 		if (tmp == forbiddenExp) {
7713 		    int mult;
7714 
7715 #ifdef DEBUG_DERIV
7716 		    printf("Count, Count inner don't subsume\n");
7717 #endif
7718 		    mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left,
7719 		                        NULL, &tmp);
7720 		    if (mult <= 0) {
7721 #ifdef DEBUG_DERIV
7722 			printf("Count, Count not multiple => forbidden\n");
7723 #endif
7724                         return(forbiddenExp);
7725 		    }
7726 		    if (sub->exp_max == -1) {
7727 		        max = -1;
7728 			if (exp->exp_max == -1) {
7729 			    if (exp->exp_min <= sub->exp_min * mult)
7730 			        min = 0;
7731 			    else
7732 			        min = exp->exp_min - sub->exp_min * mult;
7733 			} else {
7734 #ifdef DEBUG_DERIV
7735 			    printf("Count, Count finite can't subsume infinite\n");
7736 #endif
7737                             xmlExpFree(ctxt, tmp);
7738 			    return(forbiddenExp);
7739 			}
7740 		    } else {
7741 			if (exp->exp_max == -1) {
7742 #ifdef DEBUG_DERIV
7743 			    printf("Infinite loop consume mult finite loop\n");
7744 #endif
7745 			    if (exp->exp_min > sub->exp_min * mult) {
7746 				max = -1;
7747 				min = exp->exp_min - sub->exp_min * mult;
7748 			    } else {
7749 				max = -1;
7750 				min = 0;
7751 			    }
7752 			} else {
7753 			    if (exp->exp_max < sub->exp_max * mult) {
7754 #ifdef DEBUG_DERIV
7755 				printf("loops max mult mismatch => forbidden\n");
7756 #endif
7757 				xmlExpFree(ctxt, tmp);
7758 				return(forbiddenExp);
7759 			    }
7760 			    if (sub->exp_max * mult > exp->exp_min)
7761 				min = 0;
7762 			    else
7763 				min = exp->exp_min - sub->exp_max * mult;
7764 			    max = exp->exp_max - sub->exp_max * mult;
7765 			}
7766 		    }
7767 		} else if (!IS_NILLABLE(tmp)) {
7768 		    /*
7769 		     * TODO: loop here to try to grow if working on finite
7770 		     *       blocks.
7771 		     */
7772 #ifdef DEBUG_DERIV
7773 		    printf("Count, Count remain not nillable => forbidden\n");
7774 #endif
7775 		    xmlExpFree(ctxt, tmp);
7776 		    return(forbiddenExp);
7777 		} else if (sub->exp_max == -1) {
7778 		    if (exp->exp_max == -1) {
7779 		        if (exp->exp_min <= sub->exp_min) {
7780 #ifdef DEBUG_DERIV
7781 			    printf("Infinite loops Okay => COUNT(0,Inf)\n");
7782 #endif
7783                             max = -1;
7784 			    min = 0;
7785 			} else {
7786 #ifdef DEBUG_DERIV
7787 			    printf("Infinite loops min => Count(X,Inf)\n");
7788 #endif
7789                             max = -1;
7790 			    min = exp->exp_min - sub->exp_min;
7791 			}
7792 		    } else if (exp->exp_min > sub->exp_min) {
7793 #ifdef DEBUG_DERIV
7794 			printf("loops min mismatch 1 => forbidden ???\n");
7795 #endif
7796 		        xmlExpFree(ctxt, tmp);
7797 		        return(forbiddenExp);
7798 		    } else {
7799 			max = -1;
7800 			min = 0;
7801 		    }
7802 		} else {
7803 		    if (exp->exp_max == -1) {
7804 #ifdef DEBUG_DERIV
7805 			printf("Infinite loop consume finite loop\n");
7806 #endif
7807 		        if (exp->exp_min > sub->exp_min) {
7808 			    max = -1;
7809 			    min = exp->exp_min - sub->exp_min;
7810 			} else {
7811 			    max = -1;
7812 			    min = 0;
7813 			}
7814 		    } else {
7815 		        if (exp->exp_max < sub->exp_max) {
7816 #ifdef DEBUG_DERIV
7817 			    printf("loops max mismatch => forbidden\n");
7818 #endif
7819 			    xmlExpFree(ctxt, tmp);
7820 			    return(forbiddenExp);
7821 			}
7822 			if (sub->exp_max > exp->exp_min)
7823 			    min = 0;
7824 			else
7825 			    min = exp->exp_min - sub->exp_max;
7826 			max = exp->exp_max - sub->exp_max;
7827 		    }
7828 		}
7829 #ifdef DEBUG_DERIV
7830 		printf("loops match => SEQ(COUNT())\n");
7831 #endif
7832 		exp->exp_left->ref++;
7833 		tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7834 		                          NULL, NULL, min, max);
7835 		if (tmp2 == NULL) {
7836 		    return(NULL);
7837 		}
7838                 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7839 		                         NULL, 0, 0);
7840 		return(ret);
7841 	    }
7842 	    tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7843 	    if (tmp == NULL)
7844 		return(NULL);
7845 	    if (tmp == forbiddenExp) {
7846 #ifdef DEBUG_DERIV
7847 		printf("loop mismatch => forbidden\n");
7848 #endif
7849 		return(forbiddenExp);
7850 	    }
7851 	    if (exp->exp_min > 0)
7852 		min = exp->exp_min - 1;
7853 	    else
7854 		min = 0;
7855 	    if (exp->exp_max < 0)
7856 		max = -1;
7857 	    else
7858 		max = exp->exp_max - 1;
7859 
7860 #ifdef DEBUG_DERIV
7861 	    printf("loop match => SEQ(COUNT())\n");
7862 #endif
7863 	    exp->exp_left->ref++;
7864 	    tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7865 				      NULL, NULL, min, max);
7866 	    if (tmp2 == NULL)
7867 		return(NULL);
7868 	    ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7869 				     NULL, 0, 0);
7870 	    return(ret);
7871 	}
7872     }
7873 
7874 #ifdef DEBUG_DERIV
7875     printf("Fallback to derivative\n");
7876 #endif
7877     if (IS_NILLABLE(sub)) {
7878         if (!(IS_NILLABLE(exp)))
7879 	    return(forbiddenExp);
7880 	else
7881 	    ret = emptyExp;
7882     } else
7883 	ret = NULL;
7884     /*
7885      * here the structured derivation made no progress so
7886      * we use the default token based derivation to force one more step
7887      */
7888     if (ctxt->tabSize == 0)
7889         ctxt->tabSize = 40;
7890 
7891     tab = (const xmlChar **) xmlMalloc(ctxt->tabSize *
7892 	                               sizeof(const xmlChar *));
7893     if (tab == NULL) {
7894 	return(NULL);
7895     }
7896 
7897     /*
7898      * collect all the strings accepted by the subexpression on input
7899      */
7900     len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7901     while (len < 0) {
7902         const xmlChar **temp;
7903 	temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 *
7904 	                                     sizeof(const xmlChar *));
7905 	if (temp == NULL) {
7906 	    xmlFree((xmlChar **) tab);
7907 	    return(NULL);
7908 	}
7909 	tab = temp;
7910 	ctxt->tabSize *= 2;
7911 	len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7912     }
7913     for (i = 0;i < len;i++) {
7914         tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]);
7915 	if ((tmp == NULL) || (tmp == forbiddenExp)) {
7916 	    xmlExpFree(ctxt, ret);
7917 	    xmlFree((xmlChar **) tab);
7918 	    return(tmp);
7919 	}
7920 	tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]);
7921 	if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) {
7922 	    xmlExpFree(ctxt, tmp);
7923 	    xmlExpFree(ctxt, ret);
7924 	    xmlFree((xmlChar **) tab);
7925 	    return(tmp);
7926 	}
7927 	tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7928 	xmlExpFree(ctxt, tmp);
7929 	xmlExpFree(ctxt, tmp2);
7930 
7931 	if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) {
7932 	    xmlExpFree(ctxt, ret);
7933 	    xmlFree((xmlChar **) tab);
7934 	    return(tmp3);
7935 	}
7936 
7937 	if (ret == NULL)
7938 	    ret = tmp3;
7939 	else {
7940 	    ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0);
7941 	    if (ret == NULL) {
7942 		xmlFree((xmlChar **) tab);
7943 	        return(NULL);
7944 	    }
7945 	}
7946     }
7947     xmlFree((xmlChar **) tab);
7948     return(ret);
7949 }
7950 
7951 /**
7952  * xmlExpExpDerive:
7953  * @ctxt: the expressions context
7954  * @exp: the englobing expression
7955  * @sub: the subexpression
7956  *
7957  * Evaluates the expression resulting from @exp consuming a sub expression @sub
7958  * Based on algebraic derivation and sometimes direct Brzozowski derivation
7959  * it usually takes less than linear time and can handle expressions generating
7960  * infinite languages.
7961  *
7962  * Returns the resulting expression or NULL in case of internal error, the
7963  *         result must be freed
7964  */
7965 xmlExpNodePtr
xmlExpExpDerive(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub)7966 xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7967     if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7968         return(NULL);
7969 
7970     /*
7971      * O(1) speedups
7972      */
7973     if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7974 #ifdef DEBUG_DERIV
7975 	printf("Sub nillable and not exp : can't subsume\n");
7976 #endif
7977         return(forbiddenExp);
7978     }
7979     if (xmlExpCheckCard(exp, sub) == 0) {
7980 #ifdef DEBUG_DERIV
7981 	printf("sub generate longer sequences than exp : can't subsume\n");
7982 #endif
7983         return(forbiddenExp);
7984     }
7985     return(xmlExpExpDeriveInt(ctxt, exp, sub));
7986 }
7987 
7988 /**
7989  * xmlExpSubsume:
7990  * @ctxt: the expressions context
7991  * @exp: the englobing expression
7992  * @sub: the subexpression
7993  *
7994  * Check whether @exp accepts all the languages accepted by @sub
7995  * the input being a subexpression.
7996  *
7997  * Returns 1 if true 0 if false and -1 in case of failure.
7998  */
7999 int
xmlExpSubsume(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub)8000 xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
8001     xmlExpNodePtr tmp;
8002 
8003     if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
8004         return(-1);
8005 
8006     /*
8007      * TODO: speedup by checking the language of sub is a subset of the
8008      *       language of exp
8009      */
8010     /*
8011      * O(1) speedups
8012      */
8013     if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
8014 #ifdef DEBUG_DERIV
8015 	printf("Sub nillable and not exp : can't subsume\n");
8016 #endif
8017         return(0);
8018     }
8019     if (xmlExpCheckCard(exp, sub) == 0) {
8020 #ifdef DEBUG_DERIV
8021 	printf("sub generate longer sequences than exp : can't subsume\n");
8022 #endif
8023         return(0);
8024     }
8025     tmp = xmlExpExpDeriveInt(ctxt, exp, sub);
8026 #ifdef DEBUG_DERIV
8027     printf("Result derivation :\n");
8028     PRINT_EXP(tmp);
8029 #endif
8030     if (tmp == NULL)
8031         return(-1);
8032     if (tmp == forbiddenExp)
8033 	return(0);
8034     if (tmp == emptyExp)
8035 	return(1);
8036     if ((tmp != NULL) && (IS_NILLABLE(tmp))) {
8037         xmlExpFree(ctxt, tmp);
8038         return(1);
8039     }
8040     xmlExpFree(ctxt, tmp);
8041     return(0);
8042 }
8043 
8044 /************************************************************************
8045  *									*
8046  *			Parsing expression				*
8047  *									*
8048  ************************************************************************/
8049 
8050 static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt);
8051 
/*
 * Cursor helpers for the expression parser below. Any earlier definitions
 * of these names are dropped first. Apart from IS_BLANK they all expect a
 * local variable named ctxt whose cur member points into the string being
 * parsed.
 *
 * NEXT and SKIP_BLANKS expand to complete statements, trailing ';'
 * included, because the parser invokes them without one.
 */
#undef CUR
#define CUR (*ctxt->cur)
#undef NEXT
#define NEXT ctxt->cur++;
#undef IS_BLANK
/* the argument is parenthesized so that any expression can be passed */
#define IS_BLANK(c) (((c) == ' ') || ((c) == '\n') || ((c) == '\r') || ((c) == '\t'))
#define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++;
8059 
8060 static int
xmlExpParseNumber(xmlExpCtxtPtr ctxt)8061 xmlExpParseNumber(xmlExpCtxtPtr ctxt) {
8062     int ret = 0;
8063 
8064     SKIP_BLANKS
8065     if (CUR == '*') {
8066 	NEXT
8067 	return(-1);
8068     }
8069     if ((CUR < '0') || (CUR > '9'))
8070         return(-1);
8071     while ((CUR >= '0') && (CUR <= '9')) {
8072         ret = ret * 10 + (CUR - '0');
8073 	NEXT
8074     }
8075     return(ret);
8076 }
8077 
8078 static xmlExpNodePtr
xmlExpParseOr(xmlExpCtxtPtr ctxt)8079 xmlExpParseOr(xmlExpCtxtPtr ctxt) {
8080     const char *base;
8081     xmlExpNodePtr ret;
8082     const xmlChar *val;
8083 
8084     SKIP_BLANKS
8085     base = ctxt->cur;
8086     if (*ctxt->cur == '(') {
8087         NEXT
8088 	ret = xmlExpParseExpr(ctxt);
8089 	SKIP_BLANKS
8090 	if (*ctxt->cur != ')') {
8091 	    fprintf(stderr, "unbalanced '(' : %s\n", base);
8092 	    xmlExpFree(ctxt, ret);
8093 	    return(NULL);
8094 	}
8095 	NEXT;
8096 	SKIP_BLANKS
8097 	goto parse_quantifier;
8098     }
8099     while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') &&
8100            (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') &&
8101 	   (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}'))
8102 	NEXT;
8103     val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base);
8104     if (val == NULL)
8105         return(NULL);
8106     ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0);
8107     if (ret == NULL)
8108         return(NULL);
8109     SKIP_BLANKS
8110 parse_quantifier:
8111     if (CUR == '{') {
8112         int min, max;
8113 
8114         NEXT
8115 	min = xmlExpParseNumber(ctxt);
8116 	if (min < 0) {
8117 	    xmlExpFree(ctxt, ret);
8118 	    return(NULL);
8119 	}
8120 	SKIP_BLANKS
8121 	if (CUR == ',') {
8122 	    NEXT
8123 	    max = xmlExpParseNumber(ctxt);
8124 	    SKIP_BLANKS
8125 	} else
8126 	    max = min;
8127 	if (CUR != '}') {
8128 	    xmlExpFree(ctxt, ret);
8129 	    return(NULL);
8130 	}
8131         NEXT
8132 	ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
8133 	                         min, max);
8134 	SKIP_BLANKS
8135     } else if (CUR == '?') {
8136         NEXT
8137 	ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
8138 	                         0, 1);
8139 	SKIP_BLANKS
8140     } else if (CUR == '+') {
8141         NEXT
8142 	ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
8143 	                         1, -1);
8144 	SKIP_BLANKS
8145     } else if (CUR == '*') {
8146         NEXT
8147 	ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
8148 	                         0, -1);
8149 	SKIP_BLANKS
8150     }
8151     return(ret);
8152 }
8153 
8154 
8155 static xmlExpNodePtr
xmlExpParseSeq(xmlExpCtxtPtr ctxt)8156 xmlExpParseSeq(xmlExpCtxtPtr ctxt) {
8157     xmlExpNodePtr ret, right;
8158 
8159     ret = xmlExpParseOr(ctxt);
8160     SKIP_BLANKS
8161     while (CUR == '|') {
8162         NEXT
8163 	right = xmlExpParseOr(ctxt);
8164 	if (right == NULL) {
8165 	    xmlExpFree(ctxt, ret);
8166 	    return(NULL);
8167 	}
8168 	ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0);
8169 	if (ret == NULL)
8170 	    return(NULL);
8171     }
8172     return(ret);
8173 }
8174 
8175 static xmlExpNodePtr
xmlExpParseExpr(xmlExpCtxtPtr ctxt)8176 xmlExpParseExpr(xmlExpCtxtPtr ctxt) {
8177     xmlExpNodePtr ret, right;
8178 
8179     ret = xmlExpParseSeq(ctxt);
8180     SKIP_BLANKS
8181     while (CUR == ',') {
8182         NEXT
8183 	right = xmlExpParseSeq(ctxt);
8184 	if (right == NULL) {
8185 	    xmlExpFree(ctxt, ret);
8186 	    return(NULL);
8187 	}
8188 	ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0);
8189 	if (ret == NULL)
8190 	    return(NULL);
8191     }
8192     return(ret);
8193 }
8194 
8195 /**
8196  * xmlExpParse:
8197  * @ctxt: the expressions context
8198  * @expr: the 0 terminated string
8199  *
8200  * Minimal parser for regexps, it understand the following constructs
8201  *  - string terminals
8202  *  - choice operator |
8203  *  - sequence operator ,
8204  *  - subexpressions (...)
8205  *  - usual cardinality operators + * and ?
8206  *  - finite sequences  { min, max }
8207  *  - infinite sequences { min, * }
8208  * There is minimal checkings made especially no checking on strings values
8209  *
8210  * Returns a new expression or NULL in case of failure
8211  */
8212 xmlExpNodePtr
xmlExpParse(xmlExpCtxtPtr ctxt,const char * expr)8213 xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) {
8214     xmlExpNodePtr ret;
8215 
8216     ctxt->expr = expr;
8217     ctxt->cur = expr;
8218 
8219     ret = xmlExpParseExpr(ctxt);
8220     SKIP_BLANKS
8221     if (*ctxt->cur != 0) {
8222         xmlExpFree(ctxt, ret);
8223         return(NULL);
8224     }
8225     return(ret);
8226 }
8227 
8228 static void
xmlExpDumpInt(xmlBufferPtr buf,xmlExpNodePtr expr,int glob)8229 xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) {
8230     xmlExpNodePtr c;
8231 
8232     if (expr == NULL) return;
8233     if (glob) xmlBufferWriteChar(buf, "(");
8234     switch (expr->type) {
8235         case XML_EXP_EMPTY:
8236 	    xmlBufferWriteChar(buf, "empty");
8237 	    break;
8238         case XML_EXP_FORBID:
8239 	    xmlBufferWriteChar(buf, "forbidden");
8240 	    break;
8241         case XML_EXP_ATOM:
8242 	    xmlBufferWriteCHAR(buf, expr->exp_str);
8243 	    break;
8244         case XML_EXP_SEQ:
8245 	    c = expr->exp_left;
8246 	    if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8247 	        xmlExpDumpInt(buf, c, 1);
8248 	    else
8249 	        xmlExpDumpInt(buf, c, 0);
8250 	    xmlBufferWriteChar(buf, " , ");
8251 	    c = expr->exp_right;
8252 	    if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8253 	        xmlExpDumpInt(buf, c, 1);
8254 	    else
8255 	        xmlExpDumpInt(buf, c, 0);
8256             break;
8257         case XML_EXP_OR:
8258 	    c = expr->exp_left;
8259 	    if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8260 	        xmlExpDumpInt(buf, c, 1);
8261 	    else
8262 	        xmlExpDumpInt(buf, c, 0);
8263 	    xmlBufferWriteChar(buf, " | ");
8264 	    c = expr->exp_right;
8265 	    if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8266 	        xmlExpDumpInt(buf, c, 1);
8267 	    else
8268 	        xmlExpDumpInt(buf, c, 0);
8269             break;
8270         case XML_EXP_COUNT: {
8271 	    char rep[40];
8272 
8273 	    c = expr->exp_left;
8274 	    if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8275 	        xmlExpDumpInt(buf, c, 1);
8276 	    else
8277 	        xmlExpDumpInt(buf, c, 0);
8278 	    if ((expr->exp_min == 0) && (expr->exp_max == 1)) {
8279 		rep[0] = '?';
8280 		rep[1] = 0;
8281 	    } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) {
8282 		rep[0] = '*';
8283 		rep[1] = 0;
8284 	    } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) {
8285 		rep[0] = '+';
8286 		rep[1] = 0;
8287 	    } else if (expr->exp_max == expr->exp_min) {
8288 	        snprintf(rep, 39, "{%d}", expr->exp_min);
8289 	    } else if (expr->exp_max < 0) {
8290 	        snprintf(rep, 39, "{%d,inf}", expr->exp_min);
8291 	    } else {
8292 	        snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max);
8293 	    }
8294 	    rep[39] = 0;
8295 	    xmlBufferWriteChar(buf, rep);
8296 	    break;
8297 	}
8298 	default:
8299 	    fprintf(stderr, "Error in tree\n");
8300     }
8301     if (glob)
8302         xmlBufferWriteChar(buf, ")");
8303 }
8304 /**
8305  * xmlExpDump:
8306  * @buf:  a buffer to receive the output
8307  * @expr:  the compiled expression
8308  *
8309  * Serialize the expression as compiled to the buffer
8310  */
8311 void
xmlExpDump(xmlBufferPtr buf,xmlExpNodePtr expr)8312 xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) {
8313     if ((buf == NULL) || (expr == NULL))
8314         return;
8315     xmlExpDumpInt(buf, expr, 0);
8316 }
8317 
8318 /**
8319  * xmlExpMaxToken:
8320  * @expr: a compiled expression
8321  *
8322  * Indicate the maximum number of input a expression can accept
8323  *
8324  * Returns the maximum length or -1 in case of error
8325  */
8326 int
xmlExpMaxToken(xmlExpNodePtr expr)8327 xmlExpMaxToken(xmlExpNodePtr expr) {
8328     if (expr == NULL)
8329         return(-1);
8330     return(expr->c_max);
8331 }
8332 
8333 /**
8334  * xmlExpCtxtNbNodes:
8335  * @ctxt: an expression context
8336  *
8337  * Debugging facility provides the number of allocated nodes at a that point
8338  *
8339  * Returns the number of nodes in use or -1 in case of error
8340  */
8341 int
xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt)8342 xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) {
8343     if (ctxt == NULL)
8344         return(-1);
8345     return(ctxt->nb_nodes);
8346 }
8347 
8348 /**
8349  * xmlExpCtxtNbCons:
8350  * @ctxt: an expression context
8351  *
8352  * Debugging facility provides the number of allocated nodes over lifetime
8353  *
8354  * Returns the number of nodes ever allocated or -1 in case of error
8355  */
8356 int
xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt)8357 xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) {
8358     if (ctxt == NULL)
8359         return(-1);
8360     return(ctxt->nb_cons);
8361 }
8362 
8363 #endif /* LIBXML_EXPR_ENABLED */
8364 
8365 #endif /* LIBXML_REGEXP_ENABLED */
8366