• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * regexp.c: generic and extensible Regular Expression engine
3  *
4  * Basically designed with the purpose of compiling regexps for
5  * the variety of validation/schemas mechanisms now available in
6  * XML related specifications these include:
7  *    - XML-1.0 DTD validation
8  *    - XML Schemas structure part 1
9  *    - XML Schemas Datatypes part 2 especially Appendix F
10  *    - RELAX-NG/TREX i.e. the counter proposal
11  *
12  * See Copyright for the status of this software.
13  *
14  * Daniel Veillard <veillard@redhat.com>
15  */
16 
17 #define IN_LIBXML
18 #include "libxml.h"
19 
20 #ifdef LIBXML_REGEXP_ENABLED
21 
22 /* #define DEBUG_ERR */
23 
24 #include <stdio.h>
25 #include <string.h>
26 #ifdef HAVE_LIMITS_H
27 #include <limits.h>
28 #endif
29 #ifdef HAVE_STDINT_H
30 #include <stdint.h>
31 #endif
32 
33 #include <libxml/tree.h>
34 #include <libxml/parserInternals.h>
35 #include <libxml/xmlregexp.h>
36 #include <libxml/xmlautomata.h>
37 #include <libxml/xmlunicode.h>
38 
39 #ifndef INT_MAX
40 #define INT_MAX 123456789 /* easy to flag and big enough for our needs */
41 #endif
42 #ifndef SIZE_MAX
43 #define SIZE_MAX ((size_t) -1)
44 #endif
45 
46 /* #define DEBUG_REGEXP_GRAPH */
47 /* #define DEBUG_REGEXP_EXEC */
48 /* #define DEBUG_PUSH */
49 /* #define DEBUG_COMPACTION */
50 
51 #define MAX_PUSH 10000000
52 
53 #ifdef ERROR
54 #undef ERROR
55 #endif
56 #define ERROR(str)							\
57     ctxt->error = XML_REGEXP_COMPILE_ERROR;				\
58     xmlRegexpErrCompile(ctxt, str);
59 #define NEXT ctxt->cur++
60 #define CUR (*(ctxt->cur))
61 #define NXT(index) (ctxt->cur[index])
62 
63 #define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
64 #define NEXTL(l) ctxt->cur += l;
65 #define XML_REG_STRING_SEPARATOR '|'
66 /*
67  * Need PREV to check on a '-' within a Character Group. May only be used
68  * when it's guaranteed that cur is not at the beginning of ctxt->string!
69  */
70 #define PREV (ctxt->cur[-1])
71 
72 /**
73  * TODO:
74  *
75  * macro to flag unimplemented blocks
76  */
77 #define TODO								\
78     xmlGenericError(xmlGenericErrorContext,				\
79 	    "Unimplemented block at %s:%d\n",				\
80             __FILE__, __LINE__);
81 
82 /************************************************************************
83  *									*
84  *			Datatypes and structures			*
85  *									*
86  ************************************************************************/
87 
88 /*
89  * Note: the order of the enums below is significant, do not shuffle
90  */
91 typedef enum {
92     XML_REGEXP_EPSILON = 1,
93     XML_REGEXP_CHARVAL,
94     XML_REGEXP_RANGES,
95     XML_REGEXP_SUBREG,  /* used for () sub regexps */
96     XML_REGEXP_STRING,
97     XML_REGEXP_ANYCHAR, /* . */
98     XML_REGEXP_ANYSPACE, /* \s */
99     XML_REGEXP_NOTSPACE, /* \S */
100     XML_REGEXP_INITNAME, /* \l */
101     XML_REGEXP_NOTINITNAME, /* \L */
102     XML_REGEXP_NAMECHAR, /* \c */
103     XML_REGEXP_NOTNAMECHAR, /* \C */
104     XML_REGEXP_DECIMAL, /* \d */
105     XML_REGEXP_NOTDECIMAL, /* \D */
106     XML_REGEXP_REALCHAR, /* \w */
107     XML_REGEXP_NOTREALCHAR, /* \W */
108     XML_REGEXP_LETTER = 100,
109     XML_REGEXP_LETTER_UPPERCASE,
110     XML_REGEXP_LETTER_LOWERCASE,
111     XML_REGEXP_LETTER_TITLECASE,
112     XML_REGEXP_LETTER_MODIFIER,
113     XML_REGEXP_LETTER_OTHERS,
114     XML_REGEXP_MARK,
115     XML_REGEXP_MARK_NONSPACING,
116     XML_REGEXP_MARK_SPACECOMBINING,
117     XML_REGEXP_MARK_ENCLOSING,
118     XML_REGEXP_NUMBER,
119     XML_REGEXP_NUMBER_DECIMAL,
120     XML_REGEXP_NUMBER_LETTER,
121     XML_REGEXP_NUMBER_OTHERS,
122     XML_REGEXP_PUNCT,
123     XML_REGEXP_PUNCT_CONNECTOR,
124     XML_REGEXP_PUNCT_DASH,
125     XML_REGEXP_PUNCT_OPEN,
126     XML_REGEXP_PUNCT_CLOSE,
127     XML_REGEXP_PUNCT_INITQUOTE,
128     XML_REGEXP_PUNCT_FINQUOTE,
129     XML_REGEXP_PUNCT_OTHERS,
130     XML_REGEXP_SEPAR,
131     XML_REGEXP_SEPAR_SPACE,
132     XML_REGEXP_SEPAR_LINE,
133     XML_REGEXP_SEPAR_PARA,
134     XML_REGEXP_SYMBOL,
135     XML_REGEXP_SYMBOL_MATH,
136     XML_REGEXP_SYMBOL_CURRENCY,
137     XML_REGEXP_SYMBOL_MODIFIER,
138     XML_REGEXP_SYMBOL_OTHERS,
139     XML_REGEXP_OTHER,
140     XML_REGEXP_OTHER_CONTROL,
141     XML_REGEXP_OTHER_FORMAT,
142     XML_REGEXP_OTHER_PRIVATE,
143     XML_REGEXP_OTHER_NA,
144     XML_REGEXP_BLOCK_NAME
145 } xmlRegAtomType;
146 
/*
 * Quantifier kinds. The value is stored in the quant field of an
 * xmlRegAtom; xmlRegNewAtom() creates atoms with XML_REGEXP_QUANT_ONCE.
 */
147 typedef enum {
148     XML_REGEXP_QUANT_EPSILON = 1,
149     XML_REGEXP_QUANT_ONCE,
150     XML_REGEXP_QUANT_OPT,
151     XML_REGEXP_QUANT_MULT,
152     XML_REGEXP_QUANT_PLUS,
153     XML_REGEXP_QUANT_ONCEONLY,
154     XML_REGEXP_QUANT_ALL,
155     XML_REGEXP_QUANT_RANGE
156 } xmlRegQuantType;
157 
/*
 * State kinds. The value is stored in the type field of a state;
 * xmlRegNewState() creates states as XML_REGEXP_TRANS_STATE and
 * xmlRegEpxFromParse() copies the value into the first column of the
 * compact transition table.
 */
158 typedef enum {
159     XML_REGEXP_START_STATE = 1,
160     XML_REGEXP_FINAL_STATE,
161     XML_REGEXP_TRANS_STATE,
162     XML_REGEXP_SINK_STATE,
163     XML_REGEXP_UNREACH_STATE
164 } xmlRegStateType;
165 
/*
 * Marker values used by the mark, markd and reached fields of a state.
 * A freshly allocated state starts as XML_REGEXP_MARK_NORMAL.
 */
166 typedef enum {
167     XML_REGEXP_MARK_NORMAL = 0,
168     XML_REGEXP_MARK_START,
169     XML_REGEXP_MARK_VISITED
170 } xmlRegMarkedType;
171 
172 typedef struct _xmlRegRange xmlRegRange;
173 typedef xmlRegRange *xmlRegRangePtr;
174 
/*
 * One entry of an atom's range list (see the ranges field of
 * struct _xmlRegAtom). Created by xmlRegNewRange() and released by
 * xmlRegFreeRange(), which also frees blockName when it is not NULL.
 */
175 struct _xmlRegRange {
176     int neg;		/* 0 normal, 1 not, 2 exclude */
177     xmlRegAtomType type;
    /* start and end codepoints of the range */
178     int start;
179     int end;
    /* string owned by the range, may be NULL */
180     xmlChar *blockName;
181 };
182 
183 typedef struct _xmlRegAtom xmlRegAtom;
184 typedef xmlRegAtom *xmlRegAtomPtr;
185 
186 typedef struct _xmlAutomataState xmlRegState;
187 typedef xmlRegState *xmlRegStatePtr;
188 
/*
 * An atom of the expression. Atoms are allocated by xmlRegNewAtom() or
 * xmlRegCopyAtom() and released by xmlRegFreeAtom().
 */
189 struct _xmlRegAtom {
    /* number of the atom; xmlRegEpxFromParse() uses it as an index
     * into its per-atom remapping table */
190     int no;
191     xmlRegAtomType type;
    /* quantifier, XML_REGEXP_QUANT_ONCE for a new atom */
192     xmlRegQuantType quant;
193     int min;
194     int max;
195 
    /* valuep is freed with the atom for XML_REGEXP_STRING and
     * XML_REGEXP_BLOCK_NAME atoms, valuep2 for XML_REGEXP_STRING atoms;
     * for string atoms valuep is the string compared during compaction */
196     void *valuep;
197     void *valuep2;
198     int neg;
199     int codepoint;
200     xmlRegStatePtr start;
201     xmlRegStatePtr start0;
202     xmlRegStatePtr stop;
203     int maxRanges;
    /* nbRanges entries of ranges are valid and owned by the atom */
204     int nbRanges;
205     xmlRegRangePtr *ranges;
    /* opaque pointer, copied into the transdata table of the compact
     * form by xmlRegEpxFromParse() */
206     void *data;
207 };
208 
209 typedef struct _xmlRegCounter xmlRegCounter;
210 typedef xmlRegCounter *xmlRegCounterPtr;
211 
212 struct _xmlRegCounter {
213     int min;
214     int max;
215 };
216 
217 typedef struct _xmlRegTrans xmlRegTrans;
218 typedef xmlRegTrans *xmlRegTransPtr;
219 
/*
 * A transition leaving a state, stored in the trans array of
 * struct _xmlAutomataState.
 */
220 struct _xmlRegTrans {
    /* atom triggering the transition; transitions without an atom are
     * skipped when the compact table is built */
221     xmlRegAtomPtr atom;
    /* index of the target state in the states array; transitions with
     * -1 are skipped when the compact table is built */
222     int to;
223     int counter;
224     int count;
225     int nd;
226 };
227 
/*
 * A state of the automata. Allocated zero-filled by xmlRegNewState()
 * and released by xmlRegFreeState(), which also frees the trans and
 * transTo arrays.
 */
228 struct _xmlAutomataState {
229     xmlRegStateType type;
230     xmlRegMarkedType mark;
231     xmlRegMarkedType markd;
232     xmlRegMarkedType reached;
233     int no;
    /* outgoing transitions: nbTrans entries of trans are in use */
234     int maxTrans;
235     int nbTrans;
236     xmlRegTrans *trans;
237     /*  knowing states pointing to us can speed things up */
238     int maxTransTo;
239     int nbTransTo;
240     int *transTo;
241 };
242 
243 typedef struct _xmlAutomata xmlRegParserCtxt;
244 typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;
245 
246 #define AM_AUTOMATA_RNG 1
247 
/*
 * Parser context, also used as the automata under construction.
 * Created by xmlRegNewParserCtxt() and released by
 * xmlRegFreeParserCtxt(). On success xmlRegEpxFromParse() moves the
 * string, states, atoms and counters into the resulting xmlRegexp and
 * resets the corresponding fields here.
 */
248 struct _xmlAutomata {
    /* private copy of the expression and current position inside it */
249     xmlChar *string;
250     xmlChar *cur;
251 
    /* error code, set by xmlRegexpErrMemory() / xmlRegexpErrCompile() */
252     int error;
253     int neg;
254 
255     xmlRegStatePtr start;
256     xmlRegStatePtr end;
257     xmlRegStatePtr state;
258 
259     xmlRegAtomPtr atom;
260 
    /* nbAtoms entries of atoms are in use */
261     int maxAtoms;
262     int nbAtoms;
263     xmlRegAtomPtr *atoms;
264 
    /* nbStates entries of states are in use, entries may be NULL */
265     int maxStates;
266     int nbStates;
267     xmlRegStatePtr *states;
268 
269     int maxCounters;
270     int nbCounters;
271     xmlRegCounter *counters;
272 
    /* initialised to -1 by xmlRegNewParserCtxt() */
273     int determinist;
    /* the compact form is only attempted when negs is 0 */
274     int negs;
275     int flags;
276 
277     int depth;
278 };
279 
/*
 * A compiled regular expression as produced by xmlRegEpxFromParse().
 * Either the general form (states, atoms, counters) or the compact
 * form below is populated; when the compact form is built the states
 * and atoms are freed and their counts reset to 0.
 */
280 struct _xmlRegexp {
281     xmlChar *string;
282     int nbStates;
283     xmlRegStatePtr *states;
284     int nbAtoms;
285     xmlRegAtomPtr *atoms;
286     int nbCounters;
287     xmlRegCounter *counters;
288     int determinist;
289     int flags;
290     /*
291      * Compact form used for deterministic automata made only of
     * string atoms:
     *  - compact holds one row of (nbstrings + 1) ints per state; the
     *    first entry of a row is the state type, entry k + 1 is the
     *    target state number plus one for string k, 0 meaning that
     *    there is no transition
     *  - transdata, when not NULL, is an nbstates x nbstrings table of
     *    the data pointers of the atoms
     *  - stringMap holds the nbstrings distinct strings
292      */
293     int nbstates;
294     int *compact;
295     void **transdata;
296     int nbstrings;
297     xmlChar **stringMap;
298 };
299 
300 typedef struct _xmlRegExecRollback xmlRegExecRollback;
301 typedef xmlRegExecRollback *xmlRegExecRollbackPtr;
302 
303 struct _xmlRegExecRollback {
304     xmlRegStatePtr state;/* the current state */
305     int index;		/* the index in the input stack */
306     int nextbranch;	/* the next transition to explore in that state */
307     int *counts;	/* save the automata state if it has some */
308 };
309 
310 typedef struct _xmlRegInputToken xmlRegInputToken;
311 typedef xmlRegInputToken *xmlRegInputTokenPtr;
312 
313 struct _xmlRegInputToken {
314     xmlChar *value;
315     void *data;
316 };
317 
318 struct _xmlRegExecCtxt {
319     int status;		/* execution status != 0 indicate an error */
320     int determinist;	/* did we find an indeterministic behaviour */
321     xmlRegexpPtr comp;	/* the compiled regexp */
322     xmlRegExecCallbacks callback;
323     void *data;
324 
325     xmlRegStatePtr state;/* the current state */
326     int transno;	/* the current transition on that state */
327     int transcount;	/* the number of chars in char counted transitions */
328 
329     /*
330      * A stack of rollback states
331      */
332     int maxRollbacks;
333     int nbRollbacks;
334     xmlRegExecRollback *rollbacks;
335 
336     /*
337      * The state of the automata if any
338      */
339     int *counts;
340 
341     /*
342      * The input stack
343      */
344     int inputStackMax;
345     int inputStackNr;
346     int index;
347     int *charStack;
348     const xmlChar *inputString; /* when operating on characters */
349     xmlRegInputTokenPtr inputStack;/* when operating on strings */
350 
351     /*
352      * error handling
353      */
354     int errStateNo;		/* the error state number */
355     xmlRegStatePtr errState;    /* the error state */
356     xmlChar *errString;		/* the string raising the error */
357     int *errCounts;		/* counters at the error state */
358     int nbPush;
359 };
360 
361 #define REGEXP_ALL_COUNTER	0x123456
362 #define REGEXP_ALL_LAX_COUNTER	0x123457
363 
364 static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
365 static void xmlRegFreeState(xmlRegStatePtr state);
366 static void xmlRegFreeAtom(xmlRegAtomPtr atom);
367 static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr);
368 static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint);
369 static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint,
370                   int neg, int start, int end, const xmlChar *blockName);
371 
372 void xmlAutomataSetFlags(xmlAutomataPtr am, int flags);
373 
374 /************************************************************************
375  *									*
376  *		Regexp memory error handler				*
377  *									*
378  ************************************************************************/
379 /**
380  * xmlRegexpErrMemory:
381  * @extra:  extra information
382  *
383  * Handle an out of memory condition
384  */
385 static void
xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt,const char * extra)386 xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra)
387 {
388     const char *regexp = NULL;
389     if (ctxt != NULL) {
390         regexp = (const char *) ctxt->string;
391 	ctxt->error = XML_ERR_NO_MEMORY;
392     }
393     __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
394 		    XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
395 		    regexp, NULL, 0, 0,
396 		    "Memory allocation failed : %s\n", extra);
397 }
398 
399 /**
400  * xmlRegexpErrCompile:
401  * @extra:  extra information
402  *
403  * Handle a compilation failure
404  */
405 static void
xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt,const char * extra)406 xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra)
407 {
408     const char *regexp = NULL;
409     int idx = 0;
410 
411     if (ctxt != NULL) {
412         regexp = (const char *) ctxt->string;
413 	idx = ctxt->cur - ctxt->string;
414 	ctxt->error = XML_REGEXP_COMPILE_ERROR;
415     }
416     __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
417 		    XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra,
418 		    regexp, NULL, idx, 0,
419 		    "failed to compile: %s\n", extra);
420 }
421 
422 /************************************************************************
423  *									*
424  *			Allocation/Deallocation				*
425  *									*
426  ************************************************************************/
427 
428 static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
429 
/**
 * xmlRegCalloc2:
 * @dim1:  size of first dimension
 * @dim2:  size of second dimension
 * @elemSize:  size of element
 *
 * Allocate a two-dimensional array and set all elements to zero.
 *
 * Returns the new array or NULL in case of error, which includes a
 * zero @dim2 or @elemSize and a total size not fitting in a size_t.
 */
static void*
xmlRegCalloc2(size_t dim1, size_t dim2, size_t elemSize) {
    size_t totalSize;
    void *ret;

    /*
     * Check for overflow. The zero tests come first since the overflow
     * test divides by both values.
     */
    if ((dim2 == 0) || (elemSize == 0) ||
        (dim1 > SIZE_MAX / dim2 / elemSize))
        return (NULL);
    totalSize = dim1 * dim2 * elemSize;
    ret = xmlMalloc(totalSize);
    if (ret != NULL)
        memset(ret, 0, totalSize);
    return (ret);
}
454 
455 /**
456  * xmlRegEpxFromParse:
457  * @ctxt:  the parser context used to build it
458  *
459  * Allocate a new regexp and fill it with the result from the parser
460  *
461  * Returns the new regexp or NULL in case of error
462  */
463 static xmlRegexpPtr
xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt)464 xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
465     xmlRegexpPtr ret;
466 
467     ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
468     if (ret == NULL) {
469 	xmlRegexpErrMemory(ctxt, "compiling regexp");
470 	return(NULL);
471     }
472     memset(ret, 0, sizeof(xmlRegexp));
473     ret->string = ctxt->string;
474     ret->nbStates = ctxt->nbStates;
475     ret->states = ctxt->states;
476     ret->nbAtoms = ctxt->nbAtoms;
477     ret->atoms = ctxt->atoms;
478     ret->nbCounters = ctxt->nbCounters;
479     ret->counters = ctxt->counters;
480     ret->determinist = ctxt->determinist;
481     ret->flags = ctxt->flags;
482     if (ret->determinist == -1) {
483         xmlRegexpIsDeterminist(ret);
484     }
485 
486     if ((ret->determinist != 0) &&
487 	(ret->nbCounters == 0) &&
488 	(ctxt->negs == 0) &&
489 	(ret->atoms != NULL) &&
490 	(ret->atoms[0] != NULL) &&
491 	(ret->atoms[0]->type == XML_REGEXP_STRING)) {
492 	int i, j, nbstates = 0, nbatoms = 0;
493 	int *stateRemap;
494 	int *stringRemap;
495 	int *transitions;
496 	void **transdata;
497 	xmlChar **stringMap;
498         xmlChar *value;
499 
500 	/*
501 	 * Switch to a compact representation
502 	 * 1/ counting the effective number of states left
503 	 * 2/ counting the unique number of atoms, and check that
504 	 *    they are all of the string type
505 	 * 3/ build a table state x atom for the transitions
506 	 */
507 
508 	stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
509 	if (stateRemap == NULL) {
510 	    xmlRegexpErrMemory(ctxt, "compiling regexp");
511 	    xmlFree(ret);
512 	    return(NULL);
513 	}
514 	for (i = 0;i < ret->nbStates;i++) {
515 	    if (ret->states[i] != NULL) {
516 		stateRemap[i] = nbstates;
517 		nbstates++;
518 	    } else {
519 		stateRemap[i] = -1;
520 	    }
521 	}
522 #ifdef DEBUG_COMPACTION
523 	printf("Final: %d states\n", nbstates);
524 #endif
525 	stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
526 	if (stringMap == NULL) {
527 	    xmlRegexpErrMemory(ctxt, "compiling regexp");
528 	    xmlFree(stateRemap);
529 	    xmlFree(ret);
530 	    return(NULL);
531 	}
532 	stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
533 	if (stringRemap == NULL) {
534 	    xmlRegexpErrMemory(ctxt, "compiling regexp");
535 	    xmlFree(stringMap);
536 	    xmlFree(stateRemap);
537 	    xmlFree(ret);
538 	    return(NULL);
539 	}
540 	for (i = 0;i < ret->nbAtoms;i++) {
541 	    if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
542 		(ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
543 		value = ret->atoms[i]->valuep;
544                 for (j = 0;j < nbatoms;j++) {
545 		    if (xmlStrEqual(stringMap[j], value)) {
546 			stringRemap[i] = j;
547 			break;
548 		    }
549 		}
550 		if (j >= nbatoms) {
551 		    stringRemap[i] = nbatoms;
552 		    stringMap[nbatoms] = xmlStrdup(value);
553 		    if (stringMap[nbatoms] == NULL) {
554 			for (i = 0;i < nbatoms;i++)
555 			    xmlFree(stringMap[i]);
556 			xmlFree(stringRemap);
557 			xmlFree(stringMap);
558 			xmlFree(stateRemap);
559 			xmlFree(ret);
560 			return(NULL);
561 		    }
562 		    nbatoms++;
563 		}
564 	    } else {
565 		xmlFree(stateRemap);
566 		xmlFree(stringRemap);
567 		for (i = 0;i < nbatoms;i++)
568 		    xmlFree(stringMap[i]);
569 		xmlFree(stringMap);
570 		xmlFree(ret);
571 		return(NULL);
572 	    }
573 	}
574 #ifdef DEBUG_COMPACTION
575 	printf("Final: %d atoms\n", nbatoms);
576 #endif
577 	transitions = (int *) xmlRegCalloc2(nbstates + 1, nbatoms + 1,
578                                             sizeof(int));
579 	if (transitions == NULL) {
580 	    xmlFree(stateRemap);
581 	    xmlFree(stringRemap);
582             for (i = 0;i < nbatoms;i++)
583 		xmlFree(stringMap[i]);
584 	    xmlFree(stringMap);
585 	    xmlFree(ret);
586 	    return(NULL);
587 	}
588 
589 	/*
590 	 * Allocate the transition table. The first entry for each
591 	 * state corresponds to the state type.
592 	 */
593 	transdata = NULL;
594 
595 	for (i = 0;i < ret->nbStates;i++) {
596 	    int stateno, atomno, targetno, prev;
597 	    xmlRegStatePtr state;
598 	    xmlRegTransPtr trans;
599 
600 	    stateno = stateRemap[i];
601 	    if (stateno == -1)
602 		continue;
603 	    state = ret->states[i];
604 
605 	    transitions[stateno * (nbatoms + 1)] = state->type;
606 
607 	    for (j = 0;j < state->nbTrans;j++) {
608 		trans = &(state->trans[j]);
609 		if ((trans->to == -1) || (trans->atom == NULL))
610 		    continue;
611                 atomno = stringRemap[trans->atom->no];
612 		if ((trans->atom->data != NULL) && (transdata == NULL)) {
613 		    transdata = (void **) xmlRegCalloc2(nbstates, nbatoms,
614 			                                sizeof(void *));
615 		    if (transdata == NULL) {
616 			xmlRegexpErrMemory(ctxt, "compiling regexp");
617 			break;
618 		    }
619 		}
620 		targetno = stateRemap[trans->to];
621 		/*
622 		 * if the same atom can generate transitions to 2 different
623 		 * states then it means the automata is not deterministic and
624 		 * the compact form can't be used !
625 		 */
626 		prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
627 		if (prev != 0) {
628 		    if (prev != targetno + 1) {
629 			ret->determinist = 0;
630 #ifdef DEBUG_COMPACTION
631 			printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
632 			       i, j, trans->atom->no, trans->to, atomno, targetno);
633 			printf("       previous to is %d\n", prev);
634 #endif
635 			if (transdata != NULL)
636 			    xmlFree(transdata);
637 			xmlFree(transitions);
638 			xmlFree(stateRemap);
639 			xmlFree(stringRemap);
640 			for (i = 0;i < nbatoms;i++)
641 			    xmlFree(stringMap[i]);
642 			xmlFree(stringMap);
643 			goto not_determ;
644 		    }
645 		} else {
646 #if 0
647 		    printf("State %d trans %d: atom %d to %d : %d to %d\n",
648 			   i, j, trans->atom->no, trans->to, atomno, targetno);
649 #endif
650 		    transitions[stateno * (nbatoms + 1) + atomno + 1] =
651 			targetno + 1; /* to avoid 0 */
652 		    if (transdata != NULL)
653 			transdata[stateno * nbatoms + atomno] =
654 			    trans->atom->data;
655 		}
656 	    }
657 	}
658 	ret->determinist = 1;
659 #ifdef DEBUG_COMPACTION
660 	/*
661 	 * Debug
662 	 */
663 	for (i = 0;i < nbstates;i++) {
664 	    for (j = 0;j < nbatoms + 1;j++) {
665                 printf("%02d ", transitions[i * (nbatoms + 1) + j]);
666 	    }
667 	    printf("\n");
668 	}
669 	printf("\n");
670 #endif
671 	/*
672 	 * Cleanup of the old data
673 	 */
674 	if (ret->states != NULL) {
675 	    for (i = 0;i < ret->nbStates;i++)
676 		xmlRegFreeState(ret->states[i]);
677 	    xmlFree(ret->states);
678 	}
679 	ret->states = NULL;
680 	ret->nbStates = 0;
681 	if (ret->atoms != NULL) {
682 	    for (i = 0;i < ret->nbAtoms;i++)
683 		xmlRegFreeAtom(ret->atoms[i]);
684 	    xmlFree(ret->atoms);
685 	}
686 	ret->atoms = NULL;
687 	ret->nbAtoms = 0;
688 
689 	ret->compact = transitions;
690 	ret->transdata = transdata;
691 	ret->stringMap = stringMap;
692 	ret->nbstrings = nbatoms;
693 	ret->nbstates = nbstates;
694 	xmlFree(stateRemap);
695 	xmlFree(stringRemap);
696     }
697 not_determ:
698     ctxt->string = NULL;
699     ctxt->nbStates = 0;
700     ctxt->states = NULL;
701     ctxt->nbAtoms = 0;
702     ctxt->atoms = NULL;
703     ctxt->nbCounters = 0;
704     ctxt->counters = NULL;
705     return(ret);
706 }
707 
708 /**
709  * xmlRegNewParserCtxt:
710  * @string:  the string to parse
711  *
712  * Allocate a new regexp parser context
713  *
714  * Returns the new context or NULL in case of error
715  */
716 static xmlRegParserCtxtPtr
xmlRegNewParserCtxt(const xmlChar * string)717 xmlRegNewParserCtxt(const xmlChar *string) {
718     xmlRegParserCtxtPtr ret;
719 
720     ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
721     if (ret == NULL)
722 	return(NULL);
723     memset(ret, 0, sizeof(xmlRegParserCtxt));
724     if (string != NULL)
725 	ret->string = xmlStrdup(string);
726     ret->cur = ret->string;
727     ret->neg = 0;
728     ret->negs = 0;
729     ret->error = 0;
730     ret->determinist = -1;
731     return(ret);
732 }
733 
734 /**
735  * xmlRegNewRange:
736  * @ctxt:  the regexp parser context
737  * @neg:  is that negative
738  * @type:  the type of range
739  * @start:  the start codepoint
740  * @end:  the end codepoint
741  *
742  * Allocate a new regexp range
743  *
744  * Returns the new range or NULL in case of error
745  */
746 static xmlRegRangePtr
xmlRegNewRange(xmlRegParserCtxtPtr ctxt,int neg,xmlRegAtomType type,int start,int end)747 xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
748 	       int neg, xmlRegAtomType type, int start, int end) {
749     xmlRegRangePtr ret;
750 
751     ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
752     if (ret == NULL) {
753 	xmlRegexpErrMemory(ctxt, "allocating range");
754 	return(NULL);
755     }
756     ret->neg = neg;
757     ret->type = type;
758     ret->start = start;
759     ret->end = end;
760     return(ret);
761 }
762 
763 /**
764  * xmlRegFreeRange:
765  * @range:  the regexp range
766  *
767  * Free a regexp range
768  */
769 static void
xmlRegFreeRange(xmlRegRangePtr range)770 xmlRegFreeRange(xmlRegRangePtr range) {
771     if (range == NULL)
772 	return;
773 
774     if (range->blockName != NULL)
775 	xmlFree(range->blockName);
776     xmlFree(range);
777 }
778 
779 /**
780  * xmlRegCopyRange:
781  * @range:  the regexp range
782  *
783  * Copy a regexp range
784  *
785  * Returns the new copy or NULL in case of error.
786  */
787 static xmlRegRangePtr
xmlRegCopyRange(xmlRegParserCtxtPtr ctxt,xmlRegRangePtr range)788 xmlRegCopyRange(xmlRegParserCtxtPtr ctxt, xmlRegRangePtr range) {
789     xmlRegRangePtr ret;
790 
791     if (range == NULL)
792 	return(NULL);
793 
794     ret = xmlRegNewRange(ctxt, range->neg, range->type, range->start,
795                          range->end);
796     if (ret == NULL)
797         return(NULL);
798     if (range->blockName != NULL) {
799 	ret->blockName = xmlStrdup(range->blockName);
800 	if (ret->blockName == NULL) {
801 	    xmlRegexpErrMemory(ctxt, "allocating range");
802 	    xmlRegFreeRange(ret);
803 	    return(NULL);
804 	}
805     }
806     return(ret);
807 }
808 
809 /**
810  * xmlRegNewAtom:
811  * @ctxt:  the regexp parser context
812  * @type:  the type of atom
813  *
814  * Allocate a new atom
815  *
816  * Returns the new atom or NULL in case of error
817  */
818 static xmlRegAtomPtr
xmlRegNewAtom(xmlRegParserCtxtPtr ctxt,xmlRegAtomType type)819 xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
820     xmlRegAtomPtr ret;
821 
822     ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
823     if (ret == NULL) {
824 	xmlRegexpErrMemory(ctxt, "allocating atom");
825 	return(NULL);
826     }
827     memset(ret, 0, sizeof(xmlRegAtom));
828     ret->type = type;
829     ret->quant = XML_REGEXP_QUANT_ONCE;
830     ret->min = 0;
831     ret->max = 0;
832     return(ret);
833 }
834 
835 /**
836  * xmlRegFreeAtom:
837  * @atom:  the regexp atom
838  *
839  * Free a regexp atom
840  */
841 static void
xmlRegFreeAtom(xmlRegAtomPtr atom)842 xmlRegFreeAtom(xmlRegAtomPtr atom) {
843     int i;
844 
845     if (atom == NULL)
846 	return;
847 
848     for (i = 0;i < atom->nbRanges;i++)
849 	xmlRegFreeRange(atom->ranges[i]);
850     if (atom->ranges != NULL)
851 	xmlFree(atom->ranges);
852     if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL))
853 	xmlFree(atom->valuep);
854     if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL))
855 	xmlFree(atom->valuep2);
856     if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL))
857 	xmlFree(atom->valuep);
858     xmlFree(atom);
859 }
860 
861 /**
862  * xmlRegCopyAtom:
863  * @ctxt:  the regexp parser context
864  * @atom:  the original atom
865  *
866  * Allocate a new regexp range
867  *
868  * Returns the new atom or NULL in case of error
869  */
870 static xmlRegAtomPtr
xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt,xmlRegAtomPtr atom)871 xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
872     xmlRegAtomPtr ret;
873 
874     ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
875     if (ret == NULL) {
876 	xmlRegexpErrMemory(ctxt, "copying atom");
877 	return(NULL);
878     }
879     memset(ret, 0, sizeof(xmlRegAtom));
880     ret->type = atom->type;
881     ret->quant = atom->quant;
882     ret->min = atom->min;
883     ret->max = atom->max;
884     if (atom->nbRanges > 0) {
885         int i;
886 
887         ret->ranges = (xmlRegRangePtr *) xmlMalloc(sizeof(xmlRegRangePtr) *
888 	                                           atom->nbRanges);
889 	if (ret->ranges == NULL) {
890 	    xmlRegexpErrMemory(ctxt, "copying atom");
891 	    goto error;
892 	}
893 	for (i = 0;i < atom->nbRanges;i++) {
894 	    ret->ranges[i] = xmlRegCopyRange(ctxt, atom->ranges[i]);
895 	    if (ret->ranges[i] == NULL)
896 	        goto error;
897 	    ret->nbRanges = i + 1;
898 	}
899     }
900     return(ret);
901 
902 error:
903     xmlRegFreeAtom(ret);
904     return(NULL);
905 }
906 
907 static xmlRegStatePtr
xmlRegNewState(xmlRegParserCtxtPtr ctxt)908 xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
909     xmlRegStatePtr ret;
910 
911     ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
912     if (ret == NULL) {
913 	xmlRegexpErrMemory(ctxt, "allocating state");
914 	return(NULL);
915     }
916     memset(ret, 0, sizeof(xmlRegState));
917     ret->type = XML_REGEXP_TRANS_STATE;
918     ret->mark = XML_REGEXP_MARK_NORMAL;
919     return(ret);
920 }
921 
922 /**
923  * xmlRegFreeState:
924  * @state:  the regexp state
925  *
926  * Free a regexp state
927  */
928 static void
xmlRegFreeState(xmlRegStatePtr state)929 xmlRegFreeState(xmlRegStatePtr state) {
930     if (state == NULL)
931 	return;
932 
933     if (state->trans != NULL)
934 	xmlFree(state->trans);
935     if (state->transTo != NULL)
936 	xmlFree(state->transTo);
937     xmlFree(state);
938 }
939 
940 /**
941  * xmlRegFreeParserCtxt:
942  * @ctxt:  the regexp parser context
943  *
944  * Free a regexp parser context
945  */
946 static void
xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt)947 xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
948     int i;
949     if (ctxt == NULL)
950 	return;
951 
952     if (ctxt->string != NULL)
953 	xmlFree(ctxt->string);
954     if (ctxt->states != NULL) {
955 	for (i = 0;i < ctxt->nbStates;i++)
956 	    xmlRegFreeState(ctxt->states[i]);
957 	xmlFree(ctxt->states);
958     }
959     if (ctxt->atoms != NULL) {
960 	for (i = 0;i < ctxt->nbAtoms;i++)
961 	    xmlRegFreeAtom(ctxt->atoms[i]);
962 	xmlFree(ctxt->atoms);
963     }
964     if (ctxt->counters != NULL)
965 	xmlFree(ctxt->counters);
966     xmlFree(ctxt);
967 }
968 
969 /************************************************************************
970  *									*
971  *			Display of Data structures			*
972  *									*
973  ************************************************************************/
974 
975 static void
xmlRegPrintAtomType(FILE * output,xmlRegAtomType type)976 xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
977     switch (type) {
978         case XML_REGEXP_EPSILON:
979 	    fprintf(output, "epsilon "); break;
980         case XML_REGEXP_CHARVAL:
981 	    fprintf(output, "charval "); break;
982         case XML_REGEXP_RANGES:
983 	    fprintf(output, "ranges "); break;
984         case XML_REGEXP_SUBREG:
985 	    fprintf(output, "subexpr "); break;
986         case XML_REGEXP_STRING:
987 	    fprintf(output, "string "); break;
988         case XML_REGEXP_ANYCHAR:
989 	    fprintf(output, "anychar "); break;
990         case XML_REGEXP_ANYSPACE:
991 	    fprintf(output, "anyspace "); break;
992         case XML_REGEXP_NOTSPACE:
993 	    fprintf(output, "notspace "); break;
994         case XML_REGEXP_INITNAME:
995 	    fprintf(output, "initname "); break;
996         case XML_REGEXP_NOTINITNAME:
997 	    fprintf(output, "notinitname "); break;
998         case XML_REGEXP_NAMECHAR:
999 	    fprintf(output, "namechar "); break;
1000         case XML_REGEXP_NOTNAMECHAR:
1001 	    fprintf(output, "notnamechar "); break;
1002         case XML_REGEXP_DECIMAL:
1003 	    fprintf(output, "decimal "); break;
1004         case XML_REGEXP_NOTDECIMAL:
1005 	    fprintf(output, "notdecimal "); break;
1006         case XML_REGEXP_REALCHAR:
1007 	    fprintf(output, "realchar "); break;
1008         case XML_REGEXP_NOTREALCHAR:
1009 	    fprintf(output, "notrealchar "); break;
1010         case XML_REGEXP_LETTER:
1011             fprintf(output, "LETTER "); break;
1012         case XML_REGEXP_LETTER_UPPERCASE:
1013             fprintf(output, "LETTER_UPPERCASE "); break;
1014         case XML_REGEXP_LETTER_LOWERCASE:
1015             fprintf(output, "LETTER_LOWERCASE "); break;
1016         case XML_REGEXP_LETTER_TITLECASE:
1017             fprintf(output, "LETTER_TITLECASE "); break;
1018         case XML_REGEXP_LETTER_MODIFIER:
1019             fprintf(output, "LETTER_MODIFIER "); break;
1020         case XML_REGEXP_LETTER_OTHERS:
1021             fprintf(output, "LETTER_OTHERS "); break;
1022         case XML_REGEXP_MARK:
1023             fprintf(output, "MARK "); break;
1024         case XML_REGEXP_MARK_NONSPACING:
1025             fprintf(output, "MARK_NONSPACING "); break;
1026         case XML_REGEXP_MARK_SPACECOMBINING:
1027             fprintf(output, "MARK_SPACECOMBINING "); break;
1028         case XML_REGEXP_MARK_ENCLOSING:
1029             fprintf(output, "MARK_ENCLOSING "); break;
1030         case XML_REGEXP_NUMBER:
1031             fprintf(output, "NUMBER "); break;
1032         case XML_REGEXP_NUMBER_DECIMAL:
1033             fprintf(output, "NUMBER_DECIMAL "); break;
1034         case XML_REGEXP_NUMBER_LETTER:
1035             fprintf(output, "NUMBER_LETTER "); break;
1036         case XML_REGEXP_NUMBER_OTHERS:
1037             fprintf(output, "NUMBER_OTHERS "); break;
1038         case XML_REGEXP_PUNCT:
1039             fprintf(output, "PUNCT "); break;
1040         case XML_REGEXP_PUNCT_CONNECTOR:
1041             fprintf(output, "PUNCT_CONNECTOR "); break;
1042         case XML_REGEXP_PUNCT_DASH:
1043             fprintf(output, "PUNCT_DASH "); break;
1044         case XML_REGEXP_PUNCT_OPEN:
1045             fprintf(output, "PUNCT_OPEN "); break;
1046         case XML_REGEXP_PUNCT_CLOSE:
1047             fprintf(output, "PUNCT_CLOSE "); break;
1048         case XML_REGEXP_PUNCT_INITQUOTE:
1049             fprintf(output, "PUNCT_INITQUOTE "); break;
1050         case XML_REGEXP_PUNCT_FINQUOTE:
1051             fprintf(output, "PUNCT_FINQUOTE "); break;
1052         case XML_REGEXP_PUNCT_OTHERS:
1053             fprintf(output, "PUNCT_OTHERS "); break;
1054         case XML_REGEXP_SEPAR:
1055             fprintf(output, "SEPAR "); break;
1056         case XML_REGEXP_SEPAR_SPACE:
1057             fprintf(output, "SEPAR_SPACE "); break;
1058         case XML_REGEXP_SEPAR_LINE:
1059             fprintf(output, "SEPAR_LINE "); break;
1060         case XML_REGEXP_SEPAR_PARA:
1061             fprintf(output, "SEPAR_PARA "); break;
1062         case XML_REGEXP_SYMBOL:
1063             fprintf(output, "SYMBOL "); break;
1064         case XML_REGEXP_SYMBOL_MATH:
1065             fprintf(output, "SYMBOL_MATH "); break;
1066         case XML_REGEXP_SYMBOL_CURRENCY:
1067             fprintf(output, "SYMBOL_CURRENCY "); break;
1068         case XML_REGEXP_SYMBOL_MODIFIER:
1069             fprintf(output, "SYMBOL_MODIFIER "); break;
1070         case XML_REGEXP_SYMBOL_OTHERS:
1071             fprintf(output, "SYMBOL_OTHERS "); break;
1072         case XML_REGEXP_OTHER:
1073             fprintf(output, "OTHER "); break;
1074         case XML_REGEXP_OTHER_CONTROL:
1075             fprintf(output, "OTHER_CONTROL "); break;
1076         case XML_REGEXP_OTHER_FORMAT:
1077             fprintf(output, "OTHER_FORMAT "); break;
1078         case XML_REGEXP_OTHER_PRIVATE:
1079             fprintf(output, "OTHER_PRIVATE "); break;
1080         case XML_REGEXP_OTHER_NA:
1081             fprintf(output, "OTHER_NA "); break;
1082         case XML_REGEXP_BLOCK_NAME:
1083 	    fprintf(output, "BLOCK "); break;
1084     }
1085 }
1086 
1087 static void
xmlRegPrintQuantType(FILE * output,xmlRegQuantType type)1088 xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
1089     switch (type) {
1090         case XML_REGEXP_QUANT_EPSILON:
1091 	    fprintf(output, "epsilon "); break;
1092         case XML_REGEXP_QUANT_ONCE:
1093 	    fprintf(output, "once "); break;
1094         case XML_REGEXP_QUANT_OPT:
1095 	    fprintf(output, "? "); break;
1096         case XML_REGEXP_QUANT_MULT:
1097 	    fprintf(output, "* "); break;
1098         case XML_REGEXP_QUANT_PLUS:
1099 	    fprintf(output, "+ "); break;
1100 	case XML_REGEXP_QUANT_RANGE:
1101 	    fprintf(output, "range "); break;
1102 	case XML_REGEXP_QUANT_ONCEONLY:
1103 	    fprintf(output, "onceonly "); break;
1104 	case XML_REGEXP_QUANT_ALL:
1105 	    fprintf(output, "all "); break;
1106     }
1107 }
1108 static void
xmlRegPrintRange(FILE * output,xmlRegRangePtr range)1109 xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
1110     fprintf(output, "  range: ");
1111     if (range->neg)
1112 	fprintf(output, "negative ");
1113     xmlRegPrintAtomType(output, range->type);
1114     fprintf(output, "%c - %c\n", range->start, range->end);
1115 }
1116 
1117 static void
xmlRegPrintAtom(FILE * output,xmlRegAtomPtr atom)1118 xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
1119     fprintf(output, " atom: ");
1120     if (atom == NULL) {
1121 	fprintf(output, "NULL\n");
1122 	return;
1123     }
1124     if (atom->neg)
1125         fprintf(output, "not ");
1126     xmlRegPrintAtomType(output, atom->type);
1127     xmlRegPrintQuantType(output, atom->quant);
1128     if (atom->quant == XML_REGEXP_QUANT_RANGE)
1129 	fprintf(output, "%d-%d ", atom->min, atom->max);
1130     if (atom->type == XML_REGEXP_STRING)
1131 	fprintf(output, "'%s' ", (char *) atom->valuep);
1132     if (atom->type == XML_REGEXP_CHARVAL)
1133 	fprintf(output, "char %c\n", atom->codepoint);
1134     else if (atom->type == XML_REGEXP_RANGES) {
1135 	int i;
1136 	fprintf(output, "%d entries\n", atom->nbRanges);
1137 	for (i = 0; i < atom->nbRanges;i++)
1138 	    xmlRegPrintRange(output, atom->ranges[i]);
1139     } else if (atom->type == XML_REGEXP_SUBREG) {
1140 	fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
1141     } else {
1142 	fprintf(output, "\n");
1143     }
1144 }
1145 
1146 static void
xmlRegPrintTrans(FILE * output,xmlRegTransPtr trans)1147 xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
1148     fprintf(output, "  trans: ");
1149     if (trans == NULL) {
1150 	fprintf(output, "NULL\n");
1151 	return;
1152     }
1153     if (trans->to < 0) {
1154 	fprintf(output, "removed\n");
1155 	return;
1156     }
1157     if (trans->nd != 0) {
1158 	if (trans->nd == 2)
1159 	    fprintf(output, "last not determinist, ");
1160 	else
1161 	    fprintf(output, "not determinist, ");
1162     }
1163     if (trans->counter >= 0) {
1164 	fprintf(output, "counted %d, ", trans->counter);
1165     }
1166     if (trans->count == REGEXP_ALL_COUNTER) {
1167 	fprintf(output, "all transition, ");
1168     } else if (trans->count >= 0) {
1169 	fprintf(output, "count based %d, ", trans->count);
1170     }
1171     if (trans->atom == NULL) {
1172 	fprintf(output, "epsilon to %d\n", trans->to);
1173 	return;
1174     }
1175     if (trans->atom->type == XML_REGEXP_CHARVAL)
1176 	fprintf(output, "char %c ", trans->atom->codepoint);
1177     fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
1178 }
1179 
1180 static void
xmlRegPrintState(FILE * output,xmlRegStatePtr state)1181 xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
1182     int i;
1183 
1184     fprintf(output, " state: ");
1185     if (state == NULL) {
1186 	fprintf(output, "NULL\n");
1187 	return;
1188     }
1189     if (state->type == XML_REGEXP_START_STATE)
1190 	fprintf(output, "START ");
1191     if (state->type == XML_REGEXP_FINAL_STATE)
1192 	fprintf(output, "FINAL ");
1193 
1194     fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
1195     for (i = 0;i < state->nbTrans; i++) {
1196 	xmlRegPrintTrans(output, &(state->trans[i]));
1197     }
1198 }
1199 
#ifdef DEBUG_REGEXP_GRAPH
/**
 * xmlRegPrintCtxt:
 * @output:  the stream to dump to
 * @ctxt:  the regexp parser context to describe, may be NULL
 *
 * Debug helper (only built with DEBUG_REGEXP_GRAPH): dumps the expression
 * string, the error/neg flags, every atom, the current atom, every state
 * and every counter of @ctxt.
 */
static void
xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
    int idx;

    fprintf(output, " ctxt: ");
    if (ctxt == NULL) {
	fprintf(output, "NULL\n");
	return;
    }

    /* header line: expression and flags */
    fprintf(output, "'%s' ", ctxt->string);
    if (ctxt->error)
	fprintf(output, "error ");
    if (ctxt->neg)
	fprintf(output, "neg ");
    fprintf(output, "\n");

    /* atoms */
    fprintf(output, "%d atoms:\n", ctxt->nbAtoms);
    for (idx = 0; idx < ctxt->nbAtoms; idx++) {
	fprintf(output, " %02d ", idx);
	xmlRegPrintAtom(output, ctxt->atoms[idx]);
    }
    if (ctxt->atom != NULL) {
	fprintf(output, "current atom:\n");
	xmlRegPrintAtom(output, ctxt->atom);
    }

    /* states */
    fprintf(output, "%d states:", ctxt->nbStates);
    if (ctxt->start != NULL)
	fprintf(output, " start: %d", ctxt->start->no);
    if (ctxt->end != NULL)
	fprintf(output, " end: %d", ctxt->end->no);
    fprintf(output, "\n");
    for (idx = 0; idx < ctxt->nbStates; idx++)
	xmlRegPrintState(output, ctxt->states[idx]);

    /* counters */
    fprintf(output, "%d counters:\n", ctxt->nbCounters);
    for (idx = 0; idx < ctxt->nbCounters; idx++) {
	fprintf(output, " %d: min %d max %d\n", idx,
		ctxt->counters[idx].min, ctxt->counters[idx].max);
    }
}
#endif
1241 
1242 /************************************************************************
1243  *									*
1244  *		 Finite Automata structures manipulations		*
1245  *									*
1246  ************************************************************************/
1247 
1248 static void
xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt,xmlRegAtomPtr atom,int neg,xmlRegAtomType type,int start,int end,xmlChar * blockName)1249 xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
1250 	           int neg, xmlRegAtomType type, int start, int end,
1251 		   xmlChar *blockName) {
1252     xmlRegRangePtr range;
1253 
1254     if (atom == NULL) {
1255 	ERROR("add range: atom is NULL");
1256 	return;
1257     }
1258     if (atom->type != XML_REGEXP_RANGES) {
1259 	ERROR("add range: atom is not ranges");
1260 	return;
1261     }
1262     if (atom->maxRanges == 0) {
1263 	atom->maxRanges = 4;
1264 	atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges *
1265 		                             sizeof(xmlRegRangePtr));
1266 	if (atom->ranges == NULL) {
1267 	    xmlRegexpErrMemory(ctxt, "adding ranges");
1268 	    atom->maxRanges = 0;
1269 	    return;
1270 	}
1271     } else if (atom->nbRanges >= atom->maxRanges) {
1272 	xmlRegRangePtr *tmp;
1273 	atom->maxRanges *= 2;
1274 	tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges *
1275 		                             sizeof(xmlRegRangePtr));
1276 	if (tmp == NULL) {
1277 	    xmlRegexpErrMemory(ctxt, "adding ranges");
1278 	    atom->maxRanges /= 2;
1279 	    return;
1280 	}
1281 	atom->ranges = tmp;
1282     }
1283     range = xmlRegNewRange(ctxt, neg, type, start, end);
1284     if (range == NULL)
1285 	return;
1286     range->blockName = blockName;
1287     atom->ranges[atom->nbRanges++] = range;
1288 
1289 }
1290 
1291 static int
xmlRegGetCounter(xmlRegParserCtxtPtr ctxt)1292 xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
1293     if (ctxt->maxCounters == 0) {
1294 	ctxt->maxCounters = 4;
1295 	ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters *
1296 		                             sizeof(xmlRegCounter));
1297 	if (ctxt->counters == NULL) {
1298 	    xmlRegexpErrMemory(ctxt, "allocating counter");
1299 	    ctxt->maxCounters = 0;
1300 	    return(-1);
1301 	}
1302     } else if (ctxt->nbCounters >= ctxt->maxCounters) {
1303 	xmlRegCounter *tmp;
1304 	ctxt->maxCounters *= 2;
1305 	tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters *
1306 		                           sizeof(xmlRegCounter));
1307 	if (tmp == NULL) {
1308 	    xmlRegexpErrMemory(ctxt, "allocating counter");
1309 	    ctxt->maxCounters /= 2;
1310 	    return(-1);
1311 	}
1312 	ctxt->counters = tmp;
1313     }
1314     ctxt->counters[ctxt->nbCounters].min = -1;
1315     ctxt->counters[ctxt->nbCounters].max = -1;
1316     return(ctxt->nbCounters++);
1317 }
1318 
1319 static int
xmlRegAtomPush(xmlRegParserCtxtPtr ctxt,xmlRegAtomPtr atom)1320 xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
1321     if (atom == NULL) {
1322 	ERROR("atom push: atom is NULL");
1323 	return(-1);
1324     }
1325     if (ctxt->maxAtoms == 0) {
1326 	ctxt->maxAtoms = 4;
1327 	ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms *
1328 		                             sizeof(xmlRegAtomPtr));
1329 	if (ctxt->atoms == NULL) {
1330 	    xmlRegexpErrMemory(ctxt, "pushing atom");
1331 	    ctxt->maxAtoms = 0;
1332 	    return(-1);
1333 	}
1334     } else if (ctxt->nbAtoms >= ctxt->maxAtoms) {
1335 	xmlRegAtomPtr *tmp;
1336 	ctxt->maxAtoms *= 2;
1337 	tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms *
1338 		                             sizeof(xmlRegAtomPtr));
1339 	if (tmp == NULL) {
1340 	    xmlRegexpErrMemory(ctxt, "allocating counter");
1341 	    ctxt->maxAtoms /= 2;
1342 	    return(-1);
1343 	}
1344 	ctxt->atoms = tmp;
1345     }
1346     atom->no = ctxt->nbAtoms;
1347     ctxt->atoms[ctxt->nbAtoms++] = atom;
1348     return(0);
1349 }
1350 
1351 static void
xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr target,int from)1352 xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target,
1353                       int from) {
1354     if (target->maxTransTo == 0) {
1355 	target->maxTransTo = 8;
1356 	target->transTo = (int *) xmlMalloc(target->maxTransTo *
1357 		                             sizeof(int));
1358 	if (target->transTo == NULL) {
1359 	    xmlRegexpErrMemory(ctxt, "adding transition");
1360 	    target->maxTransTo = 0;
1361 	    return;
1362 	}
1363     } else if (target->nbTransTo >= target->maxTransTo) {
1364 	int *tmp;
1365 	target->maxTransTo *= 2;
1366 	tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo *
1367 		                             sizeof(int));
1368 	if (tmp == NULL) {
1369 	    xmlRegexpErrMemory(ctxt, "adding transition");
1370 	    target->maxTransTo /= 2;
1371 	    return;
1372 	}
1373 	target->transTo = tmp;
1374     }
1375     target->transTo[target->nbTransTo] = from;
1376     target->nbTransTo++;
1377 }
1378 
1379 static void
xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr state,xmlRegAtomPtr atom,xmlRegStatePtr target,int counter,int count)1380 xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1381 	            xmlRegAtomPtr atom, xmlRegStatePtr target,
1382 		    int counter, int count) {
1383 
1384     int nrtrans;
1385 
1386     if (state == NULL) {
1387 	ERROR("add state: state is NULL");
1388 	return;
1389     }
1390     if (target == NULL) {
1391 	ERROR("add state: target is NULL");
1392 	return;
1393     }
1394     /*
1395      * Other routines follow the philosophy 'When in doubt, add a transition'
1396      * so we check here whether such a transition is already present and, if
1397      * so, silently ignore this request.
1398      */
1399 
1400     for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) {
1401 	xmlRegTransPtr trans = &(state->trans[nrtrans]);
1402 	if ((trans->atom == atom) &&
1403 	    (trans->to == target->no) &&
1404 	    (trans->counter == counter) &&
1405 	    (trans->count == count)) {
1406 #ifdef DEBUG_REGEXP_GRAPH
1407 	    printf("Ignoring duplicate transition from %d to %d\n",
1408 		    state->no, target->no);
1409 #endif
1410 	    return;
1411 	}
1412     }
1413 
1414     if (state->maxTrans == 0) {
1415 	state->maxTrans = 8;
1416 	state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
1417 		                             sizeof(xmlRegTrans));
1418 	if (state->trans == NULL) {
1419 	    xmlRegexpErrMemory(ctxt, "adding transition");
1420 	    state->maxTrans = 0;
1421 	    return;
1422 	}
1423     } else if (state->nbTrans >= state->maxTrans) {
1424 	xmlRegTrans *tmp;
1425 	state->maxTrans *= 2;
1426 	tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans *
1427 		                             sizeof(xmlRegTrans));
1428 	if (tmp == NULL) {
1429 	    xmlRegexpErrMemory(ctxt, "adding transition");
1430 	    state->maxTrans /= 2;
1431 	    return;
1432 	}
1433 	state->trans = tmp;
1434     }
1435 #ifdef DEBUG_REGEXP_GRAPH
1436     printf("Add trans from %d to %d ", state->no, target->no);
1437     if (count == REGEXP_ALL_COUNTER)
1438 	printf("all transition\n");
1439     else if (count >= 0)
1440 	printf("count based %d\n", count);
1441     else if (counter >= 0)
1442 	printf("counted %d\n", counter);
1443     else if (atom == NULL)
1444 	printf("epsilon transition\n");
1445     else if (atom != NULL)
1446         xmlRegPrintAtom(stdout, atom);
1447 #endif
1448 
1449     state->trans[state->nbTrans].atom = atom;
1450     state->trans[state->nbTrans].to = target->no;
1451     state->trans[state->nbTrans].counter = counter;
1452     state->trans[state->nbTrans].count = count;
1453     state->trans[state->nbTrans].nd = 0;
1454     state->nbTrans++;
1455     xmlRegStateAddTransTo(ctxt, target, state->no);
1456 }
1457 
1458 static int
xmlRegStatePush(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr state)1459 xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
1460     if (state == NULL) return(-1);
1461     if (ctxt->maxStates == 0) {
1462 	ctxt->maxStates = 4;
1463 	ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates *
1464 		                             sizeof(xmlRegStatePtr));
1465 	if (ctxt->states == NULL) {
1466 	    xmlRegexpErrMemory(ctxt, "adding state");
1467 	    ctxt->maxStates = 0;
1468 	    return(-1);
1469 	}
1470     } else if (ctxt->nbStates >= ctxt->maxStates) {
1471 	xmlRegStatePtr *tmp;
1472 	ctxt->maxStates *= 2;
1473 	tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates *
1474 		                             sizeof(xmlRegStatePtr));
1475 	if (tmp == NULL) {
1476 	    xmlRegexpErrMemory(ctxt, "adding state");
1477 	    ctxt->maxStates /= 2;
1478 	    return(-1);
1479 	}
1480 	ctxt->states = tmp;
1481     }
1482     state->no = ctxt->nbStates;
1483     ctxt->states[ctxt->nbStates++] = state;
1484     return(0);
1485 }
1486 
1487 /**
1488  * xmlFAGenerateAllTransition:
1489  * @ctxt:  a regexp parser context
1490  * @from:  the from state
1491  * @to:  the target state or NULL for building a new one
1492  * @lax:
1493  *
1494  */
1495 static void
xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,int lax)1496 xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
1497 			   xmlRegStatePtr from, xmlRegStatePtr to,
1498 			   int lax) {
1499     if (to == NULL) {
1500 	to = xmlRegNewState(ctxt);
1501 	xmlRegStatePush(ctxt, to);
1502 	ctxt->state = to;
1503     }
1504     if (lax)
1505 	xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
1506     else
1507 	xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
1508 }
1509 
1510 /**
1511  * xmlFAGenerateEpsilonTransition:
1512  * @ctxt:  a regexp parser context
1513  * @from:  the from state
1514  * @to:  the target state or NULL for building a new one
1515  *
1516  */
1517 static void
xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to)1518 xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1519 			       xmlRegStatePtr from, xmlRegStatePtr to) {
1520     if (to == NULL) {
1521 	to = xmlRegNewState(ctxt);
1522 	xmlRegStatePush(ctxt, to);
1523 	ctxt->state = to;
1524     }
1525     xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
1526 }
1527 
1528 /**
1529  * xmlFAGenerateCountedEpsilonTransition:
1530  * @ctxt:  a regexp parser context
1531  * @from:  the from state
1532  * @to:  the target state or NULL for building a new one
1533  * counter:  the counter for that transition
1534  *
1535  */
1536 static void
xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,int counter)1537 xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1538 	    xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1539     if (to == NULL) {
1540 	to = xmlRegNewState(ctxt);
1541 	xmlRegStatePush(ctxt, to);
1542 	ctxt->state = to;
1543     }
1544     xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
1545 }
1546 
1547 /**
1548  * xmlFAGenerateCountedTransition:
1549  * @ctxt:  a regexp parser context
1550  * @from:  the from state
1551  * @to:  the target state or NULL for building a new one
1552  * counter:  the counter for that transition
1553  *
1554  */
1555 static void
xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,int counter)1556 xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
1557 	    xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1558     if (to == NULL) {
1559 	to = xmlRegNewState(ctxt);
1560 	xmlRegStatePush(ctxt, to);
1561 	ctxt->state = to;
1562     }
1563     xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
1564 }
1565 
1566 /**
1567  * xmlFAGenerateTransitions:
1568  * @ctxt:  a regexp parser context
1569  * @from:  the from state
1570  * @to:  the target state or NULL for building a new one
1571  * @atom:  the atom generating the transition
1572  *
1573  * Returns 0 if success and -1 in case of error.
1574  */
1575 static int
xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,xmlRegAtomPtr atom)1576 xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
1577 	                 xmlRegStatePtr to, xmlRegAtomPtr atom) {
1578     xmlRegStatePtr end;
1579     int nullable = 0;
1580 
1581     if (atom == NULL) {
1582 	ERROR("generate transition: atom == NULL");
1583 	return(-1);
1584     }
1585     if (atom->type == XML_REGEXP_SUBREG) {
1586 	/*
1587 	 * this is a subexpression handling one should not need to
1588 	 * create a new node except for XML_REGEXP_QUANT_RANGE.
1589 	 */
1590 	if (xmlRegAtomPush(ctxt, atom) < 0) {
1591 	    return(-1);
1592 	}
1593 	if ((to != NULL) && (atom->stop != to) &&
1594 	    (atom->quant != XML_REGEXP_QUANT_RANGE)) {
1595 	    /*
1596 	     * Generate an epsilon transition to link to the target
1597 	     */
1598 	    xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1599 #ifdef DV
1600 	} else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) &&
1601 		   (atom->quant != XML_REGEXP_QUANT_ONCE)) {
1602 	    to = xmlRegNewState(ctxt);
1603 	    xmlRegStatePush(ctxt, to);
1604 	    ctxt->state = to;
1605 	    xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1606 #endif
1607 	}
1608 	switch (atom->quant) {
1609 	    case XML_REGEXP_QUANT_OPT:
1610 		atom->quant = XML_REGEXP_QUANT_ONCE;
1611 		/*
1612 		 * transition done to the state after end of atom.
1613 		 *      1. set transition from atom start to new state
1614 		 *      2. set transition from atom end to this state.
1615 		 */
1616                 if (to == NULL) {
1617                     xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0);
1618                     xmlFAGenerateEpsilonTransition(ctxt, atom->stop,
1619                                                    ctxt->state);
1620                 } else {
1621                     xmlFAGenerateEpsilonTransition(ctxt, atom->start, to);
1622                 }
1623 		break;
1624 	    case XML_REGEXP_QUANT_MULT:
1625 		atom->quant = XML_REGEXP_QUANT_ONCE;
1626 		xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1627 		xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1628 		break;
1629 	    case XML_REGEXP_QUANT_PLUS:
1630 		atom->quant = XML_REGEXP_QUANT_ONCE;
1631 		xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1632 		break;
1633 	    case XML_REGEXP_QUANT_RANGE: {
1634 		int counter;
1635 		xmlRegStatePtr inter, newstate;
1636 
1637 		/*
1638 		 * create the final state now if needed
1639 		 */
1640 		if (to != NULL) {
1641 		    newstate = to;
1642 		} else {
1643 		    newstate = xmlRegNewState(ctxt);
1644 		    xmlRegStatePush(ctxt, newstate);
1645 		}
1646 
1647 		/*
1648 		 * The principle here is to use counted transition
1649 		 * to avoid explosion in the number of states in the
1650 		 * graph. This is clearly more complex but should not
1651 		 * be exploitable at runtime.
1652 		 */
1653 		if ((atom->min == 0) && (atom->start0 == NULL)) {
1654 		    xmlRegAtomPtr copy;
1655 		    /*
1656 		     * duplicate a transition based on atom to count next
1657 		     * occurrences after 1. We cannot loop to atom->start
1658 		     * directly because we need an epsilon transition to
1659 		     * newstate.
1660 		     */
1661 		     /* ???? For some reason it seems we never reach that
1662 		        case, I suppose this got optimized out before when
1663 			building the automata */
1664 		    copy = xmlRegCopyAtom(ctxt, atom);
1665 		    if (copy == NULL)
1666 		        return(-1);
1667 		    copy->quant = XML_REGEXP_QUANT_ONCE;
1668 		    copy->min = 0;
1669 		    copy->max = 0;
1670 
1671 		    if (xmlFAGenerateTransitions(ctxt, atom->start, NULL, copy)
1672 		        < 0)
1673 			return(-1);
1674 		    inter = ctxt->state;
1675 		    counter = xmlRegGetCounter(ctxt);
1676 		    ctxt->counters[counter].min = atom->min - 1;
1677 		    ctxt->counters[counter].max = atom->max - 1;
1678 		    /* count the number of times we see it again */
1679 		    xmlFAGenerateCountedEpsilonTransition(ctxt, inter,
1680 						   atom->stop, counter);
1681 		    /* allow a way out based on the count */
1682 		    xmlFAGenerateCountedTransition(ctxt, inter,
1683 			                           newstate, counter);
1684 		    /* and also allow a direct exit for 0 */
1685 		    xmlFAGenerateEpsilonTransition(ctxt, atom->start,
1686 		                                   newstate);
1687 		} else {
1688 		    /*
1689 		     * either we need the atom at least once or there
1690 		     * is an atom->start0 allowing to easily plug the
1691 		     * epsilon transition.
1692 		     */
1693 		    counter = xmlRegGetCounter(ctxt);
1694 		    ctxt->counters[counter].min = atom->min - 1;
1695 		    ctxt->counters[counter].max = atom->max - 1;
1696 		    /* count the number of times we see it again */
1697 		    xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
1698 						   atom->start, counter);
1699 		    /* allow a way out based on the count */
1700 		    xmlFAGenerateCountedTransition(ctxt, atom->stop,
1701 			                           newstate, counter);
1702 		    /* and if needed allow a direct exit for 0 */
1703 		    if (atom->min == 0)
1704 			xmlFAGenerateEpsilonTransition(ctxt, atom->start0,
1705 						       newstate);
1706 
1707 		}
1708 		atom->min = 0;
1709 		atom->max = 0;
1710 		atom->quant = XML_REGEXP_QUANT_ONCE;
1711 		ctxt->state = newstate;
1712 	    }
1713 	    default:
1714 		break;
1715 	}
1716 	return(0);
1717     }
1718     if ((atom->min == 0) && (atom->max == 0) &&
1719                (atom->quant == XML_REGEXP_QUANT_RANGE)) {
1720         /*
1721 	 * we can discard the atom and generate an epsilon transition instead
1722 	 */
1723 	if (to == NULL) {
1724 	    to = xmlRegNewState(ctxt);
1725 	    if (to != NULL)
1726 		xmlRegStatePush(ctxt, to);
1727 	    else {
1728 		return(-1);
1729 	    }
1730 	}
1731 	xmlFAGenerateEpsilonTransition(ctxt, from, to);
1732 	ctxt->state = to;
1733 	xmlRegFreeAtom(atom);
1734 	return(0);
1735     }
1736     if (to == NULL) {
1737 	to = xmlRegNewState(ctxt);
1738 	if (to != NULL)
1739 	    xmlRegStatePush(ctxt, to);
1740 	else {
1741 	    return(-1);
1742 	}
1743     }
1744     end = to;
1745     if ((atom->quant == XML_REGEXP_QUANT_MULT) ||
1746         (atom->quant == XML_REGEXP_QUANT_PLUS)) {
1747 	/*
1748 	 * Do not pollute the target state by adding transitions from
1749 	 * it as it is likely to be the shared target of multiple branches.
1750 	 * So isolate with an epsilon transition.
1751 	 */
1752         xmlRegStatePtr tmp;
1753 
1754 	tmp = xmlRegNewState(ctxt);
1755 	if (tmp != NULL)
1756 	    xmlRegStatePush(ctxt, tmp);
1757 	else {
1758 	    return(-1);
1759 	}
1760 	xmlFAGenerateEpsilonTransition(ctxt, tmp, to);
1761 	to = tmp;
1762     }
1763     if (xmlRegAtomPush(ctxt, atom) < 0) {
1764 	return(-1);
1765     }
1766     if ((atom->quant == XML_REGEXP_QUANT_RANGE) &&
1767         (atom->min == 0) && (atom->max > 0)) {
1768 	nullable = 1;
1769 	atom->min = 1;
1770         if (atom->max == 1)
1771 	    atom->quant = XML_REGEXP_QUANT_OPT;
1772     }
1773     xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
1774     ctxt->state = end;
1775     switch (atom->quant) {
1776 	case XML_REGEXP_QUANT_OPT:
1777 	    atom->quant = XML_REGEXP_QUANT_ONCE;
1778 	    xmlFAGenerateEpsilonTransition(ctxt, from, to);
1779 	    break;
1780 	case XML_REGEXP_QUANT_MULT:
1781 	    atom->quant = XML_REGEXP_QUANT_ONCE;
1782 	    xmlFAGenerateEpsilonTransition(ctxt, from, to);
1783 	    xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1784 	    break;
1785 	case XML_REGEXP_QUANT_PLUS:
1786 	    atom->quant = XML_REGEXP_QUANT_ONCE;
1787 	    xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1788 	    break;
1789 	case XML_REGEXP_QUANT_RANGE:
1790 	    if (nullable)
1791 		xmlFAGenerateEpsilonTransition(ctxt, from, to);
1792 	    break;
1793 	default:
1794 	    break;
1795     }
1796     return(0);
1797 }
1798 
1799 /**
1800  * xmlFAReduceEpsilonTransitions:
1801  * @ctxt:  a regexp parser context
1802  * @fromnr:  the from state
1803  * @tonr:  the to state
1804  * @counter:  should that transition be associated to a counted
1805  *
1806  */
1807 static void
xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt,int fromnr,int tonr,int counter)1808 xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
1809 	                      int tonr, int counter) {
1810     int transnr;
1811     xmlRegStatePtr from;
1812     xmlRegStatePtr to;
1813 
1814 #ifdef DEBUG_REGEXP_GRAPH
1815     printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr);
1816 #endif
1817     from = ctxt->states[fromnr];
1818     if (from == NULL)
1819 	return;
1820     to = ctxt->states[tonr];
1821     if (to == NULL)
1822 	return;
1823     if ((to->mark == XML_REGEXP_MARK_START) ||
1824 	(to->mark == XML_REGEXP_MARK_VISITED))
1825 	return;
1826 
1827     to->mark = XML_REGEXP_MARK_VISITED;
1828     if (to->type == XML_REGEXP_FINAL_STATE) {
1829 #ifdef DEBUG_REGEXP_GRAPH
1830 	printf("State %d is final, so %d becomes final\n", tonr, fromnr);
1831 #endif
1832 	from->type = XML_REGEXP_FINAL_STATE;
1833     }
1834     for (transnr = 0;transnr < to->nbTrans;transnr++) {
1835         if (to->trans[transnr].to < 0)
1836 	    continue;
1837 	if (to->trans[transnr].atom == NULL) {
1838 	    /*
1839 	     * Don't remove counted transitions
1840 	     * Don't loop either
1841 	     */
1842 	    if (to->trans[transnr].to != fromnr) {
1843 		if (to->trans[transnr].count >= 0) {
1844 		    int newto = to->trans[transnr].to;
1845 
1846 		    xmlRegStateAddTrans(ctxt, from, NULL,
1847 					ctxt->states[newto],
1848 					-1, to->trans[transnr].count);
1849 		} else {
1850 #ifdef DEBUG_REGEXP_GRAPH
1851 		    printf("Found epsilon trans %d from %d to %d\n",
1852 			   transnr, tonr, to->trans[transnr].to);
1853 #endif
1854 		    if (to->trans[transnr].counter >= 0) {
1855 			xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1856 					      to->trans[transnr].to,
1857 					      to->trans[transnr].counter);
1858 		    } else {
1859 			xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1860 					      to->trans[transnr].to,
1861 					      counter);
1862 		    }
1863 		}
1864 	    }
1865 	} else {
1866 	    int newto = to->trans[transnr].to;
1867 
1868 	    if (to->trans[transnr].counter >= 0) {
1869 		xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1870 				    ctxt->states[newto],
1871 				    to->trans[transnr].counter, -1);
1872 	    } else {
1873 		xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1874 				    ctxt->states[newto], counter, -1);
1875 	    }
1876 	}
1877     }
1878     to->mark = XML_REGEXP_MARK_NORMAL;
1879 }
1880 
1881 /**
1882  * xmlFAEliminateSimpleEpsilonTransitions:
1883  * @ctxt:  a regexp parser context
1884  *
1885  * Eliminating general epsilon transitions can get costly in the general
1886  * algorithm due to the large amount of generated new transitions and
1887  * associated comparisons. However for simple epsilon transition used just
1888  * to separate building blocks when generating the automata this can be
1889  * reduced to state elimination:
1890  *    - if there exists an epsilon from X to Y
1891  *    - if there is no other transition from X
1892  * then X and Y are semantically equivalent and X can be eliminated
1893  * If X is the start state then make Y the start state, else replace the
1894  * target of all transitions to X by transitions to Y.
1895  *
1896  * If X is a final state, skip it.
1897  * Otherwise it would be necessary to manipulate counters for this case when
1898  * eliminating state 2:
1899  * State 1 has a transition with an atom to state 2.
1900  * State 2 is final and has an epsilon transition to state 1.
1901  */
1902 static void
xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt)1903 xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1904     int statenr, i, j, newto;
1905     xmlRegStatePtr state, tmp;
1906 
1907     for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1908 	state = ctxt->states[statenr];
1909 	if (state == NULL)
1910 	    continue;
1911 	if (state->nbTrans != 1)
1912 	    continue;
1913        if (state->type == XML_REGEXP_UNREACH_STATE ||
1914            state->type == XML_REGEXP_FINAL_STATE)
1915 	    continue;
1916 	/* is the only transition out a basic transition */
1917 	if ((state->trans[0].atom == NULL) &&
1918 	    (state->trans[0].to >= 0) &&
1919 	    (state->trans[0].to != statenr) &&
1920 	    (state->trans[0].counter < 0) &&
1921 	    (state->trans[0].count < 0)) {
1922 	    newto = state->trans[0].to;
1923 
1924             if (state->type == XML_REGEXP_START_STATE) {
1925 #ifdef DEBUG_REGEXP_GRAPH
1926 		printf("Found simple epsilon trans from start %d to %d\n",
1927 		       statenr, newto);
1928 #endif
1929             } else {
1930 #ifdef DEBUG_REGEXP_GRAPH
1931 		printf("Found simple epsilon trans from %d to %d\n",
1932 		       statenr, newto);
1933 #endif
1934 	        for (i = 0;i < state->nbTransTo;i++) {
1935 		    tmp = ctxt->states[state->transTo[i]];
1936 		    for (j = 0;j < tmp->nbTrans;j++) {
1937 			if (tmp->trans[j].to == statenr) {
1938 #ifdef DEBUG_REGEXP_GRAPH
1939 			    printf("Changed transition %d on %d to go to %d\n",
1940 				   j, tmp->no, newto);
1941 #endif
1942 			    tmp->trans[j].to = -1;
1943 			    xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom,
1944 						ctxt->states[newto],
1945 					        tmp->trans[j].counter,
1946 						tmp->trans[j].count);
1947 			}
1948 		    }
1949 		}
1950 		if (state->type == XML_REGEXP_FINAL_STATE)
1951 		    ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE;
1952 		/* eliminate the transition completely */
1953 		state->nbTrans = 0;
1954 
1955                 state->type = XML_REGEXP_UNREACH_STATE;
1956 
1957 	    }
1958 
1959 	}
1960     }
1961 }
1962 /**
1963  * xmlFAEliminateEpsilonTransitions:
1964  * @ctxt:  a regexp parser context
1965  *
1966  */
1967 static void
xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt)1968 xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1969     int statenr, transnr;
1970     xmlRegStatePtr state;
1971     int has_epsilon;
1972 
1973     if (ctxt->states == NULL) return;
1974 
1975     /*
1976      * Eliminate simple epsilon transition and the associated unreachable
1977      * states.
1978      */
1979     xmlFAEliminateSimpleEpsilonTransitions(ctxt);
1980     for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1981 	state = ctxt->states[statenr];
1982 	if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) {
1983 #ifdef DEBUG_REGEXP_GRAPH
1984 	    printf("Removed unreachable state %d\n", statenr);
1985 #endif
1986 	    xmlRegFreeState(state);
1987 	    ctxt->states[statenr] = NULL;
1988 	}
1989     }
1990 
1991     has_epsilon = 0;
1992 
1993     /*
1994      * Build the completed transitions bypassing the epsilons
1995      * Use a marking algorithm to avoid loops
1996      * Mark sink states too.
1997      * Process from the latest states backward to the start when
1998      * there is long cascading epsilon chains this minimize the
1999      * recursions and transition compares when adding the new ones
2000      */
2001     for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) {
2002 	state = ctxt->states[statenr];
2003 	if (state == NULL)
2004 	    continue;
2005 	if ((state->nbTrans == 0) &&
2006 	    (state->type != XML_REGEXP_FINAL_STATE)) {
2007 	    state->type = XML_REGEXP_SINK_STATE;
2008 	}
2009 	for (transnr = 0;transnr < state->nbTrans;transnr++) {
2010 	    if ((state->trans[transnr].atom == NULL) &&
2011 		(state->trans[transnr].to >= 0)) {
2012 		if (state->trans[transnr].to == statenr) {
2013 		    state->trans[transnr].to = -1;
2014 #ifdef DEBUG_REGEXP_GRAPH
2015 		    printf("Removed loopback epsilon trans %d on %d\n",
2016 			   transnr, statenr);
2017 #endif
2018 		} else if (state->trans[transnr].count < 0) {
2019 		    int newto = state->trans[transnr].to;
2020 
2021 #ifdef DEBUG_REGEXP_GRAPH
2022 		    printf("Found epsilon trans %d from %d to %d\n",
2023 			   transnr, statenr, newto);
2024 #endif
2025 		    has_epsilon = 1;
2026 		    state->trans[transnr].to = -2;
2027 		    state->mark = XML_REGEXP_MARK_START;
2028 		    xmlFAReduceEpsilonTransitions(ctxt, statenr,
2029 				      newto, state->trans[transnr].counter);
2030 		    state->mark = XML_REGEXP_MARK_NORMAL;
2031 #ifdef DEBUG_REGEXP_GRAPH
2032 		} else {
2033 		    printf("Found counted transition %d on %d\n",
2034 			   transnr, statenr);
2035 #endif
2036 	        }
2037 	    }
2038 	}
2039     }
2040     /*
2041      * Eliminate the epsilon transitions
2042      */
2043     if (has_epsilon) {
2044 	for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2045 	    state = ctxt->states[statenr];
2046 	    if (state == NULL)
2047 		continue;
2048 	    for (transnr = 0;transnr < state->nbTrans;transnr++) {
2049 		xmlRegTransPtr trans = &(state->trans[transnr]);
2050 		if ((trans->atom == NULL) &&
2051 		    (trans->count < 0) &&
2052 		    (trans->to >= 0)) {
2053 		    trans->to = -1;
2054 		}
2055 	    }
2056 	}
2057     }
2058 
2059     /*
2060      * Use this pass to detect unreachable states too
2061      */
2062     for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2063 	state = ctxt->states[statenr];
2064 	if (state != NULL)
2065 	    state->reached = XML_REGEXP_MARK_NORMAL;
2066     }
2067     state = ctxt->states[0];
2068     if (state != NULL)
2069 	state->reached = XML_REGEXP_MARK_START;
2070     while (state != NULL) {
2071 	xmlRegStatePtr target = NULL;
2072 	state->reached = XML_REGEXP_MARK_VISITED;
2073 	/*
2074 	 * Mark all states reachable from the current reachable state
2075 	 */
2076 	for (transnr = 0;transnr < state->nbTrans;transnr++) {
2077 	    if ((state->trans[transnr].to >= 0) &&
2078 		((state->trans[transnr].atom != NULL) ||
2079 		 (state->trans[transnr].count >= 0))) {
2080 		int newto = state->trans[transnr].to;
2081 
2082 		if (ctxt->states[newto] == NULL)
2083 		    continue;
2084 		if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) {
2085 		    ctxt->states[newto]->reached = XML_REGEXP_MARK_START;
2086 		    target = ctxt->states[newto];
2087 		}
2088 	    }
2089 	}
2090 
2091 	/*
2092 	 * find the next accessible state not explored
2093 	 */
2094 	if (target == NULL) {
2095 	    for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
2096 		state = ctxt->states[statenr];
2097 		if ((state != NULL) && (state->reached ==
2098 			XML_REGEXP_MARK_START)) {
2099 		    target = state;
2100 		    break;
2101 		}
2102 	    }
2103 	}
2104 	state = target;
2105     }
2106     for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2107 	state = ctxt->states[statenr];
2108 	if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) {
2109 #ifdef DEBUG_REGEXP_GRAPH
2110 	    printf("Removed unreachable state %d\n", statenr);
2111 #endif
2112 	    xmlRegFreeState(state);
2113 	    ctxt->states[statenr] = NULL;
2114 	}
2115     }
2116 
2117 }
2118 
2119 static int
xmlFACompareRanges(xmlRegRangePtr range1,xmlRegRangePtr range2)2120 xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) {
2121     int ret = 0;
2122 
2123     if ((range1->type == XML_REGEXP_RANGES) ||
2124         (range2->type == XML_REGEXP_RANGES) ||
2125         (range2->type == XML_REGEXP_SUBREG) ||
2126         (range1->type == XML_REGEXP_SUBREG) ||
2127         (range1->type == XML_REGEXP_STRING) ||
2128         (range2->type == XML_REGEXP_STRING))
2129 	return(-1);
2130 
2131     /* put them in order */
2132     if (range1->type > range2->type) {
2133         xmlRegRangePtr tmp;
2134 
2135 	tmp = range1;
2136 	range1 = range2;
2137 	range2 = tmp;
2138     }
2139     if ((range1->type == XML_REGEXP_ANYCHAR) ||
2140         (range2->type == XML_REGEXP_ANYCHAR)) {
2141 	ret = 1;
2142     } else if ((range1->type == XML_REGEXP_EPSILON) ||
2143                (range2->type == XML_REGEXP_EPSILON)) {
2144 	return(0);
2145     } else if (range1->type == range2->type) {
2146         if (range1->type != XML_REGEXP_CHARVAL)
2147             ret = 1;
2148         else if ((range1->end < range2->start) ||
2149 	         (range2->end < range1->start))
2150 	    ret = 0;
2151 	else
2152 	    ret = 1;
2153     } else if (range1->type == XML_REGEXP_CHARVAL) {
2154         int codepoint;
2155 	int neg = 0;
2156 
2157 	/*
2158 	 * just check all codepoints in the range for acceptance,
2159 	 * this is usually way cheaper since done only once at
2160 	 * compilation than testing over and over at runtime or
2161 	 * pushing too many states when evaluating.
2162 	 */
2163 	if (((range1->neg == 0) && (range2->neg != 0)) ||
2164 	    ((range1->neg != 0) && (range2->neg == 0)))
2165 	    neg = 1;
2166 
2167 	for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) {
2168 	    ret = xmlRegCheckCharacterRange(range2->type, codepoint,
2169 					    0, range2->start, range2->end,
2170 					    range2->blockName);
2171 	    if (ret < 0)
2172 	        return(-1);
2173 	    if (((neg == 1) && (ret == 0)) ||
2174 	        ((neg == 0) && (ret == 1)))
2175 		return(1);
2176 	}
2177 	return(0);
2178     } else if ((range1->type == XML_REGEXP_BLOCK_NAME) ||
2179                (range2->type == XML_REGEXP_BLOCK_NAME)) {
2180 	if (range1->type == range2->type) {
2181 	    ret = xmlStrEqual(range1->blockName, range2->blockName);
2182 	} else {
2183 	    /*
2184 	     * comparing a block range with anything else is way
2185 	     * too costly, and maintaining the table is like too much
2186 	     * memory too, so let's force the automata to save state
2187 	     * here.
2188 	     */
2189 	    return(1);
2190 	}
2191     } else if ((range1->type < XML_REGEXP_LETTER) ||
2192                (range2->type < XML_REGEXP_LETTER)) {
2193 	if ((range1->type == XML_REGEXP_ANYSPACE) &&
2194 	    (range2->type == XML_REGEXP_NOTSPACE))
2195 	    ret = 0;
2196 	else if ((range1->type == XML_REGEXP_INITNAME) &&
2197 	         (range2->type == XML_REGEXP_NOTINITNAME))
2198 	    ret = 0;
2199 	else if ((range1->type == XML_REGEXP_NAMECHAR) &&
2200 	         (range2->type == XML_REGEXP_NOTNAMECHAR))
2201 	    ret = 0;
2202 	else if ((range1->type == XML_REGEXP_DECIMAL) &&
2203 	         (range2->type == XML_REGEXP_NOTDECIMAL))
2204 	    ret = 0;
2205 	else if ((range1->type == XML_REGEXP_REALCHAR) &&
2206 	         (range2->type == XML_REGEXP_NOTREALCHAR))
2207 	    ret = 0;
2208 	else {
2209 	    /* same thing to limit complexity */
2210 	    return(1);
2211 	}
2212     } else {
2213         ret = 0;
2214         /* range1->type < range2->type here */
2215         switch (range1->type) {
2216 	    case XML_REGEXP_LETTER:
2217 	         /* all disjoint except in the subgroups */
2218 	         if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) ||
2219 		     (range2->type == XML_REGEXP_LETTER_LOWERCASE) ||
2220 		     (range2->type == XML_REGEXP_LETTER_TITLECASE) ||
2221 		     (range2->type == XML_REGEXP_LETTER_MODIFIER) ||
2222 		     (range2->type == XML_REGEXP_LETTER_OTHERS))
2223 		     ret = 1;
2224 		 break;
2225 	    case XML_REGEXP_MARK:
2226 	         if ((range2->type == XML_REGEXP_MARK_NONSPACING) ||
2227 		     (range2->type == XML_REGEXP_MARK_SPACECOMBINING) ||
2228 		     (range2->type == XML_REGEXP_MARK_ENCLOSING))
2229 		     ret = 1;
2230 		 break;
2231 	    case XML_REGEXP_NUMBER:
2232 	         if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) ||
2233 		     (range2->type == XML_REGEXP_NUMBER_LETTER) ||
2234 		     (range2->type == XML_REGEXP_NUMBER_OTHERS))
2235 		     ret = 1;
2236 		 break;
2237 	    case XML_REGEXP_PUNCT:
2238 	         if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) ||
2239 		     (range2->type == XML_REGEXP_PUNCT_DASH) ||
2240 		     (range2->type == XML_REGEXP_PUNCT_OPEN) ||
2241 		     (range2->type == XML_REGEXP_PUNCT_CLOSE) ||
2242 		     (range2->type == XML_REGEXP_PUNCT_INITQUOTE) ||
2243 		     (range2->type == XML_REGEXP_PUNCT_FINQUOTE) ||
2244 		     (range2->type == XML_REGEXP_PUNCT_OTHERS))
2245 		     ret = 1;
2246 		 break;
2247 	    case XML_REGEXP_SEPAR:
2248 	         if ((range2->type == XML_REGEXP_SEPAR_SPACE) ||
2249 		     (range2->type == XML_REGEXP_SEPAR_LINE) ||
2250 		     (range2->type == XML_REGEXP_SEPAR_PARA))
2251 		     ret = 1;
2252 		 break;
2253 	    case XML_REGEXP_SYMBOL:
2254 	         if ((range2->type == XML_REGEXP_SYMBOL_MATH) ||
2255 		     (range2->type == XML_REGEXP_SYMBOL_CURRENCY) ||
2256 		     (range2->type == XML_REGEXP_SYMBOL_MODIFIER) ||
2257 		     (range2->type == XML_REGEXP_SYMBOL_OTHERS))
2258 		     ret = 1;
2259 		 break;
2260 	    case XML_REGEXP_OTHER:
2261 	         if ((range2->type == XML_REGEXP_OTHER_CONTROL) ||
2262 		     (range2->type == XML_REGEXP_OTHER_FORMAT) ||
2263 		     (range2->type == XML_REGEXP_OTHER_PRIVATE))
2264 		     ret = 1;
2265 		 break;
2266             default:
2267 	         if ((range2->type >= XML_REGEXP_LETTER) &&
2268 		     (range2->type < XML_REGEXP_BLOCK_NAME))
2269 		     ret = 0;
2270 		 else {
2271 		     /* safety net ! */
2272 		     return(1);
2273 		 }
2274 	}
2275     }
2276     if (((range1->neg == 0) && (range2->neg != 0)) ||
2277         ((range1->neg != 0) && (range2->neg == 0)))
2278 	ret = !ret;
2279     return(ret);
2280 }
2281 
2282 /**
2283  * xmlFACompareAtomTypes:
2284  * @type1:  an atom type
2285  * @type2:  an atom type
2286  *
2287  * Compares two atoms type to check whether they intersect in some ways,
2288  * this is used by xmlFACompareAtoms only
2289  *
2290  * Returns 1 if they may intersect and 0 otherwise
2291  */
2292 static int
xmlFACompareAtomTypes(xmlRegAtomType type1,xmlRegAtomType type2)2293 xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) {
2294     if ((type1 == XML_REGEXP_EPSILON) ||
2295         (type1 == XML_REGEXP_CHARVAL) ||
2296 	(type1 == XML_REGEXP_RANGES) ||
2297 	(type1 == XML_REGEXP_SUBREG) ||
2298 	(type1 == XML_REGEXP_STRING) ||
2299 	(type1 == XML_REGEXP_ANYCHAR))
2300 	return(1);
2301     if ((type2 == XML_REGEXP_EPSILON) ||
2302         (type2 == XML_REGEXP_CHARVAL) ||
2303 	(type2 == XML_REGEXP_RANGES) ||
2304 	(type2 == XML_REGEXP_SUBREG) ||
2305 	(type2 == XML_REGEXP_STRING) ||
2306 	(type2 == XML_REGEXP_ANYCHAR))
2307 	return(1);
2308 
2309     if (type1 == type2) return(1);
2310 
2311     /* simplify subsequent compares by making sure type1 < type2 */
2312     if (type1 > type2) {
2313         xmlRegAtomType tmp = type1;
2314 	type1 = type2;
2315 	type2 = tmp;
2316     }
2317     switch (type1) {
2318         case XML_REGEXP_ANYSPACE: /* \s */
2319 	    /* can't be a letter, number, mark, punctuation, symbol */
2320 	    if ((type2 == XML_REGEXP_NOTSPACE) ||
2321 		((type2 >= XML_REGEXP_LETTER) &&
2322 		 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2323 	        ((type2 >= XML_REGEXP_NUMBER) &&
2324 		 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2325 	        ((type2 >= XML_REGEXP_MARK) &&
2326 		 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2327 	        ((type2 >= XML_REGEXP_PUNCT) &&
2328 		 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2329 	        ((type2 >= XML_REGEXP_SYMBOL) &&
2330 		 (type2 <= XML_REGEXP_SYMBOL_OTHERS))
2331 	        ) return(0);
2332 	    break;
2333         case XML_REGEXP_NOTSPACE: /* \S */
2334 	    break;
2335         case XML_REGEXP_INITNAME: /* \l */
2336 	    /* can't be a number, mark, separator, punctuation, symbol or other */
2337 	    if ((type2 == XML_REGEXP_NOTINITNAME) ||
2338 	        ((type2 >= XML_REGEXP_NUMBER) &&
2339 		 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2340 	        ((type2 >= XML_REGEXP_MARK) &&
2341 		 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2342 	        ((type2 >= XML_REGEXP_SEPAR) &&
2343 		 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2344 	        ((type2 >= XML_REGEXP_PUNCT) &&
2345 		 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2346 	        ((type2 >= XML_REGEXP_SYMBOL) &&
2347 		 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2348 	        ((type2 >= XML_REGEXP_OTHER) &&
2349 		 (type2 <= XML_REGEXP_OTHER_NA))
2350 		) return(0);
2351 	    break;
2352         case XML_REGEXP_NOTINITNAME: /* \L */
2353 	    break;
2354         case XML_REGEXP_NAMECHAR: /* \c */
2355 	    /* can't be a mark, separator, punctuation, symbol or other */
2356 	    if ((type2 == XML_REGEXP_NOTNAMECHAR) ||
2357 	        ((type2 >= XML_REGEXP_MARK) &&
2358 		 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2359 	        ((type2 >= XML_REGEXP_PUNCT) &&
2360 		 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2361 	        ((type2 >= XML_REGEXP_SEPAR) &&
2362 		 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2363 	        ((type2 >= XML_REGEXP_SYMBOL) &&
2364 		 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2365 	        ((type2 >= XML_REGEXP_OTHER) &&
2366 		 (type2 <= XML_REGEXP_OTHER_NA))
2367 		) return(0);
2368 	    break;
2369         case XML_REGEXP_NOTNAMECHAR: /* \C */
2370 	    break;
2371         case XML_REGEXP_DECIMAL: /* \d */
2372 	    /* can't be a letter, mark, separator, punctuation, symbol or other */
2373 	    if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2374 	        (type2 == XML_REGEXP_REALCHAR) ||
2375 		((type2 >= XML_REGEXP_LETTER) &&
2376 		 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2377 	        ((type2 >= XML_REGEXP_MARK) &&
2378 		 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2379 	        ((type2 >= XML_REGEXP_PUNCT) &&
2380 		 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2381 	        ((type2 >= XML_REGEXP_SEPAR) &&
2382 		 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2383 	        ((type2 >= XML_REGEXP_SYMBOL) &&
2384 		 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2385 	        ((type2 >= XML_REGEXP_OTHER) &&
2386 		 (type2 <= XML_REGEXP_OTHER_NA))
2387 		)return(0);
2388 	    break;
2389         case XML_REGEXP_NOTDECIMAL: /* \D */
2390 	    break;
2391         case XML_REGEXP_REALCHAR: /* \w */
2392 	    /* can't be a mark, separator, punctuation, symbol or other */
2393 	    if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2394 	        ((type2 >= XML_REGEXP_MARK) &&
2395 		 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2396 	        ((type2 >= XML_REGEXP_PUNCT) &&
2397 		 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2398 	        ((type2 >= XML_REGEXP_SEPAR) &&
2399 		 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2400 	        ((type2 >= XML_REGEXP_SYMBOL) &&
2401 		 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2402 	        ((type2 >= XML_REGEXP_OTHER) &&
2403 		 (type2 <= XML_REGEXP_OTHER_NA))
2404 		)return(0);
2405 	    break;
2406         case XML_REGEXP_NOTREALCHAR: /* \W */
2407 	    break;
2408 	/*
2409 	 * at that point we know both type 1 and type2 are from
2410 	 * character categories are ordered and are different,
2411 	 * it becomes simple because this is a partition
2412 	 */
2413         case XML_REGEXP_LETTER:
2414 	    if (type2 <= XML_REGEXP_LETTER_OTHERS)
2415 	        return(1);
2416 	    return(0);
2417         case XML_REGEXP_LETTER_UPPERCASE:
2418         case XML_REGEXP_LETTER_LOWERCASE:
2419         case XML_REGEXP_LETTER_TITLECASE:
2420         case XML_REGEXP_LETTER_MODIFIER:
2421         case XML_REGEXP_LETTER_OTHERS:
2422 	    return(0);
2423         case XML_REGEXP_MARK:
2424 	    if (type2 <= XML_REGEXP_MARK_ENCLOSING)
2425 	        return(1);
2426 	    return(0);
2427         case XML_REGEXP_MARK_NONSPACING:
2428         case XML_REGEXP_MARK_SPACECOMBINING:
2429         case XML_REGEXP_MARK_ENCLOSING:
2430 	    return(0);
2431         case XML_REGEXP_NUMBER:
2432 	    if (type2 <= XML_REGEXP_NUMBER_OTHERS)
2433 	        return(1);
2434 	    return(0);
2435         case XML_REGEXP_NUMBER_DECIMAL:
2436         case XML_REGEXP_NUMBER_LETTER:
2437         case XML_REGEXP_NUMBER_OTHERS:
2438 	    return(0);
2439         case XML_REGEXP_PUNCT:
2440 	    if (type2 <= XML_REGEXP_PUNCT_OTHERS)
2441 	        return(1);
2442 	    return(0);
2443         case XML_REGEXP_PUNCT_CONNECTOR:
2444         case XML_REGEXP_PUNCT_DASH:
2445         case XML_REGEXP_PUNCT_OPEN:
2446         case XML_REGEXP_PUNCT_CLOSE:
2447         case XML_REGEXP_PUNCT_INITQUOTE:
2448         case XML_REGEXP_PUNCT_FINQUOTE:
2449         case XML_REGEXP_PUNCT_OTHERS:
2450 	    return(0);
2451         case XML_REGEXP_SEPAR:
2452 	    if (type2 <= XML_REGEXP_SEPAR_PARA)
2453 	        return(1);
2454 	    return(0);
2455         case XML_REGEXP_SEPAR_SPACE:
2456         case XML_REGEXP_SEPAR_LINE:
2457         case XML_REGEXP_SEPAR_PARA:
2458 	    return(0);
2459         case XML_REGEXP_SYMBOL:
2460 	    if (type2 <= XML_REGEXP_SYMBOL_OTHERS)
2461 	        return(1);
2462 	    return(0);
2463         case XML_REGEXP_SYMBOL_MATH:
2464         case XML_REGEXP_SYMBOL_CURRENCY:
2465         case XML_REGEXP_SYMBOL_MODIFIER:
2466         case XML_REGEXP_SYMBOL_OTHERS:
2467 	    return(0);
2468         case XML_REGEXP_OTHER:
2469 	    if (type2 <= XML_REGEXP_OTHER_NA)
2470 	        return(1);
2471 	    return(0);
2472         case XML_REGEXP_OTHER_CONTROL:
2473         case XML_REGEXP_OTHER_FORMAT:
2474         case XML_REGEXP_OTHER_PRIVATE:
2475         case XML_REGEXP_OTHER_NA:
2476 	    return(0);
2477 	default:
2478 	    break;
2479     }
2480     return(1);
2481 }
2482 
2483 /**
2484  * xmlFAEqualAtoms:
2485  * @atom1:  an atom
2486  * @atom2:  an atom
2487  * @deep: if not set only compare string pointers
2488  *
2489  * Compares two atoms to check whether they are the same exactly
2490  * this is used to remove equivalent transitions
2491  *
2492  * Returns 1 if same and 0 otherwise
2493  */
2494 static int
xmlFAEqualAtoms(xmlRegAtomPtr atom1,xmlRegAtomPtr atom2,int deep)2495 xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
2496     int ret = 0;
2497 
2498     if (atom1 == atom2)
2499 	return(1);
2500     if ((atom1 == NULL) || (atom2 == NULL))
2501 	return(0);
2502 
2503     if (atom1->type != atom2->type)
2504         return(0);
2505     switch (atom1->type) {
2506         case XML_REGEXP_EPSILON:
2507 	    ret = 0;
2508 	    break;
2509         case XML_REGEXP_STRING:
2510             if (!deep)
2511                 ret = (atom1->valuep == atom2->valuep);
2512             else
2513                 ret = xmlStrEqual((xmlChar *)atom1->valuep,
2514                                   (xmlChar *)atom2->valuep);
2515 	    break;
2516         case XML_REGEXP_CHARVAL:
2517 	    ret = (atom1->codepoint == atom2->codepoint);
2518 	    break;
2519 	case XML_REGEXP_RANGES:
2520 	    /* too hard to do in the general case */
2521 	    ret = 0;
2522 	default:
2523 	    break;
2524     }
2525     return(ret);
2526 }
2527 
2528 /**
2529  * xmlFACompareAtoms:
2530  * @atom1:  an atom
2531  * @atom2:  an atom
2532  * @deep: if not set only compare string pointers
2533  *
2534  * Compares two atoms to check whether they intersect in some ways,
2535  * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only
2536  *
2537  * Returns 1 if yes and 0 otherwise
2538  */
2539 static int
xmlFACompareAtoms(xmlRegAtomPtr atom1,xmlRegAtomPtr atom2,int deep)2540 xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
2541     int ret = 1;
2542 
2543     if (atom1 == atom2)
2544 	return(1);
2545     if ((atom1 == NULL) || (atom2 == NULL))
2546 	return(0);
2547 
2548     if ((atom1->type == XML_REGEXP_ANYCHAR) ||
2549         (atom2->type == XML_REGEXP_ANYCHAR))
2550 	return(1);
2551 
2552     if (atom1->type > atom2->type) {
2553 	xmlRegAtomPtr tmp;
2554 	tmp = atom1;
2555 	atom1 = atom2;
2556 	atom2 = tmp;
2557     }
2558     if (atom1->type != atom2->type) {
2559         ret = xmlFACompareAtomTypes(atom1->type, atom2->type);
2560 	/* if they can't intersect at the type level break now */
2561 	if (ret == 0)
2562 	    return(0);
2563     }
2564     switch (atom1->type) {
2565         case XML_REGEXP_STRING:
2566             if (!deep)
2567                 ret = (atom1->valuep != atom2->valuep);
2568             else {
2569                 xmlChar *val1 = (xmlChar *)atom1->valuep;
2570                 xmlChar *val2 = (xmlChar *)atom2->valuep;
2571                 int compound1 = (xmlStrchr(val1, '|') != NULL);
2572                 int compound2 = (xmlStrchr(val2, '|') != NULL);
2573 
2574                 /* Ignore negative match flag for ##other namespaces */
2575                 if (compound1 != compound2)
2576                     return(0);
2577 
2578                 ret = xmlRegStrEqualWildcard(val1, val2);
2579             }
2580 	    break;
2581         case XML_REGEXP_EPSILON:
2582 	    goto not_determinist;
2583         case XML_REGEXP_CHARVAL:
2584 	    if (atom2->type == XML_REGEXP_CHARVAL) {
2585 		ret = (atom1->codepoint == atom2->codepoint);
2586 	    } else {
2587 	        ret = xmlRegCheckCharacter(atom2, atom1->codepoint);
2588 		if (ret < 0)
2589 		    ret = 1;
2590 	    }
2591 	    break;
2592         case XML_REGEXP_RANGES:
2593 	    if (atom2->type == XML_REGEXP_RANGES) {
2594 	        int i, j, res;
2595 		xmlRegRangePtr r1, r2;
2596 
2597 		/*
2598 		 * need to check that none of the ranges eventually matches
2599 		 */
2600 		for (i = 0;i < atom1->nbRanges;i++) {
2601 		    for (j = 0;j < atom2->nbRanges;j++) {
2602 			r1 = atom1->ranges[i];
2603 			r2 = atom2->ranges[j];
2604 			res = xmlFACompareRanges(r1, r2);
2605 			if (res == 1) {
2606 			    ret = 1;
2607 			    goto done;
2608 			}
2609 		    }
2610 		}
2611 		ret = 0;
2612 	    }
2613 	    break;
2614 	default:
2615 	    goto not_determinist;
2616     }
2617 done:
2618     if (atom1->neg != atom2->neg) {
2619         ret = !ret;
2620     }
2621     if (ret == 0)
2622         return(0);
2623 not_determinist:
2624     return(1);
2625 }
2626 
2627 /**
2628  * xmlFARecurseDeterminism:
2629  * @ctxt:  a regexp parser context
2630  *
2631  * Check whether the associated regexp is determinist,
2632  * should be called after xmlFAEliminateEpsilonTransitions()
2633  *
2634  */
2635 static int
xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr state,int to,xmlRegAtomPtr atom)2636 xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
2637 	                 int to, xmlRegAtomPtr atom) {
2638     int ret = 1;
2639     int res;
2640     int transnr, nbTrans;
2641     xmlRegTransPtr t1;
2642     int deep = 1;
2643 
2644     if (state == NULL)
2645 	return(ret);
2646     if (state->markd == XML_REGEXP_MARK_VISITED)
2647 	return(ret);
2648 
2649     if (ctxt->flags & AM_AUTOMATA_RNG)
2650         deep = 0;
2651 
2652     /*
2653      * don't recurse on transitions potentially added in the course of
2654      * the elimination.
2655      */
2656     nbTrans = state->nbTrans;
2657     for (transnr = 0;transnr < nbTrans;transnr++) {
2658 	t1 = &(state->trans[transnr]);
2659 	/*
2660 	 * check transitions conflicting with the one looked at
2661 	 */
2662 	if (t1->atom == NULL) {
2663 	    if (t1->to < 0)
2664 		continue;
2665 	    state->markd = XML_REGEXP_MARK_VISITED;
2666 	    res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2667 		                           to, atom);
2668 	    if (res == 0) {
2669 	        ret = 0;
2670 		/* t1->nd = 1; */
2671 	    }
2672 	    continue;
2673 	}
2674 	if (t1->to != to)
2675 	    continue;
2676 	if (xmlFACompareAtoms(t1->atom, atom, deep)) {
2677 	    ret = 0;
2678 	    /* mark the transition as non-deterministic */
2679 	    t1->nd = 1;
2680 	}
2681     }
2682     return(ret);
2683 }
2684 
2685 /**
2686  * xmlFAFinishRecurseDeterminism:
2687  * @ctxt:  a regexp parser context
2688  *
2689  * Reset flags after checking determinism.
2690  */
2691 static void
xmlFAFinishRecurseDeterminism(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr state)2692 xmlFAFinishRecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
2693     int transnr, nbTrans;
2694 
2695     if (state == NULL)
2696 	return;
2697     if (state->markd != XML_REGEXP_MARK_VISITED)
2698 	return;
2699     state->markd = 0;
2700 
2701     nbTrans = state->nbTrans;
2702     for (transnr = 0; transnr < nbTrans; transnr++) {
2703 	xmlRegTransPtr t1 = &state->trans[transnr];
2704 	if ((t1->atom == NULL) && (t1->to >= 0))
2705 	    xmlFAFinishRecurseDeterminism(ctxt, ctxt->states[t1->to]);
2706     }
2707 }
2708 
2709 /**
2710  * xmlFAComputesDeterminism:
2711  * @ctxt:  a regexp parser context
2712  *
2713  * Check whether the associated regexp is determinist,
2714  * should be called after xmlFAEliminateEpsilonTransitions()
2715  *
2716  */
2717 static int
xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt)2718 xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
2719     int statenr, transnr;
2720     xmlRegStatePtr state;
2721     xmlRegTransPtr t1, t2, last;
2722     int i;
2723     int ret = 1;
2724     int deep = 1;
2725 
2726 #ifdef DEBUG_REGEXP_GRAPH
2727     printf("xmlFAComputesDeterminism\n");
2728     xmlRegPrintCtxt(stdout, ctxt);
2729 #endif
2730     if (ctxt->determinist != -1)
2731 	return(ctxt->determinist);
2732 
2733     if (ctxt->flags & AM_AUTOMATA_RNG)
2734         deep = 0;
2735 
2736     /*
2737      * First cleanup the automata removing cancelled transitions
2738      */
2739     for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2740 	state = ctxt->states[statenr];
2741 	if (state == NULL)
2742 	    continue;
2743 	if (state->nbTrans < 2)
2744 	    continue;
2745 	for (transnr = 0;transnr < state->nbTrans;transnr++) {
2746 	    t1 = &(state->trans[transnr]);
2747 	    /*
2748 	     * Determinism checks in case of counted or all transitions
2749 	     * will have to be handled separately
2750 	     */
2751 	    if (t1->atom == NULL) {
2752 		/* t1->nd = 1; */
2753 		continue;
2754 	    }
2755 	    if (t1->to == -1) /* eliminated */
2756 		continue;
2757 	    for (i = 0;i < transnr;i++) {
2758 		t2 = &(state->trans[i]);
2759 		if (t2->to == -1) /* eliminated */
2760 		    continue;
2761 		if (t2->atom != NULL) {
2762 		    if (t1->to == t2->to) {
2763                         /*
2764                          * Here we use deep because we want to keep the
2765                          * transitions which indicate a conflict
2766                          */
2767 			if (xmlFAEqualAtoms(t1->atom, t2->atom, deep) &&
2768                             (t1->counter == t2->counter) &&
2769                             (t1->count == t2->count))
2770 			    t2->to = -1; /* eliminated */
2771 		    }
2772 		}
2773 	    }
2774 	}
2775     }
2776 
2777     /*
2778      * Check for all states that there aren't 2 transitions
2779      * with the same atom and a different target.
2780      */
2781     for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2782 	state = ctxt->states[statenr];
2783 	if (state == NULL)
2784 	    continue;
2785 	if (state->nbTrans < 2)
2786 	    continue;
2787 	last = NULL;
2788 	for (transnr = 0;transnr < state->nbTrans;transnr++) {
2789 	    t1 = &(state->trans[transnr]);
2790 	    /*
2791 	     * Determinism checks in case of counted or all transitions
2792 	     * will have to be handled separately
2793 	     */
2794 	    if (t1->atom == NULL) {
2795 		continue;
2796 	    }
2797 	    if (t1->to == -1) /* eliminated */
2798 		continue;
2799 	    for (i = 0;i < transnr;i++) {
2800 		t2 = &(state->trans[i]);
2801 		if (t2->to == -1) /* eliminated */
2802 		    continue;
2803 		if (t2->atom != NULL) {
2804                     /*
2805                      * But here we don't use deep because we want to
2806                      * find transitions which indicate a conflict
2807                      */
2808 		    if (xmlFACompareAtoms(t1->atom, t2->atom, 1)) {
2809 			ret = 0;
2810 			/* mark the transitions as non-deterministic ones */
2811 			t1->nd = 1;
2812 			t2->nd = 1;
2813 			last = t1;
2814 		    }
2815 		} else if (t1->to != -1) {
2816 		    /*
2817 		     * do the closure in case of remaining specific
2818 		     * epsilon transitions like choices or all
2819 		     */
2820 		    ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2821 						   t2->to, t2->atom);
2822                     xmlFAFinishRecurseDeterminism(ctxt, ctxt->states[t1->to]);
2823 		    /* don't shortcut the computation so all non deterministic
2824 		       transition get marked down
2825 		    if (ret == 0)
2826 			return(0);
2827 		     */
2828 		    if (ret == 0) {
2829 			t1->nd = 1;
2830 			/* t2->nd = 1; */
2831 			last = t1;
2832 		    }
2833 		}
2834 	    }
2835 	    /* don't shortcut the computation so all non deterministic
2836 	       transition get marked down
2837 	    if (ret == 0)
2838 		break; */
2839 	}
2840 
2841 	/*
2842 	 * mark specifically the last non-deterministic transition
2843 	 * from a state since there is no need to set-up rollback
2844 	 * from it
2845 	 */
2846 	if (last != NULL) {
2847 	    last->nd = 2;
2848 	}
2849 
2850 	/* don't shortcut the computation so all non deterministic
2851 	   transition get marked down
2852 	if (ret == 0)
2853 	    break; */
2854     }
2855 
2856     ctxt->determinist = ret;
2857     return(ret);
2858 }
2859 
2860 /************************************************************************
2861  *									*
2862  *	Routines to check input against transition atoms		*
2863  *									*
2864  ************************************************************************/
2865 
2866 static int
xmlRegCheckCharacterRange(xmlRegAtomType type,int codepoint,int neg,int start,int end,const xmlChar * blockName)2867 xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
2868 	                  int start, int end, const xmlChar *blockName) {
2869     int ret = 0;
2870 
2871     switch (type) {
2872         case XML_REGEXP_STRING:
2873         case XML_REGEXP_SUBREG:
2874         case XML_REGEXP_RANGES:
2875         case XML_REGEXP_EPSILON:
2876 	    return(-1);
2877         case XML_REGEXP_ANYCHAR:
2878 	    ret = ((codepoint != '\n') && (codepoint != '\r'));
2879 	    break;
2880         case XML_REGEXP_CHARVAL:
2881 	    ret = ((codepoint >= start) && (codepoint <= end));
2882 	    break;
2883         case XML_REGEXP_NOTSPACE:
2884 	    neg = !neg;
2885             /* Falls through. */
2886         case XML_REGEXP_ANYSPACE:
2887 	    ret = ((codepoint == '\n') || (codepoint == '\r') ||
2888 		   (codepoint == '\t') || (codepoint == ' '));
2889 	    break;
2890         case XML_REGEXP_NOTINITNAME:
2891 	    neg = !neg;
2892             /* Falls through. */
2893         case XML_REGEXP_INITNAME:
2894 	    ret = (IS_LETTER(codepoint) ||
2895 		   (codepoint == '_') || (codepoint == ':'));
2896 	    break;
2897         case XML_REGEXP_NOTNAMECHAR:
2898 	    neg = !neg;
2899             /* Falls through. */
2900         case XML_REGEXP_NAMECHAR:
2901 	    ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) ||
2902 		   (codepoint == '.') || (codepoint == '-') ||
2903 		   (codepoint == '_') || (codepoint == ':') ||
2904 		   IS_COMBINING(codepoint) || IS_EXTENDER(codepoint));
2905 	    break;
2906         case XML_REGEXP_NOTDECIMAL:
2907 	    neg = !neg;
2908             /* Falls through. */
2909         case XML_REGEXP_DECIMAL:
2910 	    ret = xmlUCSIsCatNd(codepoint);
2911 	    break;
2912         case XML_REGEXP_REALCHAR:
2913 	    neg = !neg;
2914             /* Falls through. */
2915         case XML_REGEXP_NOTREALCHAR:
2916 	    ret = xmlUCSIsCatP(codepoint);
2917 	    if (ret == 0)
2918 		ret = xmlUCSIsCatZ(codepoint);
2919 	    if (ret == 0)
2920 		ret = xmlUCSIsCatC(codepoint);
2921 	    break;
2922         case XML_REGEXP_LETTER:
2923 	    ret = xmlUCSIsCatL(codepoint);
2924 	    break;
2925         case XML_REGEXP_LETTER_UPPERCASE:
2926 	    ret = xmlUCSIsCatLu(codepoint);
2927 	    break;
2928         case XML_REGEXP_LETTER_LOWERCASE:
2929 	    ret = xmlUCSIsCatLl(codepoint);
2930 	    break;
2931         case XML_REGEXP_LETTER_TITLECASE:
2932 	    ret = xmlUCSIsCatLt(codepoint);
2933 	    break;
2934         case XML_REGEXP_LETTER_MODIFIER:
2935 	    ret = xmlUCSIsCatLm(codepoint);
2936 	    break;
2937         case XML_REGEXP_LETTER_OTHERS:
2938 	    ret = xmlUCSIsCatLo(codepoint);
2939 	    break;
2940         case XML_REGEXP_MARK:
2941 	    ret = xmlUCSIsCatM(codepoint);
2942 	    break;
2943         case XML_REGEXP_MARK_NONSPACING:
2944 	    ret = xmlUCSIsCatMn(codepoint);
2945 	    break;
2946         case XML_REGEXP_MARK_SPACECOMBINING:
2947 	    ret = xmlUCSIsCatMc(codepoint);
2948 	    break;
2949         case XML_REGEXP_MARK_ENCLOSING:
2950 	    ret = xmlUCSIsCatMe(codepoint);
2951 	    break;
2952         case XML_REGEXP_NUMBER:
2953 	    ret = xmlUCSIsCatN(codepoint);
2954 	    break;
2955         case XML_REGEXP_NUMBER_DECIMAL:
2956 	    ret = xmlUCSIsCatNd(codepoint);
2957 	    break;
2958         case XML_REGEXP_NUMBER_LETTER:
2959 	    ret = xmlUCSIsCatNl(codepoint);
2960 	    break;
2961         case XML_REGEXP_NUMBER_OTHERS:
2962 	    ret = xmlUCSIsCatNo(codepoint);
2963 	    break;
2964         case XML_REGEXP_PUNCT:
2965 	    ret = xmlUCSIsCatP(codepoint);
2966 	    break;
2967         case XML_REGEXP_PUNCT_CONNECTOR:
2968 	    ret = xmlUCSIsCatPc(codepoint);
2969 	    break;
2970         case XML_REGEXP_PUNCT_DASH:
2971 	    ret = xmlUCSIsCatPd(codepoint);
2972 	    break;
2973         case XML_REGEXP_PUNCT_OPEN:
2974 	    ret = xmlUCSIsCatPs(codepoint);
2975 	    break;
2976         case XML_REGEXP_PUNCT_CLOSE:
2977 	    ret = xmlUCSIsCatPe(codepoint);
2978 	    break;
2979         case XML_REGEXP_PUNCT_INITQUOTE:
2980 	    ret = xmlUCSIsCatPi(codepoint);
2981 	    break;
2982         case XML_REGEXP_PUNCT_FINQUOTE:
2983 	    ret = xmlUCSIsCatPf(codepoint);
2984 	    break;
2985         case XML_REGEXP_PUNCT_OTHERS:
2986 	    ret = xmlUCSIsCatPo(codepoint);
2987 	    break;
2988         case XML_REGEXP_SEPAR:
2989 	    ret = xmlUCSIsCatZ(codepoint);
2990 	    break;
2991         case XML_REGEXP_SEPAR_SPACE:
2992 	    ret = xmlUCSIsCatZs(codepoint);
2993 	    break;
2994         case XML_REGEXP_SEPAR_LINE:
2995 	    ret = xmlUCSIsCatZl(codepoint);
2996 	    break;
2997         case XML_REGEXP_SEPAR_PARA:
2998 	    ret = xmlUCSIsCatZp(codepoint);
2999 	    break;
3000         case XML_REGEXP_SYMBOL:
3001 	    ret = xmlUCSIsCatS(codepoint);
3002 	    break;
3003         case XML_REGEXP_SYMBOL_MATH:
3004 	    ret = xmlUCSIsCatSm(codepoint);
3005 	    break;
3006         case XML_REGEXP_SYMBOL_CURRENCY:
3007 	    ret = xmlUCSIsCatSc(codepoint);
3008 	    break;
3009         case XML_REGEXP_SYMBOL_MODIFIER:
3010 	    ret = xmlUCSIsCatSk(codepoint);
3011 	    break;
3012         case XML_REGEXP_SYMBOL_OTHERS:
3013 	    ret = xmlUCSIsCatSo(codepoint);
3014 	    break;
3015         case XML_REGEXP_OTHER:
3016 	    ret = xmlUCSIsCatC(codepoint);
3017 	    break;
3018         case XML_REGEXP_OTHER_CONTROL:
3019 	    ret = xmlUCSIsCatCc(codepoint);
3020 	    break;
3021         case XML_REGEXP_OTHER_FORMAT:
3022 	    ret = xmlUCSIsCatCf(codepoint);
3023 	    break;
3024         case XML_REGEXP_OTHER_PRIVATE:
3025 	    ret = xmlUCSIsCatCo(codepoint);
3026 	    break;
3027         case XML_REGEXP_OTHER_NA:
3028 	    /* ret = xmlUCSIsCatCn(codepoint); */
3029 	    /* Seems it doesn't exist anymore in recent Unicode releases */
3030 	    ret = 0;
3031 	    break;
3032         case XML_REGEXP_BLOCK_NAME:
3033 	    ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
3034 	    break;
3035     }
3036     if (neg)
3037 	return(!ret);
3038     return(ret);
3039 }
3040 
3041 static int
xmlRegCheckCharacter(xmlRegAtomPtr atom,int codepoint)3042 xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
3043     int i, ret = 0;
3044     xmlRegRangePtr range;
3045 
3046     if ((atom == NULL) || (!IS_CHAR(codepoint)))
3047 	return(-1);
3048 
3049     switch (atom->type) {
3050         case XML_REGEXP_SUBREG:
3051         case XML_REGEXP_EPSILON:
3052 	    return(-1);
3053         case XML_REGEXP_CHARVAL:
3054             return(codepoint == atom->codepoint);
3055         case XML_REGEXP_RANGES: {
3056 	    int accept = 0;
3057 
3058 	    for (i = 0;i < atom->nbRanges;i++) {
3059 		range = atom->ranges[i];
3060 		if (range->neg == 2) {
3061 		    ret = xmlRegCheckCharacterRange(range->type, codepoint,
3062 						0, range->start, range->end,
3063 						range->blockName);
3064 		    if (ret != 0)
3065 			return(0); /* excluded char */
3066 		} else if (range->neg) {
3067 		    ret = xmlRegCheckCharacterRange(range->type, codepoint,
3068 						0, range->start, range->end,
3069 						range->blockName);
3070 		    if (ret == 0)
3071 		        accept = 1;
3072 		    else
3073 		        return(0);
3074 		} else {
3075 		    ret = xmlRegCheckCharacterRange(range->type, codepoint,
3076 						0, range->start, range->end,
3077 						range->blockName);
3078 		    if (ret != 0)
3079 			accept = 1; /* might still be excluded */
3080 		}
3081 	    }
3082 	    return(accept);
3083 	}
3084         case XML_REGEXP_STRING:
3085 	    printf("TODO: XML_REGEXP_STRING\n");
3086 	    return(-1);
3087         case XML_REGEXP_ANYCHAR:
3088         case XML_REGEXP_ANYSPACE:
3089         case XML_REGEXP_NOTSPACE:
3090         case XML_REGEXP_INITNAME:
3091         case XML_REGEXP_NOTINITNAME:
3092         case XML_REGEXP_NAMECHAR:
3093         case XML_REGEXP_NOTNAMECHAR:
3094         case XML_REGEXP_DECIMAL:
3095         case XML_REGEXP_NOTDECIMAL:
3096         case XML_REGEXP_REALCHAR:
3097         case XML_REGEXP_NOTREALCHAR:
3098         case XML_REGEXP_LETTER:
3099         case XML_REGEXP_LETTER_UPPERCASE:
3100         case XML_REGEXP_LETTER_LOWERCASE:
3101         case XML_REGEXP_LETTER_TITLECASE:
3102         case XML_REGEXP_LETTER_MODIFIER:
3103         case XML_REGEXP_LETTER_OTHERS:
3104         case XML_REGEXP_MARK:
3105         case XML_REGEXP_MARK_NONSPACING:
3106         case XML_REGEXP_MARK_SPACECOMBINING:
3107         case XML_REGEXP_MARK_ENCLOSING:
3108         case XML_REGEXP_NUMBER:
3109         case XML_REGEXP_NUMBER_DECIMAL:
3110         case XML_REGEXP_NUMBER_LETTER:
3111         case XML_REGEXP_NUMBER_OTHERS:
3112         case XML_REGEXP_PUNCT:
3113         case XML_REGEXP_PUNCT_CONNECTOR:
3114         case XML_REGEXP_PUNCT_DASH:
3115         case XML_REGEXP_PUNCT_OPEN:
3116         case XML_REGEXP_PUNCT_CLOSE:
3117         case XML_REGEXP_PUNCT_INITQUOTE:
3118         case XML_REGEXP_PUNCT_FINQUOTE:
3119         case XML_REGEXP_PUNCT_OTHERS:
3120         case XML_REGEXP_SEPAR:
3121         case XML_REGEXP_SEPAR_SPACE:
3122         case XML_REGEXP_SEPAR_LINE:
3123         case XML_REGEXP_SEPAR_PARA:
3124         case XML_REGEXP_SYMBOL:
3125         case XML_REGEXP_SYMBOL_MATH:
3126         case XML_REGEXP_SYMBOL_CURRENCY:
3127         case XML_REGEXP_SYMBOL_MODIFIER:
3128         case XML_REGEXP_SYMBOL_OTHERS:
3129         case XML_REGEXP_OTHER:
3130         case XML_REGEXP_OTHER_CONTROL:
3131         case XML_REGEXP_OTHER_FORMAT:
3132         case XML_REGEXP_OTHER_PRIVATE:
3133         case XML_REGEXP_OTHER_NA:
3134 	case XML_REGEXP_BLOCK_NAME:
3135 	    ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
3136 		                            (const xmlChar *)atom->valuep);
3137 	    if (atom->neg)
3138 		ret = !ret;
3139 	    break;
3140     }
3141     return(ret);
3142 }
3143 
3144 /************************************************************************
3145  *									*
3146  *	Saving and restoring state of an execution context		*
3147  *									*
3148  ************************************************************************/
3149 
3150 #ifdef DEBUG_REGEXP_EXEC
3151 static void
xmlFARegDebugExec(xmlRegExecCtxtPtr exec)3152 xmlFARegDebugExec(xmlRegExecCtxtPtr exec) {
3153     printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index);
3154     if (exec->inputStack != NULL) {
3155 	int i;
3156 	printf(": ");
3157 	for (i = 0;(i < 3) && (i < exec->inputStackNr);i++)
3158 	    printf("%s ", (const char *)
3159 	           exec->inputStack[exec->inputStackNr - (i + 1)].value);
3160     } else {
3161 	printf(": %s", &(exec->inputString[exec->index]));
3162     }
3163     printf("\n");
3164 }
3165 #endif
3166 
3167 static void
xmlFARegExecSave(xmlRegExecCtxtPtr exec)3168 xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
3169 #ifdef DEBUG_REGEXP_EXEC
3170     printf("saving ");
3171     exec->transno++;
3172     xmlFARegDebugExec(exec);
3173     exec->transno--;
3174 #endif
3175 #ifdef MAX_PUSH
3176     if (exec->nbPush > MAX_PUSH) {
3177         return;
3178     }
3179     exec->nbPush++;
3180 #endif
3181 
3182     if (exec->maxRollbacks == 0) {
3183 	exec->maxRollbacks = 4;
3184 	exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
3185 		                             sizeof(xmlRegExecRollback));
3186 	if (exec->rollbacks == NULL) {
3187 	    xmlRegexpErrMemory(NULL, "saving regexp");
3188 	    exec->maxRollbacks = 0;
3189 	    return;
3190 	}
3191 	memset(exec->rollbacks, 0,
3192 	       exec->maxRollbacks * sizeof(xmlRegExecRollback));
3193     } else if (exec->nbRollbacks >= exec->maxRollbacks) {
3194 	xmlRegExecRollback *tmp;
3195 	int len = exec->maxRollbacks;
3196 
3197 	exec->maxRollbacks *= 2;
3198 	tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
3199 			exec->maxRollbacks * sizeof(xmlRegExecRollback));
3200 	if (tmp == NULL) {
3201 	    xmlRegexpErrMemory(NULL, "saving regexp");
3202 	    exec->maxRollbacks /= 2;
3203 	    return;
3204 	}
3205 	exec->rollbacks = tmp;
3206 	tmp = &exec->rollbacks[len];
3207 	memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
3208     }
3209     exec->rollbacks[exec->nbRollbacks].state = exec->state;
3210     exec->rollbacks[exec->nbRollbacks].index = exec->index;
3211     exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
3212     if (exec->comp->nbCounters > 0) {
3213 	if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3214 	    exec->rollbacks[exec->nbRollbacks].counts = (int *)
3215 		xmlMalloc(exec->comp->nbCounters * sizeof(int));
3216 	    if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3217 		xmlRegexpErrMemory(NULL, "saving regexp");
3218 		exec->status = -5;
3219 		return;
3220 	    }
3221 	}
3222 	memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
3223 	       exec->comp->nbCounters * sizeof(int));
3224     }
3225     exec->nbRollbacks++;
3226 }
3227 
3228 static void
xmlFARegExecRollBack(xmlRegExecCtxtPtr exec)3229 xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
3230     if (exec->nbRollbacks <= 0) {
3231 	exec->status = -1;
3232 #ifdef DEBUG_REGEXP_EXEC
3233 	printf("rollback failed on empty stack\n");
3234 #endif
3235 	return;
3236     }
3237     exec->nbRollbacks--;
3238     exec->state = exec->rollbacks[exec->nbRollbacks].state;
3239     exec->index = exec->rollbacks[exec->nbRollbacks].index;
3240     exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
3241     if (exec->comp->nbCounters > 0) {
3242 	if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3243 	    fprintf(stderr, "exec save: allocation failed");
3244 	    exec->status = -6;
3245 	    return;
3246 	}
3247 	if (exec->counts) {
3248 	    memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
3249 	       exec->comp->nbCounters * sizeof(int));
3250 	}
3251     }
3252 
3253 #ifdef DEBUG_REGEXP_EXEC
3254     printf("restored ");
3255     xmlFARegDebugExec(exec);
3256 #endif
3257 }
3258 
3259 /************************************************************************
3260  *									*
3261  *	Verifier, running an input against a compiled regexp		*
3262  *									*
3263  ************************************************************************/
3264 
3265 static int
xmlFARegExec(xmlRegexpPtr comp,const xmlChar * content)3266 xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
3267     xmlRegExecCtxt execval;
3268     xmlRegExecCtxtPtr exec = &execval;
3269     int ret, codepoint = 0, len, deter;
3270 
3271     exec->inputString = content;
3272     exec->index = 0;
3273     exec->nbPush = 0;
3274     exec->determinist = 1;
3275     exec->maxRollbacks = 0;
3276     exec->nbRollbacks = 0;
3277     exec->rollbacks = NULL;
3278     exec->status = 0;
3279     exec->comp = comp;
3280     exec->state = comp->states[0];
3281     exec->transno = 0;
3282     exec->transcount = 0;
3283     exec->inputStack = NULL;
3284     exec->inputStackMax = 0;
3285     if (comp->nbCounters > 0) {
3286 	exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
3287 	if (exec->counts == NULL) {
3288 	    xmlRegexpErrMemory(NULL, "running regexp");
3289 	    return(-1);
3290 	}
3291         memset(exec->counts, 0, comp->nbCounters * sizeof(int));
3292     } else
3293 	exec->counts = NULL;
3294     while ((exec->status == 0) && (exec->state != NULL) &&
3295 	   ((exec->inputString[exec->index] != 0) ||
3296 	    ((exec->state != NULL) &&
3297 	     (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3298 	xmlRegTransPtr trans;
3299 	xmlRegAtomPtr atom;
3300 
3301 	/*
3302 	 * If end of input on non-terminal state, rollback, however we may
3303 	 * still have epsilon like transition for counted transitions
3304 	 * on counters, in that case don't break too early.  Additionally,
3305 	 * if we are working on a range like "AB{0,2}", where B is not present,
3306 	 * we don't want to break.
3307 	 */
3308 	len = 1;
3309 	if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) {
3310 	    /*
3311 	     * if there is a transition, we must check if
3312 	     *  atom allows minOccurs of 0
3313 	     */
3314 	    if (exec->transno < exec->state->nbTrans) {
3315 	        trans = &exec->state->trans[exec->transno];
3316 		if (trans->to >=0) {
3317 		    atom = trans->atom;
3318 		    if (!((atom->min == 0) && (atom->max > 0)))
3319 		        goto rollback;
3320 		}
3321 	    } else
3322 	        goto rollback;
3323 	}
3324 
3325 	exec->transcount = 0;
3326 	for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3327 	    trans = &exec->state->trans[exec->transno];
3328 	    if (trans->to < 0)
3329 		continue;
3330 	    atom = trans->atom;
3331 	    ret = 0;
3332 	    deter = 1;
3333 	    if (trans->count >= 0) {
3334 		int count;
3335 		xmlRegCounterPtr counter;
3336 
3337 		if (exec->counts == NULL) {
3338 		    exec->status = -1;
3339 		    goto error;
3340 		}
3341 		/*
3342 		 * A counted transition.
3343 		 */
3344 
3345 		count = exec->counts[trans->count];
3346 		counter = &exec->comp->counters[trans->count];
3347 #ifdef DEBUG_REGEXP_EXEC
3348 		printf("testing count %d: val %d, min %d, max %d\n",
3349 		       trans->count, count, counter->min,  counter->max);
3350 #endif
3351 		ret = ((count >= counter->min) && (count <= counter->max));
3352 		if ((ret) && (counter->min != counter->max))
3353 		    deter = 0;
3354 	    } else if (atom == NULL) {
3355 		fprintf(stderr, "epsilon transition left at runtime\n");
3356 		exec->status = -2;
3357 		break;
3358 	    } else if (exec->inputString[exec->index] != 0) {
3359                 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
3360 		ret = xmlRegCheckCharacter(atom, codepoint);
3361 		if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) {
3362 		    xmlRegStatePtr to = comp->states[trans->to];
3363 
3364 		    /*
3365 		     * this is a multiple input sequence
3366 		     * If there is a counter associated increment it now.
3367 		     * before potentially saving and rollback
3368 		     * do not increment if the counter is already over the
3369 		     * maximum limit in which case get to next transition
3370 		     */
3371 		    if (trans->counter >= 0) {
3372 			xmlRegCounterPtr counter;
3373 
3374 			if ((exec->counts == NULL) ||
3375 			    (exec->comp == NULL) ||
3376 			    (exec->comp->counters == NULL)) {
3377 			    exec->status = -1;
3378 			    goto error;
3379 			}
3380 			counter = &exec->comp->counters[trans->counter];
3381 			if (exec->counts[trans->counter] >= counter->max)
3382 			    continue; /* for loop on transitions */
3383 
3384 #ifdef DEBUG_REGEXP_EXEC
3385 			printf("Increasing count %d\n", trans->counter);
3386 #endif
3387 			exec->counts[trans->counter]++;
3388 		    }
3389 		    if (exec->state->nbTrans > exec->transno + 1) {
3390 			xmlFARegExecSave(exec);
3391 		    }
3392 		    exec->transcount = 1;
3393 		    do {
3394 			/*
3395 			 * Try to progress as much as possible on the input
3396 			 */
3397 			if (exec->transcount == atom->max) {
3398 			    break;
3399 			}
3400 			exec->index += len;
3401 			/*
3402 			 * End of input: stop here
3403 			 */
3404 			if (exec->inputString[exec->index] == 0) {
3405 			    exec->index -= len;
3406 			    break;
3407 			}
3408 			if (exec->transcount >= atom->min) {
3409 			    int transno = exec->transno;
3410 			    xmlRegStatePtr state = exec->state;
3411 
3412 			    /*
3413 			     * The transition is acceptable save it
3414 			     */
3415 			    exec->transno = -1; /* trick */
3416 			    exec->state = to;
3417 			    xmlFARegExecSave(exec);
3418 			    exec->transno = transno;
3419 			    exec->state = state;
3420 			}
3421 			codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
3422 				              len);
3423 			ret = xmlRegCheckCharacter(atom, codepoint);
3424 			exec->transcount++;
3425 		    } while (ret == 1);
3426 		    if (exec->transcount < atom->min)
3427 			ret = 0;
3428 
3429 		    /*
3430 		     * If the last check failed but one transition was found
3431 		     * possible, rollback
3432 		     */
3433 		    if (ret < 0)
3434 			ret = 0;
3435 		    if (ret == 0) {
3436 			goto rollback;
3437 		    }
3438 		    if (trans->counter >= 0) {
3439 			if (exec->counts == NULL) {
3440 			    exec->status = -1;
3441 			    goto error;
3442 			}
3443 #ifdef DEBUG_REGEXP_EXEC
3444 			printf("Decreasing count %d\n", trans->counter);
3445 #endif
3446 			exec->counts[trans->counter]--;
3447 		    }
3448 		} else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) {
3449 		    /*
3450 		     * we don't match on the codepoint, but minOccurs of 0
3451 		     * says that's ok.  Setting len to 0 inhibits stepping
3452 		     * over the codepoint.
3453 		     */
3454 		    exec->transcount = 1;
3455 		    len = 0;
3456 		    ret = 1;
3457 		}
3458 	    } else if ((atom->min == 0) && (atom->max > 0)) {
3459 	        /* another spot to match when minOccurs is 0 */
3460 		exec->transcount = 1;
3461 		len = 0;
3462 		ret = 1;
3463 	    }
3464 	    if (ret == 1) {
3465 		if ((trans->nd == 1) ||
3466 		    ((trans->count >= 0) && (deter == 0) &&
3467 		     (exec->state->nbTrans > exec->transno + 1))) {
3468 #ifdef DEBUG_REGEXP_EXEC
3469 		    if (trans->nd == 1)
3470 		        printf("Saving on nd transition atom %d for %c at %d\n",
3471 			       trans->atom->no, codepoint, exec->index);
3472 		    else
3473 		        printf("Saving on counted transition count %d for %c at %d\n",
3474 			       trans->count, codepoint, exec->index);
3475 #endif
3476 		    xmlFARegExecSave(exec);
3477 		}
3478 		if (trans->counter >= 0) {
3479 		    xmlRegCounterPtr counter;
3480 
3481                     /* make sure we don't go over the counter maximum value */
3482 		    if ((exec->counts == NULL) ||
3483 			(exec->comp == NULL) ||
3484 			(exec->comp->counters == NULL)) {
3485 			exec->status = -1;
3486 			goto error;
3487 		    }
3488 		    counter = &exec->comp->counters[trans->counter];
3489 		    if (exec->counts[trans->counter] >= counter->max)
3490 			continue; /* for loop on transitions */
3491 #ifdef DEBUG_REGEXP_EXEC
3492 		    printf("Increasing count %d\n", trans->counter);
3493 #endif
3494 		    exec->counts[trans->counter]++;
3495 		}
3496 		if ((trans->count >= 0) &&
3497 		    (trans->count < REGEXP_ALL_COUNTER)) {
3498 		    if (exec->counts == NULL) {
3499 		        exec->status = -1;
3500 			goto error;
3501 		    }
3502 #ifdef DEBUG_REGEXP_EXEC
3503 		    printf("resetting count %d on transition\n",
3504 		           trans->count);
3505 #endif
3506 		    exec->counts[trans->count] = 0;
3507 		}
3508 #ifdef DEBUG_REGEXP_EXEC
3509 		printf("entering state %d\n", trans->to);
3510 #endif
3511 		exec->state = comp->states[trans->to];
3512 		exec->transno = 0;
3513 		if (trans->atom != NULL) {
3514 		    exec->index += len;
3515 		}
3516 		goto progress;
3517 	    } else if (ret < 0) {
3518 		exec->status = -4;
3519 		break;
3520 	    }
3521 	}
3522 	if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3523 rollback:
3524 	    /*
3525 	     * Failed to find a way out
3526 	     */
3527 	    exec->determinist = 0;
3528 #ifdef DEBUG_REGEXP_EXEC
3529 	    printf("rollback from state %d on %d:%c\n", exec->state->no,
3530 	           codepoint,codepoint);
3531 #endif
3532 	    xmlFARegExecRollBack(exec);
3533 	}
3534 progress:
3535 	continue;
3536     }
3537 error:
3538     if (exec->rollbacks != NULL) {
3539 	if (exec->counts != NULL) {
3540 	    int i;
3541 
3542 	    for (i = 0;i < exec->maxRollbacks;i++)
3543 		if (exec->rollbacks[i].counts != NULL)
3544 		    xmlFree(exec->rollbacks[i].counts);
3545 	}
3546 	xmlFree(exec->rollbacks);
3547     }
3548     if (exec->state == NULL)
3549         return(-1);
3550     if (exec->counts != NULL)
3551 	xmlFree(exec->counts);
3552     if (exec->status == 0)
3553 	return(1);
3554     if (exec->status == -1) {
3555 	if (exec->nbPush > MAX_PUSH)
3556 	    return(-1);
3557 	return(0);
3558     }
3559     return(exec->status);
3560 }
3561 
3562 /************************************************************************
3563  *									*
3564  *	Progressive interface to the verifier one atom at a time	*
3565  *									*
3566  ************************************************************************/
3567 #ifdef DEBUG_ERR
3568 static void testerr(xmlRegExecCtxtPtr exec);
3569 #endif
3570 
3571 /**
3572  * xmlRegNewExecCtxt:
3573  * @comp: a precompiled regular expression
3574  * @callback: a callback function used for handling progresses in the
3575  *            automata matching phase
3576  * @data: the context data associated to the callback in this context
3577  *
3578  * Build a context used for progressive evaluation of a regexp.
3579  *
3580  * Returns the new context
3581  */
3582 xmlRegExecCtxtPtr
xmlRegNewExecCtxt(xmlRegexpPtr comp,xmlRegExecCallbacks callback,void * data)3583 xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
3584     xmlRegExecCtxtPtr exec;
3585 
3586     if (comp == NULL)
3587 	return(NULL);
3588     if ((comp->compact == NULL) && (comp->states == NULL))
3589         return(NULL);
3590     exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
3591     if (exec == NULL) {
3592 	xmlRegexpErrMemory(NULL, "creating execution context");
3593 	return(NULL);
3594     }
3595     memset(exec, 0, sizeof(xmlRegExecCtxt));
3596     exec->inputString = NULL;
3597     exec->index = 0;
3598     exec->determinist = 1;
3599     exec->maxRollbacks = 0;
3600     exec->nbRollbacks = 0;
3601     exec->rollbacks = NULL;
3602     exec->status = 0;
3603     exec->comp = comp;
3604     if (comp->compact == NULL)
3605 	exec->state = comp->states[0];
3606     exec->transno = 0;
3607     exec->transcount = 0;
3608     exec->callback = callback;
3609     exec->data = data;
3610     if (comp->nbCounters > 0) {
3611         /*
3612 	 * For error handling, exec->counts is allocated twice the size
3613 	 * the second half is used to store the data in case of rollback
3614 	 */
3615 	exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)
3616 	                                 * 2);
3617 	if (exec->counts == NULL) {
3618 	    xmlRegexpErrMemory(NULL, "creating execution context");
3619 	    xmlFree(exec);
3620 	    return(NULL);
3621 	}
3622         memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2);
3623 	exec->errCounts = &exec->counts[comp->nbCounters];
3624     } else {
3625 	exec->counts = NULL;
3626 	exec->errCounts = NULL;
3627     }
3628     exec->inputStackMax = 0;
3629     exec->inputStackNr = 0;
3630     exec->inputStack = NULL;
3631     exec->errStateNo = -1;
3632     exec->errString = NULL;
3633     exec->nbPush = 0;
3634     return(exec);
3635 }
3636 
3637 /**
3638  * xmlRegFreeExecCtxt:
3639  * @exec: a regular expression evaluation context
3640  *
3641  * Free the structures associated to a regular expression evaluation context.
3642  */
3643 void
xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec)3644 xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
3645     if (exec == NULL)
3646 	return;
3647 
3648     if (exec->rollbacks != NULL) {
3649 	if (exec->counts != NULL) {
3650 	    int i;
3651 
3652 	    for (i = 0;i < exec->maxRollbacks;i++)
3653 		if (exec->rollbacks[i].counts != NULL)
3654 		    xmlFree(exec->rollbacks[i].counts);
3655 	}
3656 	xmlFree(exec->rollbacks);
3657     }
3658     if (exec->counts != NULL)
3659 	xmlFree(exec->counts);
3660     if (exec->inputStack != NULL) {
3661 	int i;
3662 
3663 	for (i = 0;i < exec->inputStackNr;i++) {
3664 	    if (exec->inputStack[i].value != NULL)
3665 		xmlFree(exec->inputStack[i].value);
3666 	}
3667 	xmlFree(exec->inputStack);
3668     }
3669     if (exec->errString != NULL)
3670         xmlFree(exec->errString);
3671     xmlFree(exec);
3672 }
3673 
3674 static void
xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec,const xmlChar * value,void * data)3675 xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
3676 	                    void *data) {
3677 #ifdef DEBUG_PUSH
3678     printf("saving value: %d:%s\n", exec->inputStackNr, value);
3679 #endif
3680     if (exec->inputStackMax == 0) {
3681 	exec->inputStackMax = 4;
3682 	exec->inputStack = (xmlRegInputTokenPtr)
3683 	    xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken));
3684 	if (exec->inputStack == NULL) {
3685 	    xmlRegexpErrMemory(NULL, "pushing input string");
3686 	    exec->inputStackMax = 0;
3687 	    return;
3688 	}
3689     } else if (exec->inputStackNr + 1 >= exec->inputStackMax) {
3690 	xmlRegInputTokenPtr tmp;
3691 
3692 	exec->inputStackMax *= 2;
3693 	tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack,
3694 			exec->inputStackMax * sizeof(xmlRegInputToken));
3695 	if (tmp == NULL) {
3696 	    xmlRegexpErrMemory(NULL, "pushing input string");
3697 	    exec->inputStackMax /= 2;
3698 	    return;
3699 	}
3700 	exec->inputStack = tmp;
3701     }
3702     exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
3703     exec->inputStack[exec->inputStackNr].data = data;
3704     exec->inputStackNr++;
3705     exec->inputStack[exec->inputStackNr].value = NULL;
3706     exec->inputStack[exec->inputStackNr].data = NULL;
3707 }
3708 
3709 /**
3710  * xmlRegStrEqualWildcard:
3711  * @expStr:  the string to be evaluated
3712  * @valStr:  the validation string
3713  *
3714  * Checks if both strings are equal or have the same content. "*"
3715  * can be used as a wildcard in @valStr; "|" is used as a separator of
3716  * substrings in both @expStr and @valStr.
3717  *
3718  * Returns 1 if the comparison is satisfied and the number of substrings
3719  * is equal, 0 otherwise.
3720  */
3721 
3722 static int
xmlRegStrEqualWildcard(const xmlChar * expStr,const xmlChar * valStr)3723 xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) {
3724     if (expStr == valStr) return(1);
3725     if (expStr == NULL) return(0);
3726     if (valStr == NULL) return(0);
3727     do {
3728 	/*
3729 	* Eval if we have a wildcard for the current item.
3730 	*/
3731         if (*expStr != *valStr) {
3732 	    /* if one of them starts with a wildcard make valStr be it */
3733 	    if (*valStr == '*') {
3734 	        const xmlChar *tmp;
3735 
3736 		tmp = valStr;
3737 		valStr = expStr;
3738 		expStr = tmp;
3739 	    }
3740 	    if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) {
3741 		do {
3742 		    if (*valStr == XML_REG_STRING_SEPARATOR)
3743 			break;
3744 		    valStr++;
3745 		} while (*valStr != 0);
3746 		continue;
3747 	    } else
3748 		return(0);
3749 	}
3750 	expStr++;
3751 	valStr++;
3752     } while (*valStr != 0);
3753     if (*expStr != 0)
3754 	return (0);
3755     else
3756 	return (1);
3757 }
3758 
3759 /**
3760  * xmlRegCompactPushString:
3761  * @exec: a regexp execution context
3762  * @comp:  the precompiled exec with a compact table
3763  * @value: a string token input
3764  * @data: data associated to the token to reuse in callbacks
3765  *
3766  * Push one input token in the execution context
3767  *
3768  * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3769  *     a negative value in case of error.
3770  */
3771 static int
xmlRegCompactPushString(xmlRegExecCtxtPtr exec,xmlRegexpPtr comp,const xmlChar * value,void * data)3772 xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
3773 	                xmlRegexpPtr comp,
3774 	                const xmlChar *value,
3775 	                void *data) {
3776     int state = exec->index;
3777     int i, target;
3778 
3779     if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
3780 	return(-1);
3781 
3782     if (value == NULL) {
3783 	/*
3784 	 * are we at a final state ?
3785 	 */
3786 	if (comp->compact[state * (comp->nbstrings + 1)] ==
3787             XML_REGEXP_FINAL_STATE)
3788 	    return(1);
3789 	return(0);
3790     }
3791 
3792 #ifdef DEBUG_PUSH
3793     printf("value pushed: %s\n", value);
3794 #endif
3795 
3796     /*
3797      * Examine all outside transitions from current state
3798      */
3799     for (i = 0;i < comp->nbstrings;i++) {
3800 	target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
3801 	if ((target > 0) && (target <= comp->nbstates)) {
3802 	    target--; /* to avoid 0 */
3803 	    if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) {
3804 		exec->index = target;
3805 		if ((exec->callback != NULL) && (comp->transdata != NULL)) {
3806 		    exec->callback(exec->data, value,
3807 			  comp->transdata[state * comp->nbstrings + i], data);
3808 		}
3809 #ifdef DEBUG_PUSH
3810 		printf("entering state %d\n", target);
3811 #endif
3812 		if (comp->compact[target * (comp->nbstrings + 1)] ==
3813 		    XML_REGEXP_SINK_STATE)
3814 		    goto error;
3815 
3816 		if (comp->compact[target * (comp->nbstrings + 1)] ==
3817 		    XML_REGEXP_FINAL_STATE)
3818 		    return(1);
3819 		return(0);
3820 	    }
3821 	}
3822     }
3823     /*
3824      * Failed to find an exit transition out from current state for the
3825      * current token
3826      */
3827 #ifdef DEBUG_PUSH
3828     printf("failed to find a transition for %s on state %d\n", value, state);
3829 #endif
3830 error:
3831     if (exec->errString != NULL)
3832         xmlFree(exec->errString);
3833     exec->errString = xmlStrdup(value);
3834     exec->errStateNo = state;
3835     exec->status = -1;
3836 #ifdef DEBUG_ERR
3837     testerr(exec);
3838 #endif
3839     return(-1);
3840 }
3841 
3842 /**
3843  * xmlRegExecPushStringInternal:
3844  * @exec: a regexp execution context or NULL to indicate the end
3845  * @value: a string token input
3846  * @data: data associated to the token to reuse in callbacks
3847  * @compound: value was assembled from 2 strings
3848  *
3849  * Push one input token in the execution context
3850  *
3851  * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3852  *     a negative value in case of error.
3853  */
3854 static int
xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec,const xmlChar * value,void * data,int compound)3855 xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value,
3856 	                     void *data, int compound) {
3857     xmlRegTransPtr trans;
3858     xmlRegAtomPtr atom;
3859     int ret;
3860     int final = 0;
3861     int progress = 1;
3862 
3863     if (exec == NULL)
3864 	return(-1);
3865     if (exec->comp == NULL)
3866 	return(-1);
3867     if (exec->status != 0)
3868 	return(exec->status);
3869 
3870     if (exec->comp->compact != NULL)
3871 	return(xmlRegCompactPushString(exec, exec->comp, value, data));
3872 
3873     if (value == NULL) {
3874         if (exec->state->type == XML_REGEXP_FINAL_STATE)
3875 	    return(1);
3876 	final = 1;
3877     }
3878 
3879 #ifdef DEBUG_PUSH
3880     printf("value pushed: %s\n", value);
3881 #endif
3882     /*
3883      * If we have an active rollback stack push the new value there
3884      * and get back to where we were left
3885      */
3886     if ((value != NULL) && (exec->inputStackNr > 0)) {
3887 	xmlFARegExecSaveInputString(exec, value, data);
3888 	value = exec->inputStack[exec->index].value;
3889 	data = exec->inputStack[exec->index].data;
3890 #ifdef DEBUG_PUSH
3891 	printf("value loaded: %s\n", value);
3892 #endif
3893     }
3894 
3895     while ((exec->status == 0) &&
3896 	   ((value != NULL) ||
3897 	    ((final == 1) &&
3898 	     (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3899 
3900 	/*
3901 	 * End of input on non-terminal state, rollback, however we may
3902 	 * still have epsilon like transition for counted transitions
3903 	 * on counters, in that case don't break too early.
3904 	 */
3905 	if ((value == NULL) && (exec->counts == NULL))
3906 	    goto rollback;
3907 
3908 	exec->transcount = 0;
3909 	for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3910 	    trans = &exec->state->trans[exec->transno];
3911 	    if (trans->to < 0)
3912 		continue;
3913 	    atom = trans->atom;
3914 	    ret = 0;
3915 	    if (trans->count == REGEXP_ALL_LAX_COUNTER) {
3916 		int i;
3917 		int count;
3918 		xmlRegTransPtr t;
3919 		xmlRegCounterPtr counter;
3920 
3921 		ret = 0;
3922 
3923 #ifdef DEBUG_PUSH
3924 		printf("testing all lax %d\n", trans->count);
3925 #endif
3926 		/*
3927 		 * Check all counted transitions from the current state
3928 		 */
3929 		if ((value == NULL) && (final)) {
3930 		    ret = 1;
3931 		} else if (value != NULL) {
3932 		    for (i = 0;i < exec->state->nbTrans;i++) {
3933 			t = &exec->state->trans[i];
3934 			if ((t->counter < 0) || (t == trans))
3935 			    continue;
3936 			counter = &exec->comp->counters[t->counter];
3937 			count = exec->counts[t->counter];
3938 			if ((count < counter->max) &&
3939 		            (t->atom != NULL) &&
3940 			    (xmlStrEqual(value, t->atom->valuep))) {
3941 			    ret = 0;
3942 			    break;
3943 			}
3944 			if ((count >= counter->min) &&
3945 			    (count < counter->max) &&
3946 			    (t->atom != NULL) &&
3947 			    (xmlStrEqual(value, t->atom->valuep))) {
3948 			    ret = 1;
3949 			    break;
3950 			}
3951 		    }
3952 		}
3953 	    } else if (trans->count == REGEXP_ALL_COUNTER) {
3954 		int i;
3955 		int count;
3956 		xmlRegTransPtr t;
3957 		xmlRegCounterPtr counter;
3958 
3959 		ret = 1;
3960 
3961 #ifdef DEBUG_PUSH
3962 		printf("testing all %d\n", trans->count);
3963 #endif
3964 		/*
3965 		 * Check all counted transitions from the current state
3966 		 */
3967 		for (i = 0;i < exec->state->nbTrans;i++) {
3968                     t = &exec->state->trans[i];
3969 		    if ((t->counter < 0) || (t == trans))
3970 			continue;
3971                     counter = &exec->comp->counters[t->counter];
3972 		    count = exec->counts[t->counter];
3973 		    if ((count < counter->min) || (count > counter->max)) {
3974 			ret = 0;
3975 			break;
3976 		    }
3977 		}
3978 	    } else if (trans->count >= 0) {
3979 		int count;
3980 		xmlRegCounterPtr counter;
3981 
3982 		/*
3983 		 * A counted transition.
3984 		 */
3985 
3986 		count = exec->counts[trans->count];
3987 		counter = &exec->comp->counters[trans->count];
3988 #ifdef DEBUG_PUSH
3989 		printf("testing count %d: val %d, min %d, max %d\n",
3990 		       trans->count, count, counter->min,  counter->max);
3991 #endif
3992 		ret = ((count >= counter->min) && (count <= counter->max));
3993 	    } else if (atom == NULL) {
3994 		fprintf(stderr, "epsilon transition left at runtime\n");
3995 		exec->status = -2;
3996 		break;
3997 	    } else if (value != NULL) {
3998 		ret = xmlRegStrEqualWildcard(atom->valuep, value);
3999 		if (atom->neg) {
4000 		    ret = !ret;
4001 		    if (!compound)
4002 		        ret = 0;
4003 		}
4004 		if ((ret == 1) && (trans->counter >= 0)) {
4005 		    xmlRegCounterPtr counter;
4006 		    int count;
4007 
4008 		    count = exec->counts[trans->counter];
4009 		    counter = &exec->comp->counters[trans->counter];
4010 		    if (count >= counter->max)
4011 			ret = 0;
4012 		}
4013 
4014 		if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
4015 		    xmlRegStatePtr to = exec->comp->states[trans->to];
4016 
4017 		    /*
4018 		     * this is a multiple input sequence
4019 		     */
4020 		    if (exec->state->nbTrans > exec->transno + 1) {
4021 			if (exec->inputStackNr <= 0) {
4022 			    xmlFARegExecSaveInputString(exec, value, data);
4023 			}
4024 			xmlFARegExecSave(exec);
4025 		    }
4026 		    exec->transcount = 1;
4027 		    do {
4028 			/*
4029 			 * Try to progress as much as possible on the input
4030 			 */
4031 			if (exec->transcount == atom->max) {
4032 			    break;
4033 			}
4034 			exec->index++;
4035 			value = exec->inputStack[exec->index].value;
4036 			data = exec->inputStack[exec->index].data;
4037 #ifdef DEBUG_PUSH
4038 			printf("value loaded: %s\n", value);
4039 #endif
4040 
4041 			/*
4042 			 * End of input: stop here
4043 			 */
4044 			if (value == NULL) {
4045 			    exec->index --;
4046 			    break;
4047 			}
4048 			if (exec->transcount >= atom->min) {
4049 			    int transno = exec->transno;
4050 			    xmlRegStatePtr state = exec->state;
4051 
4052 			    /*
4053 			     * The transition is acceptable save it
4054 			     */
4055 			    exec->transno = -1; /* trick */
4056 			    exec->state = to;
4057 			    if (exec->inputStackNr <= 0) {
4058 				xmlFARegExecSaveInputString(exec, value, data);
4059 			    }
4060 			    xmlFARegExecSave(exec);
4061 			    exec->transno = transno;
4062 			    exec->state = state;
4063 			}
4064 			ret = xmlStrEqual(value, atom->valuep);
4065 			exec->transcount++;
4066 		    } while (ret == 1);
4067 		    if (exec->transcount < atom->min)
4068 			ret = 0;
4069 
4070 		    /*
4071 		     * If the last check failed but one transition was found
4072 		     * possible, rollback
4073 		     */
4074 		    if (ret < 0)
4075 			ret = 0;
4076 		    if (ret == 0) {
4077 			goto rollback;
4078 		    }
4079 		}
4080 	    }
4081 	    if (ret == 1) {
4082 		if ((exec->callback != NULL) && (atom != NULL) &&
4083 			(data != NULL)) {
4084 		    exec->callback(exec->data, atom->valuep,
4085 			           atom->data, data);
4086 		}
4087 		if (exec->state->nbTrans > exec->transno + 1) {
4088 		    if (exec->inputStackNr <= 0) {
4089 			xmlFARegExecSaveInputString(exec, value, data);
4090 		    }
4091 		    xmlFARegExecSave(exec);
4092 		}
4093 		if (trans->counter >= 0) {
4094 #ifdef DEBUG_PUSH
4095 		    printf("Increasing count %d\n", trans->counter);
4096 #endif
4097 		    exec->counts[trans->counter]++;
4098 		}
4099 		if ((trans->count >= 0) &&
4100 		    (trans->count < REGEXP_ALL_COUNTER)) {
4101 #ifdef DEBUG_REGEXP_EXEC
4102 		    printf("resetting count %d on transition\n",
4103 		           trans->count);
4104 #endif
4105 		    exec->counts[trans->count] = 0;
4106 		}
4107 #ifdef DEBUG_PUSH
4108 		printf("entering state %d\n", trans->to);
4109 #endif
4110                 if ((exec->comp->states[trans->to] != NULL) &&
4111 		    (exec->comp->states[trans->to]->type ==
4112 		     XML_REGEXP_SINK_STATE)) {
4113 		    /*
4114 		     * entering a sink state, save the current state as error
4115 		     * state.
4116 		     */
4117 		    if (exec->errString != NULL)
4118 			xmlFree(exec->errString);
4119 		    exec->errString = xmlStrdup(value);
4120 		    exec->errState = exec->state;
4121 		    memcpy(exec->errCounts, exec->counts,
4122 			   exec->comp->nbCounters * sizeof(int));
4123 		}
4124 		exec->state = exec->comp->states[trans->to];
4125 		exec->transno = 0;
4126 		if (trans->atom != NULL) {
4127 		    if (exec->inputStack != NULL) {
4128 			exec->index++;
4129 			if (exec->index < exec->inputStackNr) {
4130 			    value = exec->inputStack[exec->index].value;
4131 			    data = exec->inputStack[exec->index].data;
4132 #ifdef DEBUG_PUSH
4133 			    printf("value loaded: %s\n", value);
4134 #endif
4135 			} else {
4136 			    value = NULL;
4137 			    data = NULL;
4138 #ifdef DEBUG_PUSH
4139 			    printf("end of input\n");
4140 #endif
4141 			}
4142 		    } else {
4143 			value = NULL;
4144 			data = NULL;
4145 #ifdef DEBUG_PUSH
4146 			printf("end of input\n");
4147 #endif
4148 		    }
4149 		}
4150 		goto progress;
4151 	    } else if (ret < 0) {
4152 		exec->status = -4;
4153 		break;
4154 	    }
4155 	}
4156 	if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
4157 rollback:
4158             /*
4159 	     * if we didn't yet rollback on the current input
4160 	     * store the current state as the error state.
4161 	     */
4162 	    if ((progress) && (exec->state != NULL) &&
4163 	        (exec->state->type != XML_REGEXP_SINK_STATE)) {
4164 	        progress = 0;
4165 		if (exec->errString != NULL)
4166 		    xmlFree(exec->errString);
4167 		exec->errString = xmlStrdup(value);
4168 		exec->errState = exec->state;
4169                 if (exec->comp->nbCounters)
4170                     memcpy(exec->errCounts, exec->counts,
4171                            exec->comp->nbCounters * sizeof(int));
4172 	    }
4173 
4174 	    /*
4175 	     * Failed to find a way out
4176 	     */
4177 	    exec->determinist = 0;
4178 	    xmlFARegExecRollBack(exec);
4179 	    if ((exec->inputStack != NULL ) && (exec->status == 0)) {
4180 		value = exec->inputStack[exec->index].value;
4181 		data = exec->inputStack[exec->index].data;
4182 #ifdef DEBUG_PUSH
4183 		printf("value loaded: %s\n", value);
4184 #endif
4185 	    }
4186 	}
4187 	continue;
4188 progress:
4189         progress = 1;
4190 	continue;
4191     }
4192     if (exec->status == 0) {
4193         return(exec->state->type == XML_REGEXP_FINAL_STATE);
4194     }
4195 #ifdef DEBUG_ERR
4196     if (exec->status < 0) {
4197 	testerr(exec);
4198     }
4199 #endif
4200     return(exec->status);
4201 }
4202 
4203 /**
4204  * xmlRegExecPushString:
4205  * @exec: a regexp execution context or NULL to indicate the end
4206  * @value: a string token input
4207  * @data: data associated to the token to reuse in callbacks
4208  *
4209  * Push one input token in the execution context
4210  *
4211  * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4212  *     a negative value in case of error.
4213  */
4214 int
xmlRegExecPushString(xmlRegExecCtxtPtr exec,const xmlChar * value,void * data)4215 xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
4216 	             void *data) {
4217     return(xmlRegExecPushStringInternal(exec, value, data, 0));
4218 }
4219 
4220 /**
4221  * xmlRegExecPushString2:
4222  * @exec: a regexp execution context or NULL to indicate the end
4223  * @value: the first string token input
4224  * @value2: the second string token input
4225  * @data: data associated to the token to reuse in callbacks
4226  *
4227  * Push one input token in the execution context
4228  *
4229  * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4230  *     a negative value in case of error.
4231  */
4232 int
xmlRegExecPushString2(xmlRegExecCtxtPtr exec,const xmlChar * value,const xmlChar * value2,void * data)4233 xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
4234                       const xmlChar *value2, void *data) {
4235     xmlChar buf[150];
4236     int lenn, lenp, ret;
4237     xmlChar *str;
4238 
4239     if (exec == NULL)
4240 	return(-1);
4241     if (exec->comp == NULL)
4242 	return(-1);
4243     if (exec->status != 0)
4244 	return(exec->status);
4245 
4246     if (value2 == NULL)
4247         return(xmlRegExecPushString(exec, value, data));
4248 
4249     lenn = strlen((char *) value2);
4250     lenp = strlen((char *) value);
4251 
4252     if (150 < lenn + lenp + 2) {
4253 	str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
4254 	if (str == NULL) {
4255 	    exec->status = -1;
4256 	    return(-1);
4257 	}
4258     } else {
4259 	str = buf;
4260     }
4261     memcpy(&str[0], value, lenp);
4262     str[lenp] = XML_REG_STRING_SEPARATOR;
4263     memcpy(&str[lenp + 1], value2, lenn);
4264     str[lenn + lenp + 1] = 0;
4265 
4266     if (exec->comp->compact != NULL)
4267 	ret = xmlRegCompactPushString(exec, exec->comp, str, data);
4268     else
4269         ret = xmlRegExecPushStringInternal(exec, str, data, 1);
4270 
4271     if (str != buf)
4272         xmlFree(str);
4273     return(ret);
4274 }
4275 
4276 /**
4277  * xmlRegExecGetValues:
4278  * @exec: a regexp execution context
4279  * @err: error extraction or normal one
4280  * @nbval: pointer to the number of accepted values IN/OUT
4281  * @nbneg: return number of negative transitions
4282  * @values: pointer to the array of acceptable values
4283  * @terminal: return value if this was a terminal state
4284  *
4285  * Extract information from the regexp execution, internal routine to
4286  * implement xmlRegExecNextValues() and xmlRegExecErrInfo()
4287  *
4288  * Returns: 0 in case of success or -1 in case of error.
4289  */
4290 static int
xmlRegExecGetValues(xmlRegExecCtxtPtr exec,int err,int * nbval,int * nbneg,xmlChar ** values,int * terminal)4291 xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err,
4292                     int *nbval, int *nbneg,
4293 		    xmlChar **values, int *terminal) {
4294     int maxval;
4295     int nb = 0;
4296 
4297     if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) ||
4298         (values == NULL) || (*nbval <= 0))
4299         return(-1);
4300 
4301     maxval = *nbval;
4302     *nbval = 0;
4303     *nbneg = 0;
4304     if ((exec->comp != NULL) && (exec->comp->compact != NULL)) {
4305         xmlRegexpPtr comp;
4306 	int target, i, state;
4307 
4308         comp = exec->comp;
4309 
4310 	if (err) {
4311 	    if (exec->errStateNo == -1) return(-1);
4312 	    state = exec->errStateNo;
4313 	} else {
4314 	    state = exec->index;
4315 	}
4316 	if (terminal != NULL) {
4317 	    if (comp->compact[state * (comp->nbstrings + 1)] ==
4318 	        XML_REGEXP_FINAL_STATE)
4319 		*terminal = 1;
4320 	    else
4321 		*terminal = 0;
4322 	}
4323 	for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4324 	    target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4325 	    if ((target > 0) && (target <= comp->nbstates) &&
4326 	        (comp->compact[(target - 1) * (comp->nbstrings + 1)] !=
4327 		 XML_REGEXP_SINK_STATE)) {
4328 	        values[nb++] = comp->stringMap[i];
4329 		(*nbval)++;
4330 	    }
4331 	}
4332 	for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4333 	    target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4334 	    if ((target > 0) && (target <= comp->nbstates) &&
4335 	        (comp->compact[(target - 1) * (comp->nbstrings + 1)] ==
4336 		 XML_REGEXP_SINK_STATE)) {
4337 	        values[nb++] = comp->stringMap[i];
4338 		(*nbneg)++;
4339 	    }
4340 	}
4341     } else {
4342         int transno;
4343 	xmlRegTransPtr trans;
4344 	xmlRegAtomPtr atom;
4345 	xmlRegStatePtr state;
4346 
4347 	if (terminal != NULL) {
4348 	    if (exec->state->type == XML_REGEXP_FINAL_STATE)
4349 		*terminal = 1;
4350 	    else
4351 		*terminal = 0;
4352 	}
4353 
4354 	if (err) {
4355 	    if (exec->errState == NULL) return(-1);
4356 	    state = exec->errState;
4357 	} else {
4358 	    if (exec->state == NULL) return(-1);
4359 	    state = exec->state;
4360 	}
4361 	for (transno = 0;
4362 	     (transno < state->nbTrans) && (nb < maxval);
4363 	     transno++) {
4364 	    trans = &state->trans[transno];
4365 	    if (trans->to < 0)
4366 		continue;
4367 	    atom = trans->atom;
4368 	    if ((atom == NULL) || (atom->valuep == NULL))
4369 		continue;
4370 	    if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4371 	        /* this should not be reached but ... */
4372 	        TODO;
4373 	    } else if (trans->count == REGEXP_ALL_COUNTER) {
4374 	        /* this should not be reached but ... */
4375 	        TODO;
4376 	    } else if (trans->counter >= 0) {
4377 		xmlRegCounterPtr counter = NULL;
4378 		int count;
4379 
4380 		if (err)
4381 		    count = exec->errCounts[trans->counter];
4382 		else
4383 		    count = exec->counts[trans->counter];
4384 		if (exec->comp != NULL)
4385 		    counter = &exec->comp->counters[trans->counter];
4386 		if ((counter == NULL) || (count < counter->max)) {
4387 		    if (atom->neg)
4388 			values[nb++] = (xmlChar *) atom->valuep2;
4389 		    else
4390 			values[nb++] = (xmlChar *) atom->valuep;
4391 		    (*nbval)++;
4392 		}
4393 	    } else {
4394                 if ((exec->comp != NULL) && (exec->comp->states[trans->to] != NULL) &&
4395 		    (exec->comp->states[trans->to]->type !=
4396 		     XML_REGEXP_SINK_STATE)) {
4397 		    if (atom->neg)
4398 			values[nb++] = (xmlChar *) atom->valuep2;
4399 		    else
4400 			values[nb++] = (xmlChar *) atom->valuep;
4401 		    (*nbval)++;
4402 		}
4403 	    }
4404 	}
4405 	for (transno = 0;
4406 	     (transno < state->nbTrans) && (nb < maxval);
4407 	     transno++) {
4408 	    trans = &state->trans[transno];
4409 	    if (trans->to < 0)
4410 		continue;
4411 	    atom = trans->atom;
4412 	    if ((atom == NULL) || (atom->valuep == NULL))
4413 		continue;
4414 	    if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4415 	        continue;
4416 	    } else if (trans->count == REGEXP_ALL_COUNTER) {
4417 	        continue;
4418 	    } else if (trans->counter >= 0) {
4419 	        continue;
4420 	    } else {
4421                 if ((exec->comp->states[trans->to] != NULL) &&
4422 		    (exec->comp->states[trans->to]->type ==
4423 		     XML_REGEXP_SINK_STATE)) {
4424 		    if (atom->neg)
4425 			values[nb++] = (xmlChar *) atom->valuep2;
4426 		    else
4427 			values[nb++] = (xmlChar *) atom->valuep;
4428 		    (*nbneg)++;
4429 		}
4430 	    }
4431 	}
4432     }
4433     return(0);
4434 }
4435 
4436 /**
4437  * xmlRegExecNextValues:
4438  * @exec: a regexp execution context
4439  * @nbval: pointer to the number of accepted values IN/OUT
4440  * @nbneg: return number of negative transitions
4441  * @values: pointer to the array of acceptable values
4442  * @terminal: return value if this was a terminal state
4443  *
4444  * Extract information from the regexp execution,
4445  * the parameter @values must point to an array of @nbval string pointers
4446  * on return nbval will contain the number of possible strings in that
4447  * state and the @values array will be updated with them. The string values
4448  * returned will be freed with the @exec context and don't need to be
4449  * deallocated.
4450  *
4451  * Returns: 0 in case of success or -1 in case of error.
4452  */
4453 int
xmlRegExecNextValues(xmlRegExecCtxtPtr exec,int * nbval,int * nbneg,xmlChar ** values,int * terminal)4454 xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg,
4455                      xmlChar **values, int *terminal) {
4456     return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal));
4457 }
4458 
4459 /**
4460  * xmlRegExecErrInfo:
4461  * @exec: a regexp execution context generating an error
4462  * @string: return value for the error string
4463  * @nbval: pointer to the number of accepted values IN/OUT
4464  * @nbneg: return number of negative transitions
4465  * @values: pointer to the array of acceptable values
4466  * @terminal: return value if this was a terminal state
4467  *
4468  * Extract error information from the regexp execution, the parameter
4469  * @string will be updated with the value pushed and not accepted,
4470  * the parameter @values must point to an array of @nbval string pointers
4471  * on return nbval will contain the number of possible strings in that
4472  * state and the @values array will be updated with them. The string values
4473  * returned will be freed with the @exec context and don't need to be
4474  * deallocated.
4475  *
4476  * Returns: 0 in case of success or -1 in case of error.
4477  */
4478 int
xmlRegExecErrInfo(xmlRegExecCtxtPtr exec,const xmlChar ** string,int * nbval,int * nbneg,xmlChar ** values,int * terminal)4479 xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string,
4480                   int *nbval, int *nbneg, xmlChar **values, int *terminal) {
4481     if (exec == NULL)
4482         return(-1);
4483     if (string != NULL) {
4484         if (exec->status != 0)
4485 	    *string = exec->errString;
4486 	else
4487 	    *string = NULL;
4488     }
4489     return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal));
4490 }
4491 
4492 #ifdef DEBUG_ERR
/*
 * Debug helper (DEBUG_ERR builds only): runs xmlRegExecErrInfo() on
 * @exec with a small local result array and discards the outcome.
 */
static void testerr(xmlRegExecCtxtPtr exec) {
    const xmlChar *errToken;
    xmlChar *expected[5];
    int nbExpected = 5;
    int nbNegative;
    int isTerminal;

    xmlRegExecErrInfo(exec, &errToken, &nbExpected, &nbNegative,
                      expected, &isTerminal);
}
4501 #endif
4502 
4503 #if 0
4504 static int
4505 xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
4506     xmlRegTransPtr trans;
4507     xmlRegAtomPtr atom;
4508     int ret;
4509     int codepoint, len;
4510 
4511     if (exec == NULL)
4512 	return(-1);
4513     if (exec->status != 0)
4514 	return(exec->status);
4515 
4516     while ((exec->status == 0) &&
4517 	   ((exec->inputString[exec->index] != 0) ||
4518 	    (exec->state->type != XML_REGEXP_FINAL_STATE))) {
4519 
4520 	/*
4521 	 * End of input on non-terminal state, rollback, however we may
4522 	 * still have epsilon like transition for counted transitions
4523 	 * on counters, in that case don't break too early.
4524 	 */
4525 	if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
4526 	    goto rollback;
4527 
4528 	exec->transcount = 0;
4529 	for (;exec->transno < exec->state->nbTrans;exec->transno++) {
4530 	    trans = &exec->state->trans[exec->transno];
4531 	    if (trans->to < 0)
4532 		continue;
4533 	    atom = trans->atom;
4534 	    ret = 0;
4535 	    if (trans->count >= 0) {
4536 		int count;
4537 		xmlRegCounterPtr counter;
4538 
4539 		/*
4540 		 * A counted transition.
4541 		 */
4542 
4543 		count = exec->counts[trans->count];
4544 		counter = &exec->comp->counters[trans->count];
4545 #ifdef DEBUG_REGEXP_EXEC
4546 		printf("testing count %d: val %d, min %d, max %d\n",
4547 		       trans->count, count, counter->min,  counter->max);
4548 #endif
4549 		ret = ((count >= counter->min) && (count <= counter->max));
4550 	    } else if (atom == NULL) {
4551 		fprintf(stderr, "epsilon transition left at runtime\n");
4552 		exec->status = -2;
4553 		break;
4554 	    } else if (exec->inputString[exec->index] != 0) {
4555                 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
4556 		ret = xmlRegCheckCharacter(atom, codepoint);
4557 		if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
4558 		    xmlRegStatePtr to = exec->comp->states[trans->to];
4559 
4560 		    /*
4561 		     * this is a multiple input sequence
4562 		     */
4563 		    if (exec->state->nbTrans > exec->transno + 1) {
4564 			xmlFARegExecSave(exec);
4565 		    }
4566 		    exec->transcount = 1;
4567 		    do {
4568 			/*
4569 			 * Try to progress as much as possible on the input
4570 			 */
4571 			if (exec->transcount == atom->max) {
4572 			    break;
4573 			}
4574 			exec->index += len;
4575 			/*
4576 			 * End of input: stop here
4577 			 */
4578 			if (exec->inputString[exec->index] == 0) {
4579 			    exec->index -= len;
4580 			    break;
4581 			}
4582 			if (exec->transcount >= atom->min) {
4583 			    int transno = exec->transno;
4584 			    xmlRegStatePtr state = exec->state;
4585 
4586 			    /*
4587 			     * The transition is acceptable save it
4588 			     */
4589 			    exec->transno = -1; /* trick */
4590 			    exec->state = to;
4591 			    xmlFARegExecSave(exec);
4592 			    exec->transno = transno;
4593 			    exec->state = state;
4594 			}
4595 			codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
4596 				              len);
4597 			ret = xmlRegCheckCharacter(atom, codepoint);
4598 			exec->transcount++;
4599 		    } while (ret == 1);
4600 		    if (exec->transcount < atom->min)
4601 			ret = 0;
4602 
4603 		    /*
4604 		     * If the last check failed but one transition was found
4605 		     * possible, rollback
4606 		     */
4607 		    if (ret < 0)
4608 			ret = 0;
4609 		    if (ret == 0) {
4610 			goto rollback;
4611 		    }
4612 		}
4613 	    }
4614 	    if (ret == 1) {
4615 		if (exec->state->nbTrans > exec->transno + 1) {
4616 		    xmlFARegExecSave(exec);
4617 		}
4618 		/*
4619 		 * restart count for expressions like this ((abc){2})*
4620 		 */
4621 		if (trans->count >= 0) {
4622 #ifdef DEBUG_REGEXP_EXEC
4623 		    printf("Reset count %d\n", trans->count);
4624 #endif
4625 		    exec->counts[trans->count] = 0;
4626 		}
4627 		if (trans->counter >= 0) {
4628 #ifdef DEBUG_REGEXP_EXEC
4629 		    printf("Increasing count %d\n", trans->counter);
4630 #endif
4631 		    exec->counts[trans->counter]++;
4632 		}
4633 #ifdef DEBUG_REGEXP_EXEC
4634 		printf("entering state %d\n", trans->to);
4635 #endif
4636 		exec->state = exec->comp->states[trans->to];
4637 		exec->transno = 0;
4638 		if (trans->atom != NULL) {
4639 		    exec->index += len;
4640 		}
4641 		goto progress;
4642 	    } else if (ret < 0) {
4643 		exec->status = -4;
4644 		break;
4645 	    }
4646 	}
4647 	if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
4648 rollback:
4649 	    /*
4650 	     * Failed to find a way out
4651 	     */
4652 	    exec->determinist = 0;
4653 	    xmlFARegExecRollBack(exec);
4654 	}
4655 progress:
4656 	continue;
4657     }
4658 }
4659 #endif
4660 /************************************************************************
4661  *									*
4662  *	Parser for the Schemas Datatype Regular Expressions		*
4663  *	http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs	*
4664  *									*
4665  ************************************************************************/
4666 
4667 /**
4668  * xmlFAIsChar:
4669  * @ctxt:  a regexp parser context
4670  *
4671  * [10]   Char   ::=   [^.\?*+()|#x5B#x5D]
4672  */
4673 static int
xmlFAIsChar(xmlRegParserCtxtPtr ctxt)4674 xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
4675     int cur;
4676     int len;
4677 
4678     cur = CUR_SCHAR(ctxt->cur, len);
4679     if ((cur == '.') || (cur == '\\') || (cur == '?') ||
4680 	(cur == '*') || (cur == '+') || (cur == '(') ||
4681 	(cur == ')') || (cur == '|') || (cur == 0x5B) ||
4682 	(cur == 0x5D) || (cur == 0))
4683 	return(-1);
4684     return(cur);
4685 }
4686 
4687 /**
4688  * xmlFAParseCharProp:
4689  * @ctxt:  a regexp parser context
4690  *
4691  * [27]   charProp   ::=   IsCategory | IsBlock
4692  * [28]   IsCategory ::= Letters | Marks | Numbers | Punctuation |
4693  *                       Separators | Symbols | Others
4694  * [29]   Letters   ::=   'L' [ultmo]?
4695  * [30]   Marks   ::=   'M' [nce]?
4696  * [31]   Numbers   ::=   'N' [dlo]?
4697  * [32]   Punctuation   ::=   'P' [cdseifo]?
4698  * [33]   Separators   ::=   'Z' [slp]?
4699  * [34]   Symbols   ::=   'S' [mcko]?
4700  * [35]   Others   ::=   'C' [cfon]?
4701  * [36]   IsBlock   ::=   'Is' [a-zA-Z0-9#x2D]+
4702  */
4703 static void
xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt)4704 xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
4705     int cur;
4706     xmlRegAtomType type = (xmlRegAtomType) 0;
4707     xmlChar *blockName = NULL;
4708 
4709     cur = CUR;
4710     if (cur == 'L') {
4711 	NEXT;
4712 	cur = CUR;
4713 	if (cur == 'u') {
4714 	    NEXT;
4715 	    type = XML_REGEXP_LETTER_UPPERCASE;
4716 	} else if (cur == 'l') {
4717 	    NEXT;
4718 	    type = XML_REGEXP_LETTER_LOWERCASE;
4719 	} else if (cur == 't') {
4720 	    NEXT;
4721 	    type = XML_REGEXP_LETTER_TITLECASE;
4722 	} else if (cur == 'm') {
4723 	    NEXT;
4724 	    type = XML_REGEXP_LETTER_MODIFIER;
4725 	} else if (cur == 'o') {
4726 	    NEXT;
4727 	    type = XML_REGEXP_LETTER_OTHERS;
4728 	} else {
4729 	    type = XML_REGEXP_LETTER;
4730 	}
4731     } else if (cur == 'M') {
4732 	NEXT;
4733 	cur = CUR;
4734 	if (cur == 'n') {
4735 	    NEXT;
4736 	    /* nonspacing */
4737 	    type = XML_REGEXP_MARK_NONSPACING;
4738 	} else if (cur == 'c') {
4739 	    NEXT;
4740 	    /* spacing combining */
4741 	    type = XML_REGEXP_MARK_SPACECOMBINING;
4742 	} else if (cur == 'e') {
4743 	    NEXT;
4744 	    /* enclosing */
4745 	    type = XML_REGEXP_MARK_ENCLOSING;
4746 	} else {
4747 	    /* all marks */
4748 	    type = XML_REGEXP_MARK;
4749 	}
4750     } else if (cur == 'N') {
4751 	NEXT;
4752 	cur = CUR;
4753 	if (cur == 'd') {
4754 	    NEXT;
4755 	    /* digital */
4756 	    type = XML_REGEXP_NUMBER_DECIMAL;
4757 	} else if (cur == 'l') {
4758 	    NEXT;
4759 	    /* letter */
4760 	    type = XML_REGEXP_NUMBER_LETTER;
4761 	} else if (cur == 'o') {
4762 	    NEXT;
4763 	    /* other */
4764 	    type = XML_REGEXP_NUMBER_OTHERS;
4765 	} else {
4766 	    /* all numbers */
4767 	    type = XML_REGEXP_NUMBER;
4768 	}
4769     } else if (cur == 'P') {
4770 	NEXT;
4771 	cur = CUR;
4772 	if (cur == 'c') {
4773 	    NEXT;
4774 	    /* connector */
4775 	    type = XML_REGEXP_PUNCT_CONNECTOR;
4776 	} else if (cur == 'd') {
4777 	    NEXT;
4778 	    /* dash */
4779 	    type = XML_REGEXP_PUNCT_DASH;
4780 	} else if (cur == 's') {
4781 	    NEXT;
4782 	    /* open */
4783 	    type = XML_REGEXP_PUNCT_OPEN;
4784 	} else if (cur == 'e') {
4785 	    NEXT;
4786 	    /* close */
4787 	    type = XML_REGEXP_PUNCT_CLOSE;
4788 	} else if (cur == 'i') {
4789 	    NEXT;
4790 	    /* initial quote */
4791 	    type = XML_REGEXP_PUNCT_INITQUOTE;
4792 	} else if (cur == 'f') {
4793 	    NEXT;
4794 	    /* final quote */
4795 	    type = XML_REGEXP_PUNCT_FINQUOTE;
4796 	} else if (cur == 'o') {
4797 	    NEXT;
4798 	    /* other */
4799 	    type = XML_REGEXP_PUNCT_OTHERS;
4800 	} else {
4801 	    /* all punctuation */
4802 	    type = XML_REGEXP_PUNCT;
4803 	}
4804     } else if (cur == 'Z') {
4805 	NEXT;
4806 	cur = CUR;
4807 	if (cur == 's') {
4808 	    NEXT;
4809 	    /* space */
4810 	    type = XML_REGEXP_SEPAR_SPACE;
4811 	} else if (cur == 'l') {
4812 	    NEXT;
4813 	    /* line */
4814 	    type = XML_REGEXP_SEPAR_LINE;
4815 	} else if (cur == 'p') {
4816 	    NEXT;
4817 	    /* paragraph */
4818 	    type = XML_REGEXP_SEPAR_PARA;
4819 	} else {
4820 	    /* all separators */
4821 	    type = XML_REGEXP_SEPAR;
4822 	}
4823     } else if (cur == 'S') {
4824 	NEXT;
4825 	cur = CUR;
4826 	if (cur == 'm') {
4827 	    NEXT;
4828 	    type = XML_REGEXP_SYMBOL_MATH;
4829 	    /* math */
4830 	} else if (cur == 'c') {
4831 	    NEXT;
4832 	    type = XML_REGEXP_SYMBOL_CURRENCY;
4833 	    /* currency */
4834 	} else if (cur == 'k') {
4835 	    NEXT;
4836 	    type = XML_REGEXP_SYMBOL_MODIFIER;
4837 	    /* modifiers */
4838 	} else if (cur == 'o') {
4839 	    NEXT;
4840 	    type = XML_REGEXP_SYMBOL_OTHERS;
4841 	    /* other */
4842 	} else {
4843 	    /* all symbols */
4844 	    type = XML_REGEXP_SYMBOL;
4845 	}
4846     } else if (cur == 'C') {
4847 	NEXT;
4848 	cur = CUR;
4849 	if (cur == 'c') {
4850 	    NEXT;
4851 	    /* control */
4852 	    type = XML_REGEXP_OTHER_CONTROL;
4853 	} else if (cur == 'f') {
4854 	    NEXT;
4855 	    /* format */
4856 	    type = XML_REGEXP_OTHER_FORMAT;
4857 	} else if (cur == 'o') {
4858 	    NEXT;
4859 	    /* private use */
4860 	    type = XML_REGEXP_OTHER_PRIVATE;
4861 	} else if (cur == 'n') {
4862 	    NEXT;
4863 	    /* not assigned */
4864 	    type = XML_REGEXP_OTHER_NA;
4865 	} else {
4866 	    /* all others */
4867 	    type = XML_REGEXP_OTHER;
4868 	}
4869     } else if (cur == 'I') {
4870 	const xmlChar *start;
4871 	NEXT;
4872 	cur = CUR;
4873 	if (cur != 's') {
4874 	    ERROR("IsXXXX expected");
4875 	    return;
4876 	}
4877 	NEXT;
4878 	start = ctxt->cur;
4879 	cur = CUR;
4880 	if (((cur >= 'a') && (cur <= 'z')) ||
4881 	    ((cur >= 'A') && (cur <= 'Z')) ||
4882 	    ((cur >= '0') && (cur <= '9')) ||
4883 	    (cur == 0x2D)) {
4884 	    NEXT;
4885 	    cur = CUR;
4886 	    while (((cur >= 'a') && (cur <= 'z')) ||
4887 		((cur >= 'A') && (cur <= 'Z')) ||
4888 		((cur >= '0') && (cur <= '9')) ||
4889 		(cur == 0x2D)) {
4890 		NEXT;
4891 		cur = CUR;
4892 	    }
4893 	}
4894 	type = XML_REGEXP_BLOCK_NAME;
4895 	blockName = xmlStrndup(start, ctxt->cur - start);
4896     } else {
4897 	ERROR("Unknown char property");
4898 	return;
4899     }
4900     if (ctxt->atom == NULL) {
4901 	ctxt->atom = xmlRegNewAtom(ctxt, type);
4902 	if (ctxt->atom != NULL)
4903 	    ctxt->atom->valuep = blockName;
4904     } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4905         xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4906 		           type, 0, 0, blockName);
4907     }
4908 }
4909 
4910 /**
4911  * xmlFAParseCharClassEsc:
4912  * @ctxt:  a regexp parser context
4913  *
4914  * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
4915  * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
4916  * [25] catEsc   ::=   '\p{' charProp '}'
4917  * [26] complEsc ::=   '\P{' charProp '}'
4918  * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
4919  */
4920 static void
xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt)4921 xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
4922     int cur;
4923 
4924     if (CUR == '.') {
4925 	if (ctxt->atom == NULL) {
4926 	    ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
4927 	} else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4928 	    xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4929 			       XML_REGEXP_ANYCHAR, 0, 0, NULL);
4930 	}
4931 	NEXT;
4932 	return;
4933     }
4934     if (CUR != '\\') {
4935 	ERROR("Escaped sequence: expecting \\");
4936 	return;
4937     }
4938     NEXT;
4939     cur = CUR;
4940     if (cur == 'p') {
4941 	NEXT;
4942 	if (CUR != '{') {
4943 	    ERROR("Expecting '{'");
4944 	    return;
4945 	}
4946 	NEXT;
4947 	xmlFAParseCharProp(ctxt);
4948 	if (CUR != '}') {
4949 	    ERROR("Expecting '}'");
4950 	    return;
4951 	}
4952 	NEXT;
4953     } else if (cur == 'P') {
4954 	NEXT;
4955 	if (CUR != '{') {
4956 	    ERROR("Expecting '{'");
4957 	    return;
4958 	}
4959 	NEXT;
4960 	xmlFAParseCharProp(ctxt);
4961         if (ctxt->atom != NULL)
4962 	    ctxt->atom->neg = 1;
4963 	if (CUR != '}') {
4964 	    ERROR("Expecting '}'");
4965 	    return;
4966 	}
4967 	NEXT;
4968     } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
4969 	(cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
4970 	(cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
4971 	(cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
4972 	(cur == 0x5E)) {
4973 	if (ctxt->atom == NULL) {
4974 	    ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
4975 	    if (ctxt->atom != NULL) {
4976 	        switch (cur) {
4977 		    case 'n':
4978 		        ctxt->atom->codepoint = '\n';
4979 			break;
4980 		    case 'r':
4981 		        ctxt->atom->codepoint = '\r';
4982 			break;
4983 		    case 't':
4984 		        ctxt->atom->codepoint = '\t';
4985 			break;
4986 		    default:
4987 			ctxt->atom->codepoint = cur;
4988 		}
4989 	    }
4990 	} else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4991             switch (cur) {
4992                 case 'n':
4993                     cur = '\n';
4994                     break;
4995                 case 'r':
4996                     cur = '\r';
4997                     break;
4998                 case 't':
4999                     cur = '\t';
5000                     break;
5001             }
5002 	    xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5003 			       XML_REGEXP_CHARVAL, cur, cur, NULL);
5004 	}
5005 	NEXT;
5006     } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
5007 	(cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
5008 	(cur == 'w') || (cur == 'W')) {
5009 	xmlRegAtomType type = XML_REGEXP_ANYSPACE;
5010 
5011 	switch (cur) {
5012 	    case 's':
5013 		type = XML_REGEXP_ANYSPACE;
5014 		break;
5015 	    case 'S':
5016 		type = XML_REGEXP_NOTSPACE;
5017 		break;
5018 	    case 'i':
5019 		type = XML_REGEXP_INITNAME;
5020 		break;
5021 	    case 'I':
5022 		type = XML_REGEXP_NOTINITNAME;
5023 		break;
5024 	    case 'c':
5025 		type = XML_REGEXP_NAMECHAR;
5026 		break;
5027 	    case 'C':
5028 		type = XML_REGEXP_NOTNAMECHAR;
5029 		break;
5030 	    case 'd':
5031 		type = XML_REGEXP_DECIMAL;
5032 		break;
5033 	    case 'D':
5034 		type = XML_REGEXP_NOTDECIMAL;
5035 		break;
5036 	    case 'w':
5037 		type = XML_REGEXP_REALCHAR;
5038 		break;
5039 	    case 'W':
5040 		type = XML_REGEXP_NOTREALCHAR;
5041 		break;
5042 	}
5043 	NEXT;
5044 	if (ctxt->atom == NULL) {
5045 	    ctxt->atom = xmlRegNewAtom(ctxt, type);
5046 	} else if (ctxt->atom->type == XML_REGEXP_RANGES) {
5047 	    xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5048 			       type, 0, 0, NULL);
5049 	}
5050     } else {
5051 	ERROR("Wrong escape sequence, misuse of character '\\'");
5052     }
5053 }
5054 
5055 /**
5056  * xmlFAParseCharRange:
5057  * @ctxt:  a regexp parser context
5058  *
5059  * [17]   charRange   ::=     seRange | XmlCharRef | XmlCharIncDash
5060  * [18]   seRange   ::=   charOrEsc '-' charOrEsc
5061  * [20]   charOrEsc   ::=   XmlChar | SingleCharEsc
5062  * [21]   XmlChar   ::=   [^\#x2D#x5B#x5D]
5063  * [22]   XmlCharIncDash   ::=   [^\#x5B#x5D]
5064  */
5065 static void
xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt)5066 xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
5067     int cur, len;
5068     int start = -1;
5069     int end = -1;
5070 
5071     if (CUR == '\0') {
5072         ERROR("Expecting ']'");
5073 	return;
5074     }
5075 
5076     cur = CUR;
5077     if (cur == '\\') {
5078 	NEXT;
5079 	cur = CUR;
5080 	switch (cur) {
5081 	    case 'n': start = 0xA; break;
5082 	    case 'r': start = 0xD; break;
5083 	    case 't': start = 0x9; break;
5084 	    case '\\': case '|': case '.': case '-': case '^': case '?':
5085 	    case '*': case '+': case '{': case '}': case '(': case ')':
5086 	    case '[': case ']':
5087 		start = cur; break;
5088 	    default:
5089 		ERROR("Invalid escape value");
5090 		return;
5091 	}
5092 	end = start;
5093         len = 1;
5094     } else if ((cur != 0x5B) && (cur != 0x5D)) {
5095         end = start = CUR_SCHAR(ctxt->cur, len);
5096     } else {
5097 	ERROR("Expecting a char range");
5098 	return;
5099     }
5100     /*
5101      * Since we are "inside" a range, we can assume ctxt->cur is past
5102      * the start of ctxt->string, and PREV should be safe
5103      */
5104     if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) {
5105 	NEXTL(len);
5106 	return;
5107     }
5108     NEXTL(len);
5109     cur = CUR;
5110     if ((cur != '-') || (NXT(1) == ']')) {
5111         xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5112 		              XML_REGEXP_CHARVAL, start, end, NULL);
5113 	return;
5114     }
5115     NEXT;
5116     cur = CUR;
5117     if (cur == '\\') {
5118 	NEXT;
5119 	cur = CUR;
5120 	switch (cur) {
5121 	    case 'n': end = 0xA; break;
5122 	    case 'r': end = 0xD; break;
5123 	    case 't': end = 0x9; break;
5124 	    case '\\': case '|': case '.': case '-': case '^': case '?':
5125 	    case '*': case '+': case '{': case '}': case '(': case ')':
5126 	    case '[': case ']':
5127 		end = cur; break;
5128 	    default:
5129 		ERROR("Invalid escape value");
5130 		return;
5131 	}
5132         len = 1;
5133     } else if ((cur != '\0') && (cur != 0x5B) && (cur != 0x5D)) {
5134         end = CUR_SCHAR(ctxt->cur, len);
5135     } else {
5136 	ERROR("Expecting the end of a char range");
5137 	return;
5138     }
5139 
5140     /* TODO check that the values are acceptable character ranges for XML */
5141     if (end < start) {
5142 	ERROR("End of range is before start of range");
5143     } else {
5144         NEXTL(len);
5145         xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5146 		           XML_REGEXP_CHARVAL, start, end, NULL);
5147     }
5148     return;
5149 }
5150 
5151 /**
5152  * xmlFAParsePosCharGroup:
5153  * @ctxt:  a regexp parser context
5154  *
5155  * [14]   posCharGroup ::= ( charRange | charClassEsc  )+
5156  */
5157 static void
xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt)5158 xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
5159     do {
5160 	if (CUR == '\\') {
5161 	    xmlFAParseCharClassEsc(ctxt);
5162 	} else {
5163 	    xmlFAParseCharRange(ctxt);
5164 	}
5165     } while ((CUR != ']') && (CUR != '-') &&
5166              (CUR != 0) && (ctxt->error == 0));
5167 }
5168 
5169 /**
5170  * xmlFAParseCharGroup:
5171  * @ctxt:  a regexp parser context
5172  *
5173  * [13]   charGroup    ::= posCharGroup | negCharGroup | charClassSub
5174  * [15]   negCharGroup ::= '^' posCharGroup
5175  * [16]   charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
5176  * [12]   charClassExpr ::= '[' charGroup ']'
5177  */
5178 static void
xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt)5179 xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
5180     int neg = ctxt->neg;
5181 
5182     if (CUR == '^') {
5183 	NEXT;
5184 	ctxt->neg = !ctxt->neg;
5185 	xmlFAParsePosCharGroup(ctxt);
5186 	ctxt->neg = neg;
5187     }
5188     while ((CUR != ']') && (ctxt->error == 0)) {
5189 	if ((CUR == '-') && (NXT(1) == '[')) {
5190 	    NEXT;	/* eat the '-' */
5191 	    NEXT;	/* eat the '[' */
5192 	    ctxt->neg = 2;
5193 	    xmlFAParseCharGroup(ctxt);
5194 	    ctxt->neg = neg;
5195 	    if (CUR == ']') {
5196 		NEXT;
5197 	    } else {
5198 		ERROR("charClassExpr: ']' expected");
5199 	    }
5200 	    break;
5201 	} else {
5202 	    xmlFAParsePosCharGroup(ctxt);
5203 	}
5204     }
5205 }
5206 
5207 /**
5208  * xmlFAParseCharClass:
5209  * @ctxt:  a regexp parser context
5210  *
5211  * [11]   charClass   ::=     charClassEsc | charClassExpr
5212  * [12]   charClassExpr   ::=   '[' charGroup ']'
5213  */
5214 static void
xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt)5215 xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
5216     if (CUR == '[') {
5217 	NEXT;
5218 	ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
5219 	if (ctxt->atom == NULL)
5220 	    return;
5221 	xmlFAParseCharGroup(ctxt);
5222 	if (CUR == ']') {
5223 	    NEXT;
5224 	} else {
5225 	    ERROR("xmlFAParseCharClass: ']' expected");
5226 	}
5227     } else {
5228 	xmlFAParseCharClassEsc(ctxt);
5229     }
5230 }
5231 
5232 /**
5233  * xmlFAParseQuantExact:
5234  * @ctxt:  a regexp parser context
5235  *
5236  * [8]   QuantExact   ::=   [0-9]+
5237  *
5238  * Returns 0 if success or -1 in case of error
5239  */
5240 static int
xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt)5241 xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
5242     int ret = 0;
5243     int ok = 0;
5244     int overflow = 0;
5245 
5246     while ((CUR >= '0') && (CUR <= '9')) {
5247         if (ret > INT_MAX / 10) {
5248             overflow = 1;
5249         } else {
5250             int digit = CUR - '0';
5251 
5252             ret *= 10;
5253             if (ret > INT_MAX - digit)
5254                 overflow = 1;
5255             else
5256                 ret += digit;
5257         }
5258 	ok = 1;
5259 	NEXT;
5260     }
5261     if ((ok != 1) || (overflow == 1)) {
5262 	return(-1);
5263     }
5264     return(ret);
5265 }
5266 
5267 /**
5268  * xmlFAParseQuantifier:
5269  * @ctxt:  a regexp parser context
5270  *
5271  * [4]   quantifier   ::=   [?*+] | ( '{' quantity '}' )
5272  * [5]   quantity   ::=   quantRange | quantMin | QuantExact
5273  * [6]   quantRange   ::=   QuantExact ',' QuantExact
5274  * [7]   quantMin   ::=   QuantExact ','
5275  * [8]   QuantExact   ::=   [0-9]+
5276  */
5277 static int
xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt)5278 xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
5279     int cur;
5280 
5281     cur = CUR;
5282     if ((cur == '?') || (cur == '*') || (cur == '+')) {
5283 	if (ctxt->atom != NULL) {
5284 	    if (cur == '?')
5285 		ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
5286 	    else if (cur == '*')
5287 		ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
5288 	    else if (cur == '+')
5289 		ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
5290 	}
5291 	NEXT;
5292 	return(1);
5293     }
5294     if (cur == '{') {
5295 	int min = 0, max = 0;
5296 
5297 	NEXT;
5298 	cur = xmlFAParseQuantExact(ctxt);
5299 	if (cur >= 0)
5300 	    min = cur;
5301         else {
5302             ERROR("Improper quantifier");
5303         }
5304 	if (CUR == ',') {
5305 	    NEXT;
5306 	    if (CUR == '}')
5307 	        max = INT_MAX;
5308 	    else {
5309 	        cur = xmlFAParseQuantExact(ctxt);
5310 	        if (cur >= 0)
5311 		    max = cur;
5312 		else {
5313 		    ERROR("Improper quantifier");
5314 		}
5315 	    }
5316 	}
5317 	if (CUR == '}') {
5318 	    NEXT;
5319 	} else {
5320 	    ERROR("Unterminated quantifier");
5321 	}
5322 	if (max == 0)
5323 	    max = min;
5324 	if (ctxt->atom != NULL) {
5325 	    ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
5326 	    ctxt->atom->min = min;
5327 	    ctxt->atom->max = max;
5328 	}
5329 	return(1);
5330     }
5331     return(0);
5332 }
5333 
5334 /**
5335  * xmlFAParseAtom:
5336  * @ctxt:  a regexp parser context
5337  *
5338  * [9]   atom   ::=   Char | charClass | ( '(' regExp ')' )
5339  */
5340 static int
xmlFAParseAtom(xmlRegParserCtxtPtr ctxt)5341 xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
5342     int codepoint, len;
5343 
5344     codepoint = xmlFAIsChar(ctxt);
5345     if (codepoint > 0) {
5346 	ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
5347 	if (ctxt->atom == NULL)
5348 	    return(-1);
5349 	codepoint = CUR_SCHAR(ctxt->cur, len);
5350 	ctxt->atom->codepoint = codepoint;
5351 	NEXTL(len);
5352 	return(1);
5353     } else if (CUR == '|') {
5354 	return(0);
5355     } else if (CUR == 0) {
5356 	return(0);
5357     } else if (CUR == ')') {
5358 	return(0);
5359     } else if (CUR == '(') {
5360 	xmlRegStatePtr start, oldend, start0;
5361 
5362 	NEXT;
5363         if (ctxt->depth >= 50) {
5364 	    ERROR("xmlFAParseAtom: maximum nesting depth exceeded");
5365             return(-1);
5366         }
5367 	/*
5368 	 * this extra Epsilon transition is needed if we count with 0 allowed
5369 	 * unfortunately this can't be known at that point
5370 	 */
5371 	xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5372 	start0 = ctxt->state;
5373 	xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5374 	start = ctxt->state;
5375 	oldend = ctxt->end;
5376 	ctxt->end = NULL;
5377 	ctxt->atom = NULL;
5378         ctxt->depth++;
5379 	xmlFAParseRegExp(ctxt, 0);
5380         ctxt->depth--;
5381 	if (CUR == ')') {
5382 	    NEXT;
5383 	} else {
5384 	    ERROR("xmlFAParseAtom: expecting ')'");
5385 	}
5386 	ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
5387 	if (ctxt->atom == NULL)
5388 	    return(-1);
5389 	ctxt->atom->start = start;
5390 	ctxt->atom->start0 = start0;
5391 	ctxt->atom->stop = ctxt->state;
5392 	ctxt->end = oldend;
5393 	return(1);
5394     } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
5395 	xmlFAParseCharClass(ctxt);
5396 	return(1);
5397     }
5398     return(0);
5399 }
5400 
5401 /**
5402  * xmlFAParsePiece:
5403  * @ctxt:  a regexp parser context
5404  *
5405  * [3]   piece   ::=   atom quantifier?
5406  */
5407 static int
xmlFAParsePiece(xmlRegParserCtxtPtr ctxt)5408 xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
5409     int ret;
5410 
5411     ctxt->atom = NULL;
5412     ret = xmlFAParseAtom(ctxt);
5413     if (ret == 0)
5414 	return(0);
5415     if (ctxt->atom == NULL) {
5416 	ERROR("internal: no atom generated");
5417     }
5418     xmlFAParseQuantifier(ctxt);
5419     return(1);
5420 }
5421 
5422 /**
5423  * xmlFAParseBranch:
5424  * @ctxt:  a regexp parser context
5425  * @to: optional target to the end of the branch
5426  *
5427  * @to is used to optimize by removing duplicate path in automata
5428  * in expressions like (a|b)(c|d)
5429  *
5430  * [2]   branch   ::=   piece*
5431  */
5432 static int
xmlFAParseBranch(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr to)5433 xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) {
5434     xmlRegStatePtr previous;
5435     int ret;
5436 
5437     previous = ctxt->state;
5438     ret = xmlFAParsePiece(ctxt);
5439     if (ret == 0) {
5440         /* Empty branch */
5441 	xmlFAGenerateEpsilonTransition(ctxt, previous, to);
5442     } else {
5443 	if (xmlFAGenerateTransitions(ctxt, previous,
5444 	        (CUR=='|' || CUR==')' || CUR==0) ? to : NULL, ctxt->atom) < 0)
5445 	    return(-1);
5446 	previous = ctxt->state;
5447 	ctxt->atom = NULL;
5448     }
5449     while ((ret != 0) && (ctxt->error == 0)) {
5450 	ret = xmlFAParsePiece(ctxt);
5451 	if (ret != 0) {
5452 	    if (xmlFAGenerateTransitions(ctxt, previous,
5453 	            (CUR=='|' || CUR==')' || CUR==0) ? to : NULL,
5454                     ctxt->atom) < 0)
5455 		    return(-1);
5456 	    previous = ctxt->state;
5457 	    ctxt->atom = NULL;
5458 	}
5459     }
5460     return(0);
5461 }
5462 
5463 /**
5464  * xmlFAParseRegExp:
5465  * @ctxt:  a regexp parser context
5466  * @top:  is this the top-level expression ?
5467  *
5468  * [1]   regExp   ::=     branch  ( '|' branch )*
5469  */
5470 static void
xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt,int top)5471 xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
5472     xmlRegStatePtr start, end;
5473 
5474     /* if not top start should have been generated by an epsilon trans */
5475     start = ctxt->state;
5476     ctxt->end = NULL;
5477     xmlFAParseBranch(ctxt, NULL);
5478     if (top) {
5479 #ifdef DEBUG_REGEXP_GRAPH
5480 	printf("State %d is final\n", ctxt->state->no);
5481 #endif
5482 	ctxt->state->type = XML_REGEXP_FINAL_STATE;
5483     }
5484     if (CUR != '|') {
5485 	ctxt->end = ctxt->state;
5486 	return;
5487     }
5488     end = ctxt->state;
5489     while ((CUR == '|') && (ctxt->error == 0)) {
5490 	NEXT;
5491 	ctxt->state = start;
5492 	ctxt->end = NULL;
5493 	xmlFAParseBranch(ctxt, end);
5494     }
5495     if (!top) {
5496 	ctxt->state = end;
5497 	ctxt->end = end;
5498     }
5499 }
5500 
5501 /************************************************************************
5502  *									*
5503  *			The basic API					*
5504  *									*
5505  ************************************************************************/
5506 
5507 /**
5508  * xmlRegexpPrint:
5509  * @output: the file for the output debug
5510  * @regexp: the compiled regexp
5511  *
5512  * Print the content of the compiled regular expression
5513  */
5514 void
xmlRegexpPrint(FILE * output,xmlRegexpPtr regexp)5515 xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
5516     int i;
5517 
5518     if (output == NULL)
5519         return;
5520     fprintf(output, " regexp: ");
5521     if (regexp == NULL) {
5522 	fprintf(output, "NULL\n");
5523 	return;
5524     }
5525     fprintf(output, "'%s' ", regexp->string);
5526     fprintf(output, "\n");
5527     fprintf(output, "%d atoms:\n", regexp->nbAtoms);
5528     for (i = 0;i < regexp->nbAtoms; i++) {
5529 	fprintf(output, " %02d ", i);
5530 	xmlRegPrintAtom(output, regexp->atoms[i]);
5531     }
5532     fprintf(output, "%d states:", regexp->nbStates);
5533     fprintf(output, "\n");
5534     for (i = 0;i < regexp->nbStates; i++) {
5535 	xmlRegPrintState(output, regexp->states[i]);
5536     }
5537     fprintf(output, "%d counters:\n", regexp->nbCounters);
5538     for (i = 0;i < regexp->nbCounters; i++) {
5539 	fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
5540 		                                regexp->counters[i].max);
5541     }
5542 }
5543 
5544 /**
5545  * xmlRegexpCompile:
5546  * @regexp:  a regular expression string
5547  *
5548  * Parses a regular expression conforming to XML Schemas Part 2 Datatype
5549  * Appendix F and builds an automata suitable for testing strings against
5550  * that regular expression
5551  *
5552  * Returns the compiled expression or NULL in case of error
5553  */
5554 xmlRegexpPtr
xmlRegexpCompile(const xmlChar * regexp)5555 xmlRegexpCompile(const xmlChar *regexp) {
5556     xmlRegexpPtr ret;
5557     xmlRegParserCtxtPtr ctxt;
5558 
5559     ctxt = xmlRegNewParserCtxt(regexp);
5560     if (ctxt == NULL)
5561 	return(NULL);
5562 
5563     /* initialize the parser */
5564     ctxt->end = NULL;
5565     ctxt->start = ctxt->state = xmlRegNewState(ctxt);
5566     xmlRegStatePush(ctxt, ctxt->start);
5567 
5568     /* parse the expression building an automata */
5569     xmlFAParseRegExp(ctxt, 1);
5570     if (CUR != 0) {
5571 	ERROR("xmlFAParseRegExp: extra characters");
5572     }
5573     if (ctxt->error != 0) {
5574 	xmlRegFreeParserCtxt(ctxt);
5575 	return(NULL);
5576     }
5577     ctxt->end = ctxt->state;
5578     ctxt->start->type = XML_REGEXP_START_STATE;
5579     ctxt->end->type = XML_REGEXP_FINAL_STATE;
5580 
5581     /* remove the Epsilon except for counted transitions */
5582     xmlFAEliminateEpsilonTransitions(ctxt);
5583 
5584 
5585     if (ctxt->error != 0) {
5586 	xmlRegFreeParserCtxt(ctxt);
5587 	return(NULL);
5588     }
5589     ret = xmlRegEpxFromParse(ctxt);
5590     xmlRegFreeParserCtxt(ctxt);
5591     return(ret);
5592 }
5593 
5594 /**
5595  * xmlRegexpExec:
5596  * @comp:  the compiled regular expression
5597  * @content:  the value to check against the regular expression
5598  *
5599  * Check if the regular expression generates the value
5600  *
5601  * Returns 1 if it matches, 0 if not and a negative value in case of error
5602  */
5603 int
xmlRegexpExec(xmlRegexpPtr comp,const xmlChar * content)5604 xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
5605     if ((comp == NULL) || (content == NULL))
5606 	return(-1);
5607     return(xmlFARegExec(comp, content));
5608 }
5609 
5610 /**
5611  * xmlRegexpIsDeterminist:
5612  * @comp:  the compiled regular expression
5613  *
5614  * Check if the regular expression is determinist
5615  *
5616  * Returns 1 if it yes, 0 if not and a negative value in case of error
5617  */
5618 int
xmlRegexpIsDeterminist(xmlRegexpPtr comp)5619 xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
5620     xmlAutomataPtr am;
5621     int ret;
5622 
5623     if (comp == NULL)
5624 	return(-1);
5625     if (comp->determinist != -1)
5626 	return(comp->determinist);
5627 
5628     am = xmlNewAutomata();
5629     if (am == NULL)
5630         return(-1);
5631     if (am->states != NULL) {
5632 	int i;
5633 
5634 	for (i = 0;i < am->nbStates;i++)
5635 	    xmlRegFreeState(am->states[i]);
5636 	xmlFree(am->states);
5637     }
5638     am->nbAtoms = comp->nbAtoms;
5639     am->atoms = comp->atoms;
5640     am->nbStates = comp->nbStates;
5641     am->states = comp->states;
5642     am->determinist = -1;
5643     am->flags = comp->flags;
5644     ret = xmlFAComputesDeterminism(am);
5645     am->atoms = NULL;
5646     am->states = NULL;
5647     xmlFreeAutomata(am);
5648     comp->determinist = ret;
5649     return(ret);
5650 }
5651 
5652 /**
5653  * xmlRegFreeRegexp:
5654  * @regexp:  the regexp
5655  *
5656  * Free a regexp
5657  */
5658 void
xmlRegFreeRegexp(xmlRegexpPtr regexp)5659 xmlRegFreeRegexp(xmlRegexpPtr regexp) {
5660     int i;
5661     if (regexp == NULL)
5662 	return;
5663 
5664     if (regexp->string != NULL)
5665 	xmlFree(regexp->string);
5666     if (regexp->states != NULL) {
5667 	for (i = 0;i < regexp->nbStates;i++)
5668 	    xmlRegFreeState(regexp->states[i]);
5669 	xmlFree(regexp->states);
5670     }
5671     if (regexp->atoms != NULL) {
5672 	for (i = 0;i < regexp->nbAtoms;i++)
5673 	    xmlRegFreeAtom(regexp->atoms[i]);
5674 	xmlFree(regexp->atoms);
5675     }
5676     if (regexp->counters != NULL)
5677 	xmlFree(regexp->counters);
5678     if (regexp->compact != NULL)
5679 	xmlFree(regexp->compact);
5680     if (regexp->transdata != NULL)
5681 	xmlFree(regexp->transdata);
5682     if (regexp->stringMap != NULL) {
5683 	for (i = 0; i < regexp->nbstrings;i++)
5684 	    xmlFree(regexp->stringMap[i]);
5685 	xmlFree(regexp->stringMap);
5686     }
5687 
5688     xmlFree(regexp);
5689 }
5690 
5691 #ifdef LIBXML_AUTOMATA_ENABLED
5692 /************************************************************************
5693  *									*
5694  *			The Automata interface				*
5695  *									*
5696  ************************************************************************/
5697 
5698 /**
5699  * xmlNewAutomata:
5700  *
5701  * Create a new automata
5702  *
5703  * Returns the new object or NULL in case of failure
5704  */
5705 xmlAutomataPtr
xmlNewAutomata(void)5706 xmlNewAutomata(void) {
5707     xmlAutomataPtr ctxt;
5708 
5709     ctxt = xmlRegNewParserCtxt(NULL);
5710     if (ctxt == NULL)
5711 	return(NULL);
5712 
5713     /* initialize the parser */
5714     ctxt->end = NULL;
5715     ctxt->start = ctxt->state = xmlRegNewState(ctxt);
5716     if (ctxt->start == NULL) {
5717 	xmlFreeAutomata(ctxt);
5718 	return(NULL);
5719     }
5720     ctxt->start->type = XML_REGEXP_START_STATE;
5721     if (xmlRegStatePush(ctxt, ctxt->start) < 0) {
5722         xmlRegFreeState(ctxt->start);
5723 	xmlFreeAutomata(ctxt);
5724 	return(NULL);
5725     }
5726     ctxt->flags = 0;
5727 
5728     return(ctxt);
5729 }
5730 
5731 /**
5732  * xmlFreeAutomata:
5733  * @am: an automata
5734  *
5735  * Free an automata
5736  */
5737 void
xmlFreeAutomata(xmlAutomataPtr am)5738 xmlFreeAutomata(xmlAutomataPtr am) {
5739     if (am == NULL)
5740 	return;
5741     xmlRegFreeParserCtxt(am);
5742 }
5743 
5744 /**
5745  * xmlAutomataSetFlags:
5746  * @am: an automata
5747  * @flags:  a set of internal flags
5748  *
5749  * Set some flags on the automata
5750  */
5751 void
xmlAutomataSetFlags(xmlAutomataPtr am,int flags)5752 xmlAutomataSetFlags(xmlAutomataPtr am, int flags) {
5753     if (am == NULL)
5754 	return;
5755     am->flags |= flags;
5756 }
5757 
5758 /**
5759  * xmlAutomataGetInitState:
5760  * @am: an automata
5761  *
5762  * Initial state lookup
5763  *
5764  * Returns the initial state of the automata
5765  */
5766 xmlAutomataStatePtr
xmlAutomataGetInitState(xmlAutomataPtr am)5767 xmlAutomataGetInitState(xmlAutomataPtr am) {
5768     if (am == NULL)
5769 	return(NULL);
5770     return(am->start);
5771 }
5772 
5773 /**
5774  * xmlAutomataSetFinalState:
5775  * @am: an automata
5776  * @state: a state in this automata
5777  *
5778  * Makes that state a final state
5779  *
5780  * Returns 0 or -1 in case of error
5781  */
5782 int
xmlAutomataSetFinalState(xmlAutomataPtr am,xmlAutomataStatePtr state)5783 xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
5784     if ((am == NULL) || (state == NULL))
5785 	return(-1);
5786     state->type = XML_REGEXP_FINAL_STATE;
5787     return(0);
5788 }
5789 
5790 /**
5791  * xmlAutomataNewTransition:
5792  * @am: an automata
5793  * @from: the starting point of the transition
5794  * @to: the target point of the transition or NULL
5795  * @token: the input string associated to that transition
5796  * @data: data passed to the callback function if the transition is activated
5797  *
5798  * If @to is NULL, this creates first a new target state in the automata
5799  * and then adds a transition from the @from state to the target state
5800  * activated by the value of @token
5801  *
5802  * Returns the target state or NULL in case of error
5803  */
5804 xmlAutomataStatePtr
xmlAutomataNewTransition(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,void * data)5805 xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
5806 			 xmlAutomataStatePtr to, const xmlChar *token,
5807 			 void *data) {
5808     xmlRegAtomPtr atom;
5809 
5810     if ((am == NULL) || (from == NULL) || (token == NULL))
5811 	return(NULL);
5812     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5813     if (atom == NULL)
5814         return(NULL);
5815     atom->data = data;
5816     atom->valuep = xmlStrdup(token);
5817 
5818     if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5819         xmlRegFreeAtom(atom);
5820 	return(NULL);
5821     }
5822     if (to == NULL)
5823 	return(am->state);
5824     return(to);
5825 }
5826 
5827 /**
5828  * xmlAutomataNewTransition2:
5829  * @am: an automata
5830  * @from: the starting point of the transition
5831  * @to: the target point of the transition or NULL
5832  * @token: the first input string associated to that transition
5833  * @token2: the second input string associated to that transition
5834  * @data: data passed to the callback function if the transition is activated
5835  *
5836  * If @to is NULL, this creates first a new target state in the automata
5837  * and then adds a transition from the @from state to the target state
5838  * activated by the value of @token
5839  *
5840  * Returns the target state or NULL in case of error
5841  */
5842 xmlAutomataStatePtr
xmlAutomataNewTransition2(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,void * data)5843 xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5844 			  xmlAutomataStatePtr to, const xmlChar *token,
5845 			  const xmlChar *token2, void *data) {
5846     xmlRegAtomPtr atom;
5847 
5848     if ((am == NULL) || (from == NULL) || (token == NULL))
5849 	return(NULL);
5850     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5851     if (atom == NULL)
5852 	return(NULL);
5853     atom->data = data;
5854     if ((token2 == NULL) || (*token2 == 0)) {
5855 	atom->valuep = xmlStrdup(token);
5856     } else {
5857 	int lenn, lenp;
5858 	xmlChar *str;
5859 
5860 	lenn = strlen((char *) token2);
5861 	lenp = strlen((char *) token);
5862 
5863 	str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5864 	if (str == NULL) {
5865 	    xmlRegFreeAtom(atom);
5866 	    return(NULL);
5867 	}
5868 	memcpy(&str[0], token, lenp);
5869 	str[lenp] = '|';
5870 	memcpy(&str[lenp + 1], token2, lenn);
5871 	str[lenn + lenp + 1] = 0;
5872 
5873 	atom->valuep = str;
5874     }
5875 
5876     if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5877         xmlRegFreeAtom(atom);
5878 	return(NULL);
5879     }
5880     if (to == NULL)
5881 	return(am->state);
5882     return(to);
5883 }
5884 
5885 /**
5886  * xmlAutomataNewNegTrans:
5887  * @am: an automata
5888  * @from: the starting point of the transition
5889  * @to: the target point of the transition or NULL
5890  * @token: the first input string associated to that transition
5891  * @token2: the second input string associated to that transition
5892  * @data: data passed to the callback function if the transition is activated
5893  *
5894  * If @to is NULL, this creates first a new target state in the automata
5895  * and then adds a transition from the @from state to the target state
5896  * activated by any value except (@token,@token2)
5897  * Note that if @token2 is not NULL, then (X, NULL) won't match to follow
5898  # the semantic of XSD ##other
5899  *
5900  * Returns the target state or NULL in case of error
5901  */
5902 xmlAutomataStatePtr
xmlAutomataNewNegTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,void * data)5903 xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5904 		       xmlAutomataStatePtr to, const xmlChar *token,
5905 		       const xmlChar *token2, void *data) {
5906     xmlRegAtomPtr atom;
5907     xmlChar err_msg[200];
5908 
5909     if ((am == NULL) || (from == NULL) || (token == NULL))
5910 	return(NULL);
5911     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5912     if (atom == NULL)
5913 	return(NULL);
5914     atom->data = data;
5915     atom->neg = 1;
5916     if ((token2 == NULL) || (*token2 == 0)) {
5917 	atom->valuep = xmlStrdup(token);
5918     } else {
5919 	int lenn, lenp;
5920 	xmlChar *str;
5921 
5922 	lenn = strlen((char *) token2);
5923 	lenp = strlen((char *) token);
5924 
5925 	str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5926 	if (str == NULL) {
5927 	    xmlRegFreeAtom(atom);
5928 	    return(NULL);
5929 	}
5930 	memcpy(&str[0], token, lenp);
5931 	str[lenp] = '|';
5932 	memcpy(&str[lenp + 1], token2, lenn);
5933 	str[lenn + lenp + 1] = 0;
5934 
5935 	atom->valuep = str;
5936     }
5937     snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep);
5938     err_msg[199] = 0;
5939     atom->valuep2 = xmlStrdup(err_msg);
5940 
5941     if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5942         xmlRegFreeAtom(atom);
5943 	return(NULL);
5944     }
5945     am->negs++;
5946     if (to == NULL)
5947 	return(am->state);
5948     return(to);
5949 }
5950 
5951 /**
5952  * xmlAutomataNewCountTrans2:
5953  * @am: an automata
5954  * @from: the starting point of the transition
5955  * @to: the target point of the transition or NULL
5956  * @token: the input string associated to that transition
5957  * @token2: the second input string associated to that transition
5958  * @min:  the minimum successive occurrences of token
5959  * @max:  the maximum successive occurrences of token
5960  * @data:  data associated to the transition
5961  *
5962  * If @to is NULL, this creates first a new target state in the automata
5963  * and then adds a transition from the @from state to the target state
5964  * activated by a succession of input of value @token and @token2 and
5965  * whose number is between @min and @max
5966  *
5967  * Returns the target state or NULL in case of error
5968  */
5969 xmlAutomataStatePtr
xmlAutomataNewCountTrans2(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,int min,int max,void * data)5970 xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5971 			 xmlAutomataStatePtr to, const xmlChar *token,
5972 			 const xmlChar *token2,
5973 			 int min, int max, void *data) {
5974     xmlRegAtomPtr atom;
5975     int counter;
5976 
5977     if ((am == NULL) || (from == NULL) || (token == NULL))
5978 	return(NULL);
5979     if (min < 0)
5980 	return(NULL);
5981     if ((max < min) || (max < 1))
5982 	return(NULL);
5983     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5984     if (atom == NULL)
5985 	return(NULL);
5986     if ((token2 == NULL) || (*token2 == 0)) {
5987 	atom->valuep = xmlStrdup(token);
5988     } else {
5989 	int lenn, lenp;
5990 	xmlChar *str;
5991 
5992 	lenn = strlen((char *) token2);
5993 	lenp = strlen((char *) token);
5994 
5995 	str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5996 	if (str == NULL) {
5997 	    xmlRegFreeAtom(atom);
5998 	    return(NULL);
5999 	}
6000 	memcpy(&str[0], token, lenp);
6001 	str[lenp] = '|';
6002 	memcpy(&str[lenp + 1], token2, lenn);
6003 	str[lenn + lenp + 1] = 0;
6004 
6005 	atom->valuep = str;
6006     }
6007     atom->data = data;
6008     if (min == 0)
6009 	atom->min = 1;
6010     else
6011 	atom->min = min;
6012     atom->max = max;
6013 
6014     /*
6015      * associate a counter to the transition.
6016      */
6017     counter = xmlRegGetCounter(am);
6018     am->counters[counter].min = min;
6019     am->counters[counter].max = max;
6020 
6021     /* xmlFAGenerateTransitions(am, from, to, atom); */
6022     if (to == NULL) {
6023         to = xmlRegNewState(am);
6024 	xmlRegStatePush(am, to);
6025     }
6026     xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6027     xmlRegAtomPush(am, atom);
6028     am->state = to;
6029 
6030     if (to == NULL)
6031 	to = am->state;
6032     if (to == NULL)
6033 	return(NULL);
6034     if (min == 0)
6035 	xmlFAGenerateEpsilonTransition(am, from, to);
6036     return(to);
6037 }
6038 
6039 /**
6040  * xmlAutomataNewCountTrans:
6041  * @am: an automata
6042  * @from: the starting point of the transition
6043  * @to: the target point of the transition or NULL
6044  * @token: the input string associated to that transition
6045  * @min:  the minimum successive occurrences of token
6046  * @max:  the maximum successive occurrences of token
6047  * @data:  data associated to the transition
6048  *
6049  * If @to is NULL, this creates first a new target state in the automata
6050  * and then adds a transition from the @from state to the target state
6051  * activated by a succession of input of value @token and whose number
6052  * is between @min and @max
6053  *
6054  * Returns the target state or NULL in case of error
6055  */
6056 xmlAutomataStatePtr
xmlAutomataNewCountTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,int min,int max,void * data)6057 xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6058 			 xmlAutomataStatePtr to, const xmlChar *token,
6059 			 int min, int max, void *data) {
6060     xmlRegAtomPtr atom;
6061     int counter;
6062 
6063     if ((am == NULL) || (from == NULL) || (token == NULL))
6064 	return(NULL);
6065     if (min < 0)
6066 	return(NULL);
6067     if ((max < min) || (max < 1))
6068 	return(NULL);
6069     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6070     if (atom == NULL)
6071 	return(NULL);
6072     atom->valuep = xmlStrdup(token);
6073     atom->data = data;
6074     if (min == 0)
6075 	atom->min = 1;
6076     else
6077 	atom->min = min;
6078     atom->max = max;
6079 
6080     /*
6081      * associate a counter to the transition.
6082      */
6083     counter = xmlRegGetCounter(am);
6084     am->counters[counter].min = min;
6085     am->counters[counter].max = max;
6086 
6087     /* xmlFAGenerateTransitions(am, from, to, atom); */
6088     if (to == NULL) {
6089         to = xmlRegNewState(am);
6090 	xmlRegStatePush(am, to);
6091     }
6092     xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6093     xmlRegAtomPush(am, atom);
6094     am->state = to;
6095 
6096     if (to == NULL)
6097 	to = am->state;
6098     if (to == NULL)
6099 	return(NULL);
6100     if (min == 0)
6101 	xmlFAGenerateEpsilonTransition(am, from, to);
6102     return(to);
6103 }
6104 
6105 /**
6106  * xmlAutomataNewOnceTrans2:
6107  * @am: an automata
6108  * @from: the starting point of the transition
6109  * @to: the target point of the transition or NULL
6110  * @token: the input string associated to that transition
6111  * @token2: the second input string associated to that transition
6112  * @min:  the minimum successive occurrences of token
6113  * @max:  the maximum successive occurrences of token
6114  * @data:  data associated to the transition
6115  *
6116  * If @to is NULL, this creates first a new target state in the automata
6117  * and then adds a transition from the @from state to the target state
6118  * activated by a succession of input of value @token and @token2 and whose
6119  * number is between @min and @max, moreover that transition can only be
6120  * crossed once.
6121  *
6122  * Returns the target state or NULL in case of error
6123  */
6124 xmlAutomataStatePtr
xmlAutomataNewOnceTrans2(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,int min,int max,void * data)6125 xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
6126 			 xmlAutomataStatePtr to, const xmlChar *token,
6127 			 const xmlChar *token2,
6128 			 int min, int max, void *data) {
6129     xmlRegAtomPtr atom;
6130     int counter;
6131 
6132     if ((am == NULL) || (from == NULL) || (token == NULL))
6133 	return(NULL);
6134     if (min < 1)
6135 	return(NULL);
6136     if (max < min)
6137 	return(NULL);
6138     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6139     if (atom == NULL)
6140 	return(NULL);
6141     if ((token2 == NULL) || (*token2 == 0)) {
6142 	atom->valuep = xmlStrdup(token);
6143     } else {
6144 	int lenn, lenp;
6145 	xmlChar *str;
6146 
6147 	lenn = strlen((char *) token2);
6148 	lenp = strlen((char *) token);
6149 
6150 	str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
6151 	if (str == NULL) {
6152 	    xmlRegFreeAtom(atom);
6153 	    return(NULL);
6154 	}
6155 	memcpy(&str[0], token, lenp);
6156 	str[lenp] = '|';
6157 	memcpy(&str[lenp + 1], token2, lenn);
6158 	str[lenn + lenp + 1] = 0;
6159 
6160 	atom->valuep = str;
6161     }
6162     atom->data = data;
6163     atom->quant = XML_REGEXP_QUANT_ONCEONLY;
6164     atom->min = min;
6165     atom->max = max;
6166     /*
6167      * associate a counter to the transition.
6168      */
6169     counter = xmlRegGetCounter(am);
6170     am->counters[counter].min = 1;
6171     am->counters[counter].max = 1;
6172 
6173     /* xmlFAGenerateTransitions(am, from, to, atom); */
6174     if (to == NULL) {
6175 	to = xmlRegNewState(am);
6176 	xmlRegStatePush(am, to);
6177     }
6178     xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6179     xmlRegAtomPush(am, atom);
6180     am->state = to;
6181     return(to);
6182 }
6183 
6184 
6185 
6186 /**
6187  * xmlAutomataNewOnceTrans:
6188  * @am: an automata
6189  * @from: the starting point of the transition
6190  * @to: the target point of the transition or NULL
6191  * @token: the input string associated to that transition
6192  * @min:  the minimum successive occurrences of token
6193  * @max:  the maximum successive occurrences of token
6194  * @data:  data associated to the transition
6195  *
6196  * If @to is NULL, this creates first a new target state in the automata
6197  * and then adds a transition from the @from state to the target state
6198  * activated by a succession of input of value @token and whose number
6199  * is between @min and @max, moreover that transition can only be crossed
6200  * once.
6201  *
6202  * Returns the target state or NULL in case of error
6203  */
6204 xmlAutomataStatePtr
xmlAutomataNewOnceTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,int min,int max,void * data)6205 xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6206 			 xmlAutomataStatePtr to, const xmlChar *token,
6207 			 int min, int max, void *data) {
6208     xmlRegAtomPtr atom;
6209     int counter;
6210 
6211     if ((am == NULL) || (from == NULL) || (token == NULL))
6212 	return(NULL);
6213     if (min < 1)
6214 	return(NULL);
6215     if (max < min)
6216 	return(NULL);
6217     atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6218     if (atom == NULL)
6219 	return(NULL);
6220     atom->valuep = xmlStrdup(token);
6221     atom->data = data;
6222     atom->quant = XML_REGEXP_QUANT_ONCEONLY;
6223     atom->min = min;
6224     atom->max = max;
6225     /*
6226      * associate a counter to the transition.
6227      */
6228     counter = xmlRegGetCounter(am);
6229     am->counters[counter].min = 1;
6230     am->counters[counter].max = 1;
6231 
6232     /* xmlFAGenerateTransitions(am, from, to, atom); */
6233     if (to == NULL) {
6234 	to = xmlRegNewState(am);
6235 	xmlRegStatePush(am, to);
6236     }
6237     xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6238     xmlRegAtomPush(am, atom);
6239     am->state = to;
6240     return(to);
6241 }
6242 
6243 /**
6244  * xmlAutomataNewState:
6245  * @am: an automata
6246  *
6247  * Create a new disconnected state in the automata
6248  *
6249  * Returns the new state or NULL in case of error
6250  */
6251 xmlAutomataStatePtr
xmlAutomataNewState(xmlAutomataPtr am)6252 xmlAutomataNewState(xmlAutomataPtr am) {
6253     xmlAutomataStatePtr to;
6254 
6255     if (am == NULL)
6256 	return(NULL);
6257     to = xmlRegNewState(am);
6258     xmlRegStatePush(am, to);
6259     return(to);
6260 }
6261 
6262 /**
6263  * xmlAutomataNewEpsilon:
6264  * @am: an automata
6265  * @from: the starting point of the transition
6266  * @to: the target point of the transition or NULL
6267  *
6268  * If @to is NULL, this creates first a new target state in the automata
6269  * and then adds an epsilon transition from the @from state to the
6270  * target state
6271  *
6272  * Returns the target state or NULL in case of error
6273  */
6274 xmlAutomataStatePtr
xmlAutomataNewEpsilon(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to)6275 xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from,
6276 		      xmlAutomataStatePtr to) {
6277     if ((am == NULL) || (from == NULL))
6278 	return(NULL);
6279     xmlFAGenerateEpsilonTransition(am, from, to);
6280     if (to == NULL)
6281 	return(am->state);
6282     return(to);
6283 }
6284 
6285 /**
6286  * xmlAutomataNewAllTrans:
6287  * @am: an automata
6288  * @from: the starting point of the transition
6289  * @to: the target point of the transition or NULL
6290  * @lax: allow to transition if not all all transitions have been activated
6291  *
6292  * If @to is NULL, this creates first a new target state in the automata
6293  * and then adds a an ALL transition from the @from state to the
6294  * target state. That transition is an epsilon transition allowed only when
6295  * all transitions from the @from node have been activated.
6296  *
6297  * Returns the target state or NULL in case of error
6298  */
6299 xmlAutomataStatePtr
xmlAutomataNewAllTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,int lax)6300 xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6301 		       xmlAutomataStatePtr to, int lax) {
6302     if ((am == NULL) || (from == NULL))
6303 	return(NULL);
6304     xmlFAGenerateAllTransition(am, from, to, lax);
6305     if (to == NULL)
6306 	return(am->state);
6307     return(to);
6308 }
6309 
6310 /**
6311  * xmlAutomataNewCounter:
6312  * @am: an automata
6313  * @min:  the minimal value on the counter
6314  * @max:  the maximal value on the counter
6315  *
6316  * Create a new counter
6317  *
6318  * Returns the counter number or -1 in case of error
6319  */
6320 int
xmlAutomataNewCounter(xmlAutomataPtr am,int min,int max)6321 xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) {
6322     int ret;
6323 
6324     if (am == NULL)
6325 	return(-1);
6326 
6327     ret = xmlRegGetCounter(am);
6328     if (ret < 0)
6329 	return(-1);
6330     am->counters[ret].min = min;
6331     am->counters[ret].max = max;
6332     return(ret);
6333 }
6334 
6335 /**
6336  * xmlAutomataNewCountedTrans:
6337  * @am: an automata
6338  * @from: the starting point of the transition
6339  * @to: the target point of the transition or NULL
6340  * @counter: the counter associated to that transition
6341  *
6342  * If @to is NULL, this creates first a new target state in the automata
6343  * and then adds an epsilon transition from the @from state to the target state
6344  * which will increment the counter provided
6345  *
6346  * Returns the target state or NULL in case of error
6347  */
6348 xmlAutomataStatePtr
xmlAutomataNewCountedTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,int counter)6349 xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6350 		xmlAutomataStatePtr to, int counter) {
6351     if ((am == NULL) || (from == NULL) || (counter < 0))
6352 	return(NULL);
6353     xmlFAGenerateCountedEpsilonTransition(am, from, to, counter);
6354     if (to == NULL)
6355 	return(am->state);
6356     return(to);
6357 }
6358 
6359 /**
6360  * xmlAutomataNewCounterTrans:
6361  * @am: an automata
6362  * @from: the starting point of the transition
6363  * @to: the target point of the transition or NULL
6364  * @counter: the counter associated to that transition
6365  *
6366  * If @to is NULL, this creates first a new target state in the automata
6367  * and then adds an epsilon transition from the @from state to the target state
6368  * which will be allowed only if the counter is within the right range.
6369  *
6370  * Returns the target state or NULL in case of error
6371  */
6372 xmlAutomataStatePtr
xmlAutomataNewCounterTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,int counter)6373 xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6374 		xmlAutomataStatePtr to, int counter) {
6375     if ((am == NULL) || (from == NULL) || (counter < 0))
6376 	return(NULL);
6377     xmlFAGenerateCountedTransition(am, from, to, counter);
6378     if (to == NULL)
6379 	return(am->state);
6380     return(to);
6381 }
6382 
6383 /**
6384  * xmlAutomataCompile:
6385  * @am: an automata
6386  *
6387  * Compile the automata into a Reg Exp ready for being executed.
6388  * The automata should be free after this point.
6389  *
6390  * Returns the compiled regexp or NULL in case of error
6391  */
6392 xmlRegexpPtr
xmlAutomataCompile(xmlAutomataPtr am)6393 xmlAutomataCompile(xmlAutomataPtr am) {
6394     xmlRegexpPtr ret;
6395 
6396     if ((am == NULL) || (am->error != 0)) return(NULL);
6397     xmlFAEliminateEpsilonTransitions(am);
6398     /* xmlFAComputesDeterminism(am); */
6399     ret = xmlRegEpxFromParse(am);
6400 
6401     return(ret);
6402 }
6403 
6404 /**
6405  * xmlAutomataIsDeterminist:
6406  * @am: an automata
6407  *
6408  * Checks if an automata is determinist.
6409  *
6410  * Returns 1 if true, 0 if not, and -1 in case of error
6411  */
6412 int
xmlAutomataIsDeterminist(xmlAutomataPtr am)6413 xmlAutomataIsDeterminist(xmlAutomataPtr am) {
6414     int ret;
6415 
6416     if (am == NULL)
6417 	return(-1);
6418 
6419     ret = xmlFAComputesDeterminism(am);
6420     return(ret);
6421 }
6422 #endif /* LIBXML_AUTOMATA_ENABLED */
6423 
6424 #ifdef LIBXML_EXPR_ENABLED
6425 /************************************************************************
6426  *									*
6427  *		Formal Expression handling code				*
6428  *									*
6429  ************************************************************************/
6430 /************************************************************************
6431  *									*
6432  *		Expression handling context				*
6433  *									*
6434  ************************************************************************/
6435 
/*
 * Expression handling context: holds the dictionary, the hash table of
 * expression nodes and the node accounting used by the xmlExp* routines.
 */
6436 struct _xmlExpCtxt {
6437     xmlDictPtr dict; /* dictionary created or referenced by xmlExpNewCtxt() */
6438     xmlExpNodePtr *table; /* hash table buckets holding the expression nodes */
6439     int size; /* number of buckets allocated in table */
6440     int nbElems; /* set to 0 at creation; presumably the number of hashed entries - TODO confirm */
6441     int nb_nodes; /* incremented by xmlExpNewNode(), checked against MAX_NODES */
6442     int maxNodes; /* node limit given at creation, never below 4096 */
6443     const char *expr; /* NOTE(review): presumably the expression text being parsed - confirm */
6444     const char *cur; /* NOTE(review): presumably the current position in expr - confirm */
6445     int nb_cons; /* incremented for every node built by xmlExpNewNode() */
6446     int tabSize; /* NOTE(review): not used in this part of the file - confirm purpose */
6447 };
6448 
6449 /**
6450  * xmlExpNewCtxt:
6451  * @maxNodes:  the maximum number of nodes
6452  * @dict:  optional dictionary to use internally
6453  *
6454  * Creates a new context for manipulating expressions
6455  *
6456  * Returns the context or NULL in case of error
6457  */
6458 xmlExpCtxtPtr
xmlExpNewCtxt(int maxNodes,xmlDictPtr dict)6459 xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) {
6460     xmlExpCtxtPtr ret;
6461     int size = 256;
6462 
6463     if (maxNodes <= 4096)
6464         maxNodes = 4096;
6465 
6466     ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt));
6467     if (ret == NULL)
6468         return(NULL);
6469     memset(ret, 0, sizeof(xmlExpCtxt));
6470     ret->size = size;
6471     ret->nbElems = 0;
6472     ret->maxNodes = maxNodes;
6473     ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr));
6474     if (ret->table == NULL) {
6475         xmlFree(ret);
6476 	return(NULL);
6477     }
6478     memset(ret->table, 0, size * sizeof(xmlExpNodePtr));
6479     if (dict == NULL) {
6480         ret->dict = xmlDictCreate();
6481 	if (ret->dict == NULL) {
6482 	    xmlFree(ret->table);
6483 	    xmlFree(ret);
6484 	    return(NULL);
6485 	}
6486     } else {
6487         ret->dict = dict;
6488 	xmlDictReference(ret->dict);
6489     }
6490     return(ret);
6491 }
6492 
6493 /**
6494  * xmlExpFreeCtxt:
6495  * @ctxt:  an expression context
6496  *
6497  * Free an expression context
6498  */
6499 void
xmlExpFreeCtxt(xmlExpCtxtPtr ctxt)6500 xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) {
6501     if (ctxt == NULL)
6502         return;
6503     xmlDictFree(ctxt->dict);
6504     if (ctxt->table != NULL)
6505 	xmlFree(ctxt->table);
6506     xmlFree(ctxt);
6507 }
6508 
6509 /************************************************************************
6510  *									*
6511  *		Structure associated to an expression node		*
6512  *									*
6513  ************************************************************************/
/* Ceiling compared against ctxt->nb_nodes by xmlExpNewNode() before it allocates */
6514 #define MAX_NODES 10000
6515 
6516 /* #define DEBUG_DERIV */
6517 
6518 /*
6519  * TODO:
6520  * - Wildcards
6521  * - public API for creation
6522  *
6523  * Started
6524  * - regression testing
6525  *
6526  * Done
6527  * - split into module and test tool
6528  * - memleaks
6529  */
6530 
/*
 * Flag bits OR-ed into the info field of an expression node.
 * XML_EXP_NILABLE is set on a COUNT node when its minimum is 0 or its
 * operand already carries the flag; presumably it marks expressions
 * accepting the empty sequence - TODO confirm.
 */
6531 typedef enum {
6532     XML_EXP_NILABLE = (1 << 0)
6533 } xmlExpNodeInfo;
6534 
/* Non-zero when the XML_EXP_NILABLE bit is set in the info field of node */
6535 #define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE)
6536 
/*
 * One node of an expression. The field union holds either the bounds
 * of a COUNT node, the right operand of a binary node, or the name of
 * an ATOM node; the exp_* macros below give access to those members.
 */
6537 struct _xmlExpNode {
6538     unsigned char type;/* xmlExpNodeType */
6539     unsigned char info;/* OR of xmlExpNodeInfo */
6540     unsigned short key;	/* the hash key */
6541     unsigned int ref;	/* The number of references */
6542     int c_max;		/* the maximum length it can consume */
6543     xmlExpNodePtr exp_left; /* left (or only) operand */
6544     xmlExpNodePtr next;/* the next node in the hash table or free list */
6545     union {
6546 	struct {
6547 	    int f_min; /* lower bound, reached through exp_min */
6548 	    int f_max; /* upper bound, reached through exp_max */
6549 	} count;
6550 	struct {
6551 	    xmlExpNodePtr f_right; /* right operand, reached through exp_right */
6552 	} children;
6553         const xmlChar *f_str; /* atom name, reached through exp_str */
6554     } field;
6555 };
6556 
/* Shorthands for the members of the field union */
6557 #define exp_min field.count.f_min
6558 #define exp_max field.count.f_max
6559 /* #define exp_left field.children.f_left */
6560 #define exp_right field.children.f_right
6561 #define exp_str field.f_str
6562 
/* Forward declaration, the definition comes after the hash key helpers */
6563 static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type);
/*
 * Statically allocated node of type XML_EXP_FORBID with no info flag;
 * returned as is by the reduction rules of xmlExpHashGetEntry().
 */
6564 static xmlExpNode forbiddenExpNode = {
6565     XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6566 };
6567 xmlExpNodePtr forbiddenExp = &forbiddenExpNode;
/*
 * Statically allocated node of type XML_EXP_EMPTY; its info field is 1,
 * i.e. the XML_EXP_NILABLE bit is set.
 */
6568 static xmlExpNode emptyExpNode = {
6569     XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6570 };
6571 xmlExpNodePtr emptyExp = &emptyExpNode;
6572 
6573 /************************************************************************
6574  *									*
6575  *  The custom hash table for unicity and canonicalization		*
6576  *  of sub-expressions pointers						*
6577  *									*
6578  ************************************************************************/
6579 /*
6580  * xmlExpHashNameComputeKey:
6581  * Calculate the hash key for a token
6582  */
6583 static unsigned short
xmlExpHashNameComputeKey(const xmlChar * name)6584 xmlExpHashNameComputeKey(const xmlChar *name) {
6585     unsigned short value = 0L;
6586     char ch;
6587 
6588     if (name != NULL) {
6589 	value += 30 * (*name);
6590 	while ((ch = *name++) != 0) {
6591 	    value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch);
6592 	}
6593     }
6594     return (value);
6595 }
6596 
6597 /*
6598  * xmlExpHashComputeKey:
6599  * Calculate the hash key for a compound expression
6600  */
6601 static unsigned short
xmlExpHashComputeKey(xmlExpNodeType type,xmlExpNodePtr left,xmlExpNodePtr right)6602 xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left,
6603                      xmlExpNodePtr right) {
6604     unsigned long value;
6605     unsigned short ret;
6606 
6607     switch (type) {
6608         case XML_EXP_SEQ:
6609 	    value = left->key;
6610 	    value += right->key;
6611 	    value *= 3;
6612 	    ret = (unsigned short) value;
6613 	    break;
6614         case XML_EXP_OR:
6615 	    value = left->key;
6616 	    value += right->key;
6617 	    value *= 7;
6618 	    ret = (unsigned short) value;
6619 	    break;
6620         case XML_EXP_COUNT:
6621 	    value = left->key;
6622 	    value += right->key;
6623 	    ret = (unsigned short) value;
6624 	    break;
6625 	default:
6626 	    ret = 0;
6627     }
6628     return(ret);
6629 }
6630 
6631 
6632 static xmlExpNodePtr
xmlExpNewNode(xmlExpCtxtPtr ctxt,xmlExpNodeType type)6633 xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) {
6634     xmlExpNodePtr ret;
6635 
6636     if (ctxt->nb_nodes >= MAX_NODES)
6637         return(NULL);
6638     ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode));
6639     if (ret == NULL)
6640         return(NULL);
6641     memset(ret, 0, sizeof(xmlExpNode));
6642     ret->type = type;
6643     ret->next = NULL;
6644     ctxt->nb_nodes++;
6645     ctxt->nb_cons++;
6646     return(ret);
6647 }
6648 
6649 /**
6650  * xmlExpHashGetEntry:
6651  * @table: the hash table
6652  *
6653  * Get the unique entry from the hash table. The entry is created if
6654  * needed. @left and @right are consumed, i.e. their ref count will
6655  * be decremented by the operation.
6656  *
6657  * Returns the pointer or NULL in case of error
6658  */
6659 static xmlExpNodePtr
xmlExpHashGetEntry(xmlExpCtxtPtr ctxt,xmlExpNodeType type,xmlExpNodePtr left,xmlExpNodePtr right,const xmlChar * name,int min,int max)6660 xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type,
6661                    xmlExpNodePtr left, xmlExpNodePtr right,
6662 		   const xmlChar *name, int min, int max) {
6663     unsigned short kbase, key;
6664     xmlExpNodePtr entry;
6665     xmlExpNodePtr insert;
6666 
6667     if (ctxt == NULL)
6668 	return(NULL);
6669 
6670     /*
6671      * Check for duplicate and insertion location.
6672      */
6673     if (type == XML_EXP_ATOM) {
6674 	kbase = xmlExpHashNameComputeKey(name);
6675     } else if (type == XML_EXP_COUNT) {
6676         /* COUNT reduction rule 1 */
6677 	/* a{1} -> a */
6678 	if (min == max) {
6679 	    if (min == 1) {
6680 		return(left);
6681 	    }
6682 	    if (min == 0) {
6683 		xmlExpFree(ctxt, left);
6684 	        return(emptyExp);
6685 	    }
6686 	}
6687 	if (min < 0) {
6688 	    xmlExpFree(ctxt, left);
6689 	    return(forbiddenExp);
6690 	}
6691         if (max == -1)
6692 	    kbase = min + 79;
6693 	else
6694 	    kbase = max - min;
6695 	kbase += left->key;
6696     } else if (type == XML_EXP_OR) {
6697         /* Forbid reduction rules */
6698         if (left->type == XML_EXP_FORBID) {
6699 	    xmlExpFree(ctxt, left);
6700 	    return(right);
6701 	}
6702         if (right->type == XML_EXP_FORBID) {
6703 	    xmlExpFree(ctxt, right);
6704 	    return(left);
6705 	}
6706 
6707         /* OR reduction rule 1 */
6708 	/* a | a reduced to a */
6709         if (left == right) {
6710 	    xmlExpFree(ctxt, right);
6711 	    return(left);
6712 	}
6713         /* OR canonicalization rule 1 */
6714 	/* linearize (a | b) | c into a | (b | c) */
6715         if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) {
6716 	    xmlExpNodePtr tmp = left;
6717             left = right;
6718 	    right = tmp;
6719 	}
6720         /* OR reduction rule 2 */
6721 	/* a | (a | b) and b | (a | b) are reduced to a | b */
6722         if (right->type == XML_EXP_OR) {
6723 	    if ((left == right->exp_left) ||
6724 	        (left == right->exp_right)) {
6725 		xmlExpFree(ctxt, left);
6726 		return(right);
6727 	    }
6728 	}
6729         /* OR canonicalization rule 2 */
6730 	/* linearize (a | b) | c into a | (b | c) */
6731         if (left->type == XML_EXP_OR) {
6732 	    xmlExpNodePtr tmp;
6733 
6734 	    /* OR canonicalization rule 2 */
6735 	    if ((left->exp_right->type != XML_EXP_OR) &&
6736 	        (left->exp_right->key < left->exp_left->key)) {
6737 	        tmp = left->exp_right;
6738 		left->exp_right = left->exp_left;
6739 		left->exp_left = tmp;
6740 	    }
6741 	    left->exp_right->ref++;
6742 	    tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right,
6743 	                             NULL, 0, 0);
6744 	    left->exp_left->ref++;
6745 	    tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp,
6746 	                             NULL, 0, 0);
6747 
6748 	    xmlExpFree(ctxt, left);
6749 	    return(tmp);
6750 	}
6751 	if (right->type == XML_EXP_OR) {
6752 	    /* Ordering in the tree */
6753 	    /* C | (A | B) -> A | (B | C) */
6754 	    if (left->key > right->exp_right->key) {
6755 		xmlExpNodePtr tmp;
6756 		right->exp_right->ref++;
6757 		tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right,
6758 		                         left, NULL, 0, 0);
6759 		right->exp_left->ref++;
6760 		tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6761 		                         tmp, NULL, 0, 0);
6762 		xmlExpFree(ctxt, right);
6763 		return(tmp);
6764 	    }
6765 	    /* Ordering in the tree */
6766 	    /* B | (A | C) -> A | (B | C) */
6767 	    if (left->key > right->exp_left->key) {
6768 		xmlExpNodePtr tmp;
6769 		right->exp_right->ref++;
6770 		tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left,
6771 		                         right->exp_right, NULL, 0, 0);
6772 		right->exp_left->ref++;
6773 		tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6774 		                         tmp, NULL, 0, 0);
6775 		xmlExpFree(ctxt, right);
6776 		return(tmp);
6777 	    }
6778 	}
6779 	/* we know both types are != XML_EXP_OR here */
6780         else if (left->key > right->key) {
6781 	    xmlExpNodePtr tmp = left;
6782             left = right;
6783 	    right = tmp;
6784 	}
6785 	kbase = xmlExpHashComputeKey(type, left, right);
6786     } else if (type == XML_EXP_SEQ) {
6787         /* Forbid reduction rules */
6788         if (left->type == XML_EXP_FORBID) {
6789 	    xmlExpFree(ctxt, right);
6790 	    return(left);
6791 	}
6792         if (right->type == XML_EXP_FORBID) {
6793 	    xmlExpFree(ctxt, left);
6794 	    return(right);
6795 	}
6796         /* Empty reduction rules */
6797         if (right->type == XML_EXP_EMPTY) {
6798 	    return(left);
6799 	}
6800         if (left->type == XML_EXP_EMPTY) {
6801 	    return(right);
6802 	}
6803 	kbase = xmlExpHashComputeKey(type, left, right);
6804     } else
6805         return(NULL);
6806 
6807     key = kbase % ctxt->size;
6808     if (ctxt->table[key] != NULL) {
6809 	for (insert = ctxt->table[key]; insert != NULL;
6810 	     insert = insert->next) {
6811 	    if ((insert->key == kbase) &&
6812 	        (insert->type == type)) {
6813 		if (type == XML_EXP_ATOM) {
6814 		    if (name == insert->exp_str) {
6815 			insert->ref++;
6816 			return(insert);
6817 		    }
6818 		} else if (type == XML_EXP_COUNT) {
6819 		    if ((insert->exp_min == min) && (insert->exp_max == max) &&
6820 		        (insert->exp_left == left)) {
6821 			insert->ref++;
6822 			left->ref--;
6823 			return(insert);
6824 		    }
6825 		} else if ((insert->exp_left == left) &&
6826 			   (insert->exp_right == right)) {
6827 		    insert->ref++;
6828 		    left->ref--;
6829 		    right->ref--;
6830 		    return(insert);
6831 		}
6832 	    }
6833 	}
6834     }
6835 
6836     entry = xmlExpNewNode(ctxt, type);
6837     if (entry == NULL)
6838         return(NULL);
6839     entry->key = kbase;
6840     if (type == XML_EXP_ATOM) {
6841 	entry->exp_str = name;
6842 	entry->c_max = 1;
6843     } else if (type == XML_EXP_COUNT) {
6844         entry->exp_min = min;
6845         entry->exp_max = max;
6846 	entry->exp_left = left;
6847 	if ((min == 0) || (IS_NILLABLE(left)))
6848 	    entry->info |= XML_EXP_NILABLE;
6849 	if (max < 0)
6850 	    entry->c_max = -1;
6851 	else
6852 	    entry->c_max = max * entry->exp_left->c_max;
6853     } else {
6854 	entry->exp_left = left;
6855 	entry->exp_right = right;
6856 	if (type == XML_EXP_OR) {
6857 	    if ((IS_NILLABLE(left)) || (IS_NILLABLE(right)))
6858 		entry->info |= XML_EXP_NILABLE;
6859 	    if ((entry->exp_left->c_max == -1) ||
6860 	        (entry->exp_right->c_max == -1))
6861 		entry->c_max = -1;
6862 	    else if (entry->exp_left->c_max > entry->exp_right->c_max)
6863 	        entry->c_max = entry->exp_left->c_max;
6864 	    else
6865 	        entry->c_max = entry->exp_right->c_max;
6866 	} else {
6867 	    if ((IS_NILLABLE(left)) && (IS_NILLABLE(right)))
6868 		entry->info |= XML_EXP_NILABLE;
6869 	    if ((entry->exp_left->c_max == -1) ||
6870 	        (entry->exp_right->c_max == -1))
6871 		entry->c_max = -1;
6872 	    else
6873 	        entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max;
6874 	}
6875     }
6876     entry->ref = 1;
6877     if (ctxt->table[key] != NULL)
6878         entry->next = ctxt->table[key];
6879 
6880     ctxt->table[key] = entry;
6881     ctxt->nbElems++;
6882 
6883     return(entry);
6884 }
6885 
6886 /**
6887  * xmlExpFree:
6888  * @ctxt: the expression context
6889  * @exp: the expression
6890  *
6891  * Dereference the expression
6892  */
6893 void
xmlExpFree(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp)6894 xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) {
6895     if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp))
6896         return;
6897     exp->ref--;
6898     if (exp->ref == 0) {
6899         unsigned short key;
6900 
6901         /* Unlink it first from the hash table */
6902 	key = exp->key % ctxt->size;
6903 	if (ctxt->table[key] == exp) {
6904 	    ctxt->table[key] = exp->next;
6905 	} else {
6906 	    xmlExpNodePtr tmp;
6907 
6908 	    tmp = ctxt->table[key];
6909 	    while (tmp != NULL) {
6910 	        if (tmp->next == exp) {
6911 		    tmp->next = exp->next;
6912 		    break;
6913 		}
6914 	        tmp = tmp->next;
6915 	    }
6916 	}
6917 
6918         if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) {
6919 	    xmlExpFree(ctxt, exp->exp_left);
6920 	    xmlExpFree(ctxt, exp->exp_right);
6921 	} else if (exp->type == XML_EXP_COUNT) {
6922 	    xmlExpFree(ctxt, exp->exp_left);
6923 	}
6924         xmlFree(exp);
6925 	ctxt->nb_nodes--;
6926     }
6927 }
6928 
6929 /**
6930  * xmlExpRef:
6931  * @exp: the expression
6932  *
6933  * Increase the reference count of the expression
6934  */
6935 void
xmlExpRef(xmlExpNodePtr exp)6936 xmlExpRef(xmlExpNodePtr exp) {
6937     if (exp != NULL)
6938         exp->ref++;
6939 }
6940 
6941 /**
6942  * xmlExpNewAtom:
6943  * @ctxt: the expression context
6944  * @name: the atom name
6945  * @len: the atom name length in byte (or -1);
6946  *
6947  * Get the atom associated to this name from that context
6948  *
6949  * Returns the node or NULL in case of error
6950  */
6951 xmlExpNodePtr
xmlExpNewAtom(xmlExpCtxtPtr ctxt,const xmlChar * name,int len)6952 xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) {
6953     if ((ctxt == NULL) || (name == NULL))
6954         return(NULL);
6955     name = xmlDictLookup(ctxt->dict, name, len);
6956     if (name == NULL)
6957         return(NULL);
6958     return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0));
6959 }
6960 
6961 /**
6962  * xmlExpNewOr:
6963  * @ctxt: the expression context
6964  * @left: left expression
6965  * @right: right expression
6966  *
6967  * Get the atom associated to the choice @left | @right
6968  * Note that @left and @right are consumed in the operation, to keep
6969  * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
6970  * this is true even in case of failure (unless ctxt == NULL).
6971  *
6972  * Returns the node or NULL in case of error
6973  */
6974 xmlExpNodePtr
xmlExpNewOr(xmlExpCtxtPtr ctxt,xmlExpNodePtr left,xmlExpNodePtr right)6975 xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
6976     if (ctxt == NULL)
6977         return(NULL);
6978     if ((left == NULL) || (right == NULL)) {
6979         xmlExpFree(ctxt, left);
6980         xmlExpFree(ctxt, right);
6981         return(NULL);
6982     }
6983     return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0));
6984 }
6985 
6986 /**
6987  * xmlExpNewSeq:
6988  * @ctxt: the expression context
6989  * @left: left expression
6990  * @right: right expression
6991  *
6992  * Get the atom associated to the sequence @left , @right
6993  * Note that @left and @right are consumed in the operation, to keep
6994  * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
6995  * this is true even in case of failure (unless ctxt == NULL).
6996  *
6997  * Returns the node or NULL in case of error
6998  */
6999 xmlExpNodePtr
xmlExpNewSeq(xmlExpCtxtPtr ctxt,xmlExpNodePtr left,xmlExpNodePtr right)7000 xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
7001     if (ctxt == NULL)
7002         return(NULL);
7003     if ((left == NULL) || (right == NULL)) {
7004         xmlExpFree(ctxt, left);
7005         xmlExpFree(ctxt, right);
7006         return(NULL);
7007     }
7008     return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0));
7009 }
7010 
7011 /**
7012  * xmlExpNewRange:
7013  * @ctxt: the expression context
7014  * @subset: the expression to be repeated
7015  * @min: the lower bound for the repetition
7016  * @max: the upper bound for the repetition, -1 means infinite
7017  *
7018  * Get the atom associated to the range (@subset){@min, @max}
7019  * Note that @subset is consumed in the operation, to keep
7020  * an handle on it use xmlExpRef() and use xmlExpFree() to release it,
7021  * this is true even in case of failure (unless ctxt == NULL).
7022  *
7023  * Returns the node or NULL in case of error
7024  */
7025 xmlExpNodePtr
xmlExpNewRange(xmlExpCtxtPtr ctxt,xmlExpNodePtr subset,int min,int max)7026 xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) {
7027     if (ctxt == NULL)
7028         return(NULL);
7029     if ((subset == NULL) || (min < 0) || (max < -1) ||
7030         ((max >= 0) && (min > max))) {
7031 	xmlExpFree(ctxt, subset);
7032         return(NULL);
7033     }
7034     return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset,
7035                               NULL, NULL, min, max));
7036 }
7037 
7038 /************************************************************************
7039  *									*
7040  *		Public API for operations on expressions		*
7041  *									*
7042  ************************************************************************/
7043 
7044 static int
xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** list,int len,int nb)7045 xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7046                      const xmlChar**list, int len, int nb) {
7047     int tmp, tmp2;
7048 tail:
7049     switch (exp->type) {
7050         case XML_EXP_EMPTY:
7051 	    return(0);
7052         case XML_EXP_ATOM:
7053 	    for (tmp = 0;tmp < nb;tmp++)
7054 	        if (list[tmp] == exp->exp_str)
7055 		    return(0);
7056             if (nb >= len)
7057 	        return(-2);
7058 	    list[nb] = exp->exp_str;
7059 	    return(1);
7060         case XML_EXP_COUNT:
7061 	    exp = exp->exp_left;
7062 	    goto tail;
7063         case XML_EXP_SEQ:
7064         case XML_EXP_OR:
7065 	    tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb);
7066 	    if (tmp < 0)
7067 	        return(tmp);
7068 	    tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len,
7069 	                                nb + tmp);
7070 	    if (tmp2 < 0)
7071 	        return(tmp2);
7072             return(tmp + tmp2);
7073     }
7074     return(-1);
7075 }
7076 
7077 /**
7078  * xmlExpGetLanguage:
7079  * @ctxt: the expression context
7080  * @exp: the expression
7081  * @langList: where to store the tokens
7082  * @len: the allocated length of @list
7083  *
7084  * Find all the strings used in @exp and store them in @list
7085  *
7086  * Returns the number of unique strings found, -1 in case of errors and
7087  *         -2 if there is more than @len strings
7088  */
7089 int
xmlExpGetLanguage(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** langList,int len)7090 xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7091                   const xmlChar**langList, int len) {
7092     if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0))
7093         return(-1);
7094     return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0));
7095 }
7096 
7097 static int
xmlExpGetStartInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** list,int len,int nb)7098 xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7099                   const xmlChar**list, int len, int nb) {
7100     int tmp, tmp2;
7101 tail:
7102     switch (exp->type) {
7103         case XML_EXP_FORBID:
7104 	    return(0);
7105         case XML_EXP_EMPTY:
7106 	    return(0);
7107         case XML_EXP_ATOM:
7108 	    for (tmp = 0;tmp < nb;tmp++)
7109 	        if (list[tmp] == exp->exp_str)
7110 		    return(0);
7111             if (nb >= len)
7112 	        return(-2);
7113 	    list[nb] = exp->exp_str;
7114 	    return(1);
7115         case XML_EXP_COUNT:
7116 	    exp = exp->exp_left;
7117 	    goto tail;
7118         case XML_EXP_SEQ:
7119 	    tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
7120 	    if (tmp < 0)
7121 	        return(tmp);
7122 	    if (IS_NILLABLE(exp->exp_left)) {
7123 		tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
7124 					    nb + tmp);
7125 		if (tmp2 < 0)
7126 		    return(tmp2);
7127 		tmp += tmp2;
7128 	    }
7129             return(tmp);
7130         case XML_EXP_OR:
7131 	    tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
7132 	    if (tmp < 0)
7133 	        return(tmp);
7134 	    tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
7135 	                                nb + tmp);
7136 	    if (tmp2 < 0)
7137 	        return(tmp2);
7138             return(tmp + tmp2);
7139     }
7140     return(-1);
7141 }
7142 
7143 /**
7144  * xmlExpGetStart:
7145  * @ctxt: the expression context
7146  * @exp: the expression
7147  * @tokList: where to store the tokens
7148  * @len: the allocated length of @list
7149  *
7150  * Find all the strings that appears at the start of the languages
7151  * accepted by @exp and store them in @list. E.g. for (a, b) | c
7152  * it will return the list [a, c]
7153  *
7154  * Returns the number of unique strings found, -1 in case of errors and
7155  *         -2 if there is more than @len strings
7156  */
7157 int
xmlExpGetStart(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** tokList,int len)7158 xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7159                const xmlChar**tokList, int len) {
7160     if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0))
7161         return(-1);
7162     return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0));
7163 }
7164 
7165 /**
7166  * xmlExpIsNillable:
7167  * @exp: the expression
7168  *
7169  * Finds if the expression is nillable, i.e. if it accepts the empty sequence
7170  *
7171  * Returns 1 if nillable, 0 if not and -1 in case of error
7172  */
7173 int
xmlExpIsNillable(xmlExpNodePtr exp)7174 xmlExpIsNillable(xmlExpNodePtr exp) {
7175     if (exp == NULL)
7176         return(-1);
7177     return(IS_NILLABLE(exp) != 0);
7178 }
7179 
7180 static xmlExpNodePtr
xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar * str)7181 xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str)
7182 {
7183     xmlExpNodePtr ret;
7184 
7185     switch (exp->type) {
7186 	case XML_EXP_EMPTY:
7187 	    return(forbiddenExp);
7188 	case XML_EXP_FORBID:
7189 	    return(forbiddenExp);
7190 	case XML_EXP_ATOM:
7191 	    if (exp->exp_str == str) {
7192 #ifdef DEBUG_DERIV
7193 		printf("deriv atom: equal => Empty\n");
7194 #endif
7195 	        ret = emptyExp;
7196 	    } else {
7197 #ifdef DEBUG_DERIV
7198 		printf("deriv atom: mismatch => forbid\n");
7199 #endif
7200 	        /* TODO wildcards here */
7201 		ret = forbiddenExp;
7202 	    }
7203 	    return(ret);
7204 	case XML_EXP_OR: {
7205 	    xmlExpNodePtr tmp;
7206 
7207 #ifdef DEBUG_DERIV
7208 	    printf("deriv or: => or(derivs)\n");
7209 #endif
7210 	    tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7211 	    if (tmp == NULL) {
7212 		return(NULL);
7213 	    }
7214 	    ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7215 	    if (ret == NULL) {
7216 	        xmlExpFree(ctxt, tmp);
7217 		return(NULL);
7218 	    }
7219             ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret,
7220 			     NULL, 0, 0);
7221 	    return(ret);
7222 	}
7223 	case XML_EXP_SEQ:
7224 #ifdef DEBUG_DERIV
7225 	    printf("deriv seq: starting with left\n");
7226 #endif
7227 	    ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7228 	    if (ret == NULL) {
7229 	        return(NULL);
7230 	    } else if (ret == forbiddenExp) {
7231 	        if (IS_NILLABLE(exp->exp_left)) {
7232 #ifdef DEBUG_DERIV
7233 		    printf("deriv seq: left failed but nillable\n");
7234 #endif
7235 		    ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7236 		}
7237 	    } else {
7238 #ifdef DEBUG_DERIV
7239 		printf("deriv seq: left match => sequence\n");
7240 #endif
7241 	        exp->exp_right->ref++;
7242 	        ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right,
7243 		                         NULL, 0, 0);
7244 	    }
7245 	    return(ret);
7246 	case XML_EXP_COUNT: {
7247 	    int min, max;
7248 	    xmlExpNodePtr tmp;
7249 
7250 	    if (exp->exp_max == 0)
7251 		return(forbiddenExp);
7252 	    ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7253 	    if (ret == NULL)
7254 	        return(NULL);
7255 	    if (ret == forbiddenExp) {
7256 #ifdef DEBUG_DERIV
7257 		printf("deriv count: pattern mismatch => forbid\n");
7258 #endif
7259 	        return(ret);
7260 	    }
7261 	    if (exp->exp_max == 1)
7262 		return(ret);
7263 	    if (exp->exp_max < 0) /* unbounded */
7264 		max = -1;
7265 	    else
7266 		max = exp->exp_max - 1;
7267 	    if (exp->exp_min > 0)
7268 		min = exp->exp_min - 1;
7269 	    else
7270 		min = 0;
7271 	    exp->exp_left->ref++;
7272 	    tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL,
7273 				     NULL, min, max);
7274 	    if (ret == emptyExp) {
7275 #ifdef DEBUG_DERIV
7276 		printf("deriv count: match to empty => new count\n");
7277 #endif
7278 	        return(tmp);
7279 	    }
7280 #ifdef DEBUG_DERIV
7281 	    printf("deriv count: match => sequence with new count\n");
7282 #endif
7283 	    return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp,
7284 	                              NULL, 0, 0));
7285 	}
7286     }
7287     return(NULL);
7288 }
7289 
7290 /**
7291  * xmlExpStringDerive:
7292  * @ctxt: the expression context
7293  * @exp: the expression
7294  * @str: the string
7295  * @len: the string len in bytes if available
7296  *
7297  * Do one step of Brzozowski derivation of the expression @exp with
7298  * respect to the input string
7299  *
7300  * Returns the resulting expression or NULL in case of internal error
7301  */
7302 xmlExpNodePtr
xmlExpStringDerive(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar * str,int len)7303 xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7304                    const xmlChar *str, int len) {
7305     const xmlChar *input;
7306 
7307     if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) {
7308         return(NULL);
7309     }
7310     /*
7311      * check the string is in the dictionary, if yes use an interned
7312      * copy, otherwise we know it's not an acceptable input
7313      */
7314     input = xmlDictExists(ctxt->dict, str, len);
7315     if (input == NULL) {
7316         return(forbiddenExp);
7317     }
7318     return(xmlExpStringDeriveInt(ctxt, exp, input));
7319 }
7320 
7321 static int
xmlExpCheckCard(xmlExpNodePtr exp,xmlExpNodePtr sub)7322 xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) {
7323     int ret = 1;
7324 
7325     if (sub->c_max == -1) {
7326         if (exp->c_max != -1)
7327 	    ret = 0;
7328     } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) {
7329         ret = 0;
7330     }
7331 #if 0
7332     if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp)))
7333         ret = 0;
7334 #endif
7335     return(ret);
7336 }
7337 
7338 static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7339                                         xmlExpNodePtr sub);
7340 /**
7341  * xmlExpDivide:
7342  * @ctxt: the expressions context
7343  * @exp: the englobing expression
7344  * @sub: the subexpression
7345  * @mult: the multiple expression
7346  * @remain: the remain from the derivation of the multiple
7347  *
7348  * Check if exp is a multiple of sub, i.e. if there is a finite number n
7349  * so that sub{n} subsume exp
7350  *
7351  * Returns the multiple value if successful, 0 if it is not a multiple
7352  *         and -1 in case of internal error.
7353  */
7354 
7355 static int
xmlExpDivide(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub,xmlExpNodePtr * mult,xmlExpNodePtr * remain)7356 xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub,
7357              xmlExpNodePtr *mult, xmlExpNodePtr *remain) {
7358     int i;
7359     xmlExpNodePtr tmp, tmp2;
7360 
7361     if (mult != NULL) *mult = NULL;
7362     if (remain != NULL) *remain = NULL;
7363     if (exp->c_max == -1) return(0);
7364     if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0);
7365 
7366     for (i = 1;i <= exp->c_max;i++) {
7367         sub->ref++;
7368         tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7369 				 sub, NULL, NULL, i, i);
7370 	if (tmp == NULL) {
7371 	    return(-1);
7372 	}
7373 	if (!xmlExpCheckCard(tmp, exp)) {
7374 	    xmlExpFree(ctxt, tmp);
7375 	    continue;
7376 	}
7377 	tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp);
7378 	if (tmp2 == NULL) {
7379 	    xmlExpFree(ctxt, tmp);
7380 	    return(-1);
7381 	}
7382 	if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) {
7383 	    if (remain != NULL)
7384 	        *remain = tmp2;
7385 	    else
7386 	        xmlExpFree(ctxt, tmp2);
7387 	    if (mult != NULL)
7388 	        *mult = tmp;
7389 	    else
7390 	        xmlExpFree(ctxt, tmp);
7391 #ifdef DEBUG_DERIV
7392 	    printf("Divide succeeded %d\n", i);
7393 #endif
7394 	    return(i);
7395 	}
7396 	xmlExpFree(ctxt, tmp);
7397 	xmlExpFree(ctxt, tmp2);
7398     }
7399 #ifdef DEBUG_DERIV
7400     printf("Divide failed\n");
7401 #endif
7402     return(0);
7403 }
7404 
7405 /**
7406  * xmlExpExpDeriveInt:
7407  * @ctxt: the expressions context
7408  * @exp: the englobing expression
7409  * @sub: the subexpression
7410  *
7411  * Try to do a step of Brzozowski derivation but at a higher level
7412  * the input being a subexpression.
7413  *
7414  * Returns the resulting expression or NULL in case of internal error
7415  */
7416 static xmlExpNodePtr
xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub)7417 xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7418     xmlExpNodePtr ret, tmp, tmp2, tmp3;
7419     const xmlChar **tab;
7420     int len, i;
7421 
7422     /*
7423      * In case of equality and if the expression can only consume a finite
7424      * amount, then the derivation is empty
7425      */
7426     if ((exp == sub) && (exp->c_max >= 0)) {
7427 #ifdef DEBUG_DERIV
7428         printf("Equal(exp, sub) and finite -> Empty\n");
7429 #endif
7430         return(emptyExp);
7431     }
7432     /*
7433      * decompose sub sequence first
7434      */
7435     if (sub->type == XML_EXP_EMPTY) {
7436 #ifdef DEBUG_DERIV
7437         printf("Empty(sub) -> Empty\n");
7438 #endif
7439 	exp->ref++;
7440         return(exp);
7441     }
7442     if (sub->type == XML_EXP_SEQ) {
7443 #ifdef DEBUG_DERIV
7444         printf("Seq(sub) -> decompose\n");
7445 #endif
7446         tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7447 	if (tmp == NULL)
7448 	    return(NULL);
7449 	if (tmp == forbiddenExp)
7450 	    return(tmp);
7451 	ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right);
7452 	xmlExpFree(ctxt, tmp);
7453 	return(ret);
7454     }
7455     if (sub->type == XML_EXP_OR) {
7456 #ifdef DEBUG_DERIV
7457         printf("Or(sub) -> decompose\n");
7458 #endif
7459         tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7460 	if (tmp == forbiddenExp)
7461 	    return(tmp);
7462 	if (tmp == NULL)
7463 	    return(NULL);
7464 	ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right);
7465 	if ((ret == NULL) || (ret == forbiddenExp)) {
7466 	    xmlExpFree(ctxt, tmp);
7467 	    return(ret);
7468 	}
7469 	return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0));
7470     }
7471     if (!xmlExpCheckCard(exp, sub)) {
7472 #ifdef DEBUG_DERIV
7473         printf("CheckCard(exp, sub) failed -> Forbid\n");
7474 #endif
7475         return(forbiddenExp);
7476     }
7477     switch (exp->type) {
7478         case XML_EXP_EMPTY:
7479 	    if (sub == emptyExp)
7480 	        return(emptyExp);
7481 #ifdef DEBUG_DERIV
7482 	    printf("Empty(exp) -> Forbid\n");
7483 #endif
7484 	    return(forbiddenExp);
7485         case XML_EXP_FORBID:
7486 #ifdef DEBUG_DERIV
7487 	    printf("Forbid(exp) -> Forbid\n");
7488 #endif
7489 	    return(forbiddenExp);
7490         case XML_EXP_ATOM:
7491 	    if (sub->type == XML_EXP_ATOM) {
7492 	        /* TODO: handle wildcards */
7493 	        if (exp->exp_str == sub->exp_str) {
7494 #ifdef DEBUG_DERIV
7495 		    printf("Atom match -> Empty\n");
7496 #endif
7497 		    return(emptyExp);
7498                 }
7499 #ifdef DEBUG_DERIV
7500 		printf("Atom mismatch -> Forbid\n");
7501 #endif
7502 	        return(forbiddenExp);
7503 	    }
7504 	    if ((sub->type == XML_EXP_COUNT) &&
7505 	        (sub->exp_max == 1) &&
7506 	        (sub->exp_left->type == XML_EXP_ATOM)) {
7507 	        /* TODO: handle wildcards */
7508 	        if (exp->exp_str == sub->exp_left->exp_str) {
7509 #ifdef DEBUG_DERIV
7510 		    printf("Atom match -> Empty\n");
7511 #endif
7512 		    return(emptyExp);
7513 		}
7514 #ifdef DEBUG_DERIV
7515 		printf("Atom mismatch -> Forbid\n");
7516 #endif
7517 	        return(forbiddenExp);
7518 	    }
7519 #ifdef DEBUG_DERIV
7520 	    printf("Complex exp vs Atom -> Forbid\n");
7521 #endif
7522 	    return(forbiddenExp);
7523         case XML_EXP_SEQ:
7524 	    /* try to get the sequence consumed only if possible */
7525 	    if (xmlExpCheckCard(exp->exp_left, sub)) {
7526 		/* See if the sequence can be consumed directly */
7527 #ifdef DEBUG_DERIV
7528 		printf("Seq trying left only\n");
7529 #endif
7530 		ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7531 		if ((ret != forbiddenExp) && (ret != NULL)) {
7532 #ifdef DEBUG_DERIV
7533 		    printf("Seq trying left only worked\n");
7534 #endif
7535 		    /*
7536 		     * TODO: assumption here that we are determinist
7537 		     *       i.e. we won't get to a nillable exp left
7538 		     *       subset which could be matched by the right
7539 		     *       part too.
7540 		     * e.g.: (a | b)+,(a | c) and 'a+,a'
7541 		     */
7542 		    exp->exp_right->ref++;
7543 		    return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7544 					      exp->exp_right, NULL, 0, 0));
7545 		}
7546 #ifdef DEBUG_DERIV
7547 	    } else {
7548 		printf("Seq: left too short\n");
7549 #endif
7550 	    }
7551 	    /* Try instead to decompose */
7552 	    if (sub->type == XML_EXP_COUNT) {
7553 		int min, max;
7554 
7555 #ifdef DEBUG_DERIV
7556 		printf("Seq: sub is a count\n");
7557 #endif
7558 	        ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7559 		if (ret == NULL)
7560 		    return(NULL);
7561 		if (ret != forbiddenExp) {
7562 #ifdef DEBUG_DERIV
7563 		    printf("Seq , Count match on left\n");
7564 #endif
7565 		    if (sub->exp_max < 0)
7566 		        max = -1;
7567 	            else
7568 		        max = sub->exp_max -1;
7569 		    if (sub->exp_min > 0)
7570 		        min = sub->exp_min -1;
7571 		    else
7572 		        min = 0;
7573 		    exp->exp_right->ref++;
7574 		    tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7575 		                             exp->exp_right, NULL, 0, 0);
7576 		    if (tmp == NULL)
7577 		        return(NULL);
7578 
7579 		    sub->exp_left->ref++;
7580 		    tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7581 				      sub->exp_left, NULL, NULL, min, max);
7582 		    if (tmp2 == NULL) {
7583 		        xmlExpFree(ctxt, tmp);
7584 			return(NULL);
7585 		    }
7586 		    ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7587 		    xmlExpFree(ctxt, tmp);
7588 		    xmlExpFree(ctxt, tmp2);
7589 		    return(ret);
7590 		}
7591 	    }
7592 	    /* we made no progress on structured operations */
7593 	    break;
7594         case XML_EXP_OR:
7595 #ifdef DEBUG_DERIV
7596 	    printf("Or , trying both side\n");
7597 #endif
7598 	    ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7599 	    if (ret == NULL)
7600 	        return(NULL);
7601 	    tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub);
7602 	    if (tmp == NULL) {
7603 		xmlExpFree(ctxt, ret);
7604 	        return(NULL);
7605 	    }
7606 	    return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0));
7607         case XML_EXP_COUNT: {
7608 	    int min, max;
7609 
7610 	    if (sub->type == XML_EXP_COUNT) {
7611 	        /*
7612 		 * Try to see if the loop is completely subsumed
7613 		 */
7614 	        tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7615 		if (tmp == NULL)
7616 		    return(NULL);
7617 		if (tmp == forbiddenExp) {
7618 		    int mult;
7619 
7620 #ifdef DEBUG_DERIV
7621 		    printf("Count, Count inner don't subsume\n");
7622 #endif
7623 		    mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left,
7624 		                        NULL, &tmp);
7625 		    if (mult <= 0) {
7626 #ifdef DEBUG_DERIV
7627 			printf("Count, Count not multiple => forbidden\n");
7628 #endif
7629                         return(forbiddenExp);
7630 		    }
7631 		    if (sub->exp_max == -1) {
7632 		        max = -1;
7633 			if (exp->exp_max == -1) {
7634 			    if (exp->exp_min <= sub->exp_min * mult)
7635 			        min = 0;
7636 			    else
7637 			        min = exp->exp_min - sub->exp_min * mult;
7638 			} else {
7639 #ifdef DEBUG_DERIV
7640 			    printf("Count, Count finite can't subsume infinite\n");
7641 #endif
7642                             xmlExpFree(ctxt, tmp);
7643 			    return(forbiddenExp);
7644 			}
7645 		    } else {
7646 			if (exp->exp_max == -1) {
7647 #ifdef DEBUG_DERIV
7648 			    printf("Infinite loop consume mult finite loop\n");
7649 #endif
7650 			    if (exp->exp_min > sub->exp_min * mult) {
7651 				max = -1;
7652 				min = exp->exp_min - sub->exp_min * mult;
7653 			    } else {
7654 				max = -1;
7655 				min = 0;
7656 			    }
7657 			} else {
7658 			    if (exp->exp_max < sub->exp_max * mult) {
7659 #ifdef DEBUG_DERIV
7660 				printf("loops max mult mismatch => forbidden\n");
7661 #endif
7662 				xmlExpFree(ctxt, tmp);
7663 				return(forbiddenExp);
7664 			    }
7665 			    if (sub->exp_max * mult > exp->exp_min)
7666 				min = 0;
7667 			    else
7668 				min = exp->exp_min - sub->exp_max * mult;
7669 			    max = exp->exp_max - sub->exp_max * mult;
7670 			}
7671 		    }
7672 		} else if (!IS_NILLABLE(tmp)) {
7673 		    /*
7674 		     * TODO: loop here to try to grow if working on finite
7675 		     *       blocks.
7676 		     */
7677 #ifdef DEBUG_DERIV
7678 		    printf("Count, Count remain not nillable => forbidden\n");
7679 #endif
7680 		    xmlExpFree(ctxt, tmp);
7681 		    return(forbiddenExp);
7682 		} else if (sub->exp_max == -1) {
7683 		    if (exp->exp_max == -1) {
7684 		        if (exp->exp_min <= sub->exp_min) {
7685 #ifdef DEBUG_DERIV
7686 			    printf("Infinite loops Okay => COUNT(0,Inf)\n");
7687 #endif
7688                             max = -1;
7689 			    min = 0;
7690 			} else {
7691 #ifdef DEBUG_DERIV
7692 			    printf("Infinite loops min => Count(X,Inf)\n");
7693 #endif
7694                             max = -1;
7695 			    min = exp->exp_min - sub->exp_min;
7696 			}
7697 		    } else if (exp->exp_min > sub->exp_min) {
7698 #ifdef DEBUG_DERIV
7699 			printf("loops min mismatch 1 => forbidden ???\n");
7700 #endif
7701 		        xmlExpFree(ctxt, tmp);
7702 		        return(forbiddenExp);
7703 		    } else {
7704 			max = -1;
7705 			min = 0;
7706 		    }
7707 		} else {
7708 		    if (exp->exp_max == -1) {
7709 #ifdef DEBUG_DERIV
7710 			printf("Infinite loop consume finite loop\n");
7711 #endif
7712 		        if (exp->exp_min > sub->exp_min) {
7713 			    max = -1;
7714 			    min = exp->exp_min - sub->exp_min;
7715 			} else {
7716 			    max = -1;
7717 			    min = 0;
7718 			}
7719 		    } else {
7720 		        if (exp->exp_max < sub->exp_max) {
7721 #ifdef DEBUG_DERIV
7722 			    printf("loops max mismatch => forbidden\n");
7723 #endif
7724 			    xmlExpFree(ctxt, tmp);
7725 			    return(forbiddenExp);
7726 			}
7727 			if (sub->exp_max > exp->exp_min)
7728 			    min = 0;
7729 			else
7730 			    min = exp->exp_min - sub->exp_max;
7731 			max = exp->exp_max - sub->exp_max;
7732 		    }
7733 		}
7734 #ifdef DEBUG_DERIV
7735 		printf("loops match => SEQ(COUNT())\n");
7736 #endif
7737 		exp->exp_left->ref++;
7738 		tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7739 		                          NULL, NULL, min, max);
7740 		if (tmp2 == NULL) {
7741 		    return(NULL);
7742 		}
7743                 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7744 		                         NULL, 0, 0);
7745 		return(ret);
7746 	    }
7747 	    tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7748 	    if (tmp == NULL)
7749 		return(NULL);
7750 	    if (tmp == forbiddenExp) {
7751 #ifdef DEBUG_DERIV
7752 		printf("loop mismatch => forbidden\n");
7753 #endif
7754 		return(forbiddenExp);
7755 	    }
7756 	    if (exp->exp_min > 0)
7757 		min = exp->exp_min - 1;
7758 	    else
7759 		min = 0;
7760 	    if (exp->exp_max < 0)
7761 		max = -1;
7762 	    else
7763 		max = exp->exp_max - 1;
7764 
7765 #ifdef DEBUG_DERIV
7766 	    printf("loop match => SEQ(COUNT())\n");
7767 #endif
7768 	    exp->exp_left->ref++;
7769 	    tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7770 				      NULL, NULL, min, max);
7771 	    if (tmp2 == NULL)
7772 		return(NULL);
7773 	    ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7774 				     NULL, 0, 0);
7775 	    return(ret);
7776 	}
7777     }
7778 
7779 #ifdef DEBUG_DERIV
7780     printf("Fallback to derivative\n");
7781 #endif
7782     if (IS_NILLABLE(sub)) {
7783         if (!(IS_NILLABLE(exp)))
7784 	    return(forbiddenExp);
7785 	else
7786 	    ret = emptyExp;
7787     } else
7788 	ret = NULL;
7789     /*
7790      * here the structured derivation made no progress so
7791      * we use the default token based derivation to force one more step
7792      */
7793     if (ctxt->tabSize == 0)
7794         ctxt->tabSize = 40;
7795 
7796     tab = (const xmlChar **) xmlMalloc(ctxt->tabSize *
7797 	                               sizeof(const xmlChar *));
7798     if (tab == NULL) {
7799 	return(NULL);
7800     }
7801 
7802     /*
7803      * collect all the strings accepted by the subexpression on input
7804      */
7805     len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7806     while (len < 0) {
7807         const xmlChar **temp;
7808 	temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 *
7809 	                                     sizeof(const xmlChar *));
7810 	if (temp == NULL) {
7811 	    xmlFree((xmlChar **) tab);
7812 	    return(NULL);
7813 	}
7814 	tab = temp;
7815 	ctxt->tabSize *= 2;
7816 	len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7817     }
7818     for (i = 0;i < len;i++) {
7819         tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]);
7820 	if ((tmp == NULL) || (tmp == forbiddenExp)) {
7821 	    xmlExpFree(ctxt, ret);
7822 	    xmlFree((xmlChar **) tab);
7823 	    return(tmp);
7824 	}
7825 	tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]);
7826 	if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) {
7827 	    xmlExpFree(ctxt, tmp);
7828 	    xmlExpFree(ctxt, ret);
7829 	    xmlFree((xmlChar **) tab);
7830 	    return(tmp);
7831 	}
7832 	tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7833 	xmlExpFree(ctxt, tmp);
7834 	xmlExpFree(ctxt, tmp2);
7835 
7836 	if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) {
7837 	    xmlExpFree(ctxt, ret);
7838 	    xmlFree((xmlChar **) tab);
7839 	    return(tmp3);
7840 	}
7841 
7842 	if (ret == NULL)
7843 	    ret = tmp3;
7844 	else {
7845 	    ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0);
7846 	    if (ret == NULL) {
7847 		xmlFree((xmlChar **) tab);
7848 	        return(NULL);
7849 	    }
7850 	}
7851     }
7852     xmlFree((xmlChar **) tab);
7853     return(ret);
7854 }
7855 
7856 /**
7857  * xmlExpExpDerive:
7858  * @ctxt: the expressions context
7859  * @exp: the englobing expression
7860  * @sub: the subexpression
7861  *
7862  * Evaluates the expression resulting from @exp consuming a sub expression @sub
7863  * Based on algebraic derivation and sometimes direct Brzozowski derivation
7864  * it usually takes less than linear time and can handle expressions generating
7865  * infinite languages.
7866  *
7867  * Returns the resulting expression or NULL in case of internal error, the
7868  *         result must be freed
7869  */
7870 xmlExpNodePtr
xmlExpExpDerive(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub)7871 xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7872     if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7873         return(NULL);
7874 
7875     /*
7876      * O(1) speedups
7877      */
7878     if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7879 #ifdef DEBUG_DERIV
7880 	printf("Sub nillable and not exp : can't subsume\n");
7881 #endif
7882         return(forbiddenExp);
7883     }
7884     if (xmlExpCheckCard(exp, sub) == 0) {
7885 #ifdef DEBUG_DERIV
7886 	printf("sub generate longer sequences than exp : can't subsume\n");
7887 #endif
7888         return(forbiddenExp);
7889     }
7890     return(xmlExpExpDeriveInt(ctxt, exp, sub));
7891 }
7892 
7893 /**
7894  * xmlExpSubsume:
7895  * @ctxt: the expressions context
7896  * @exp: the englobing expression
7897  * @sub: the subexpression
7898  *
7899  * Check whether @exp accepts all the languages accepted by @sub
7900  * the input being a subexpression.
7901  *
7902  * Returns 1 if true 0 if false and -1 in case of failure.
7903  */
7904 int
xmlExpSubsume(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub)7905 xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7906     xmlExpNodePtr tmp;
7907 
7908     if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7909         return(-1);
7910 
7911     /*
7912      * TODO: speedup by checking the language of sub is a subset of the
7913      *       language of exp
7914      */
7915     /*
7916      * O(1) speedups
7917      */
7918     if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7919 #ifdef DEBUG_DERIV
7920 	printf("Sub nillable and not exp : can't subsume\n");
7921 #endif
7922         return(0);
7923     }
7924     if (xmlExpCheckCard(exp, sub) == 0) {
7925 #ifdef DEBUG_DERIV
7926 	printf("sub generate longer sequences than exp : can't subsume\n");
7927 #endif
7928         return(0);
7929     }
7930     tmp = xmlExpExpDeriveInt(ctxt, exp, sub);
7931 #ifdef DEBUG_DERIV
7932     printf("Result derivation :\n");
7933     PRINT_EXP(tmp);
7934 #endif
7935     if (tmp == NULL)
7936         return(-1);
7937     if (tmp == forbiddenExp)
7938 	return(0);
7939     if (tmp == emptyExp)
7940 	return(1);
7941     if ((tmp != NULL) && (IS_NILLABLE(tmp))) {
7942         xmlExpFree(ctxt, tmp);
7943         return(1);
7944     }
7945     xmlExpFree(ctxt, tmp);
7946     return(0);
7947 }
7948 
7949 /************************************************************************
7950  *									*
7951  *			Parsing expression				*
7952  *									*
7953  ************************************************************************/
7954 
7955 static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt);
7956 
/*
 * Scanner helpers for the expression parser below. They all operate on
 * a variable named ctxt that must be in scope at the point of use.
 *
 * CUR         : the character under the cursor
 * NEXT        : advance the cursor by one character
 * IS_BLANK    : true for space, newline, carriage return and tab
 * SKIP_BLANKS : advance the cursor past any run of blanks
 *
 * NEXT and SKIP_BLANKS carry their own trailing ';' because the parser
 * uses them as bare statements; do not drop it.
 */
#undef CUR
#define CUR (*ctxt->cur)
#undef NEXT
#define NEXT ctxt->cur++;
#undef IS_BLANK
/* the argument is parenthesized so that any expression can be passed */
#define IS_BLANK(c) (((c) == ' ') || ((c) == '\n') || ((c) == '\r') || ((c) == '\t'))
#define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++;
7964 
7965 static int
xmlExpParseNumber(xmlExpCtxtPtr ctxt)7966 xmlExpParseNumber(xmlExpCtxtPtr ctxt) {
7967     int ret = 0;
7968 
7969     SKIP_BLANKS
7970     if (CUR == '*') {
7971 	NEXT
7972 	return(-1);
7973     }
7974     if ((CUR < '0') || (CUR > '9'))
7975         return(-1);
7976     while ((CUR >= '0') && (CUR <= '9')) {
7977         ret = ret * 10 + (CUR - '0');
7978 	NEXT
7979     }
7980     return(ret);
7981 }
7982 
7983 static xmlExpNodePtr
xmlExpParseOr(xmlExpCtxtPtr ctxt)7984 xmlExpParseOr(xmlExpCtxtPtr ctxt) {
7985     const char *base;
7986     xmlExpNodePtr ret;
7987     const xmlChar *val;
7988 
7989     SKIP_BLANKS
7990     base = ctxt->cur;
7991     if (*ctxt->cur == '(') {
7992         NEXT
7993 	ret = xmlExpParseExpr(ctxt);
7994 	SKIP_BLANKS
7995 	if (*ctxt->cur != ')') {
7996 	    fprintf(stderr, "unbalanced '(' : %s\n", base);
7997 	    xmlExpFree(ctxt, ret);
7998 	    return(NULL);
7999 	}
8000 	NEXT;
8001 	SKIP_BLANKS
8002 	goto parse_quantifier;
8003     }
8004     while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') &&
8005            (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') &&
8006 	   (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}'))
8007 	NEXT;
8008     val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base);
8009     if (val == NULL)
8010         return(NULL);
8011     ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0);
8012     if (ret == NULL)
8013         return(NULL);
8014     SKIP_BLANKS
8015 parse_quantifier:
8016     if (CUR == '{') {
8017         int min, max;
8018 
8019         NEXT
8020 	min = xmlExpParseNumber(ctxt);
8021 	if (min < 0) {
8022 	    xmlExpFree(ctxt, ret);
8023 	    return(NULL);
8024 	}
8025 	SKIP_BLANKS
8026 	if (CUR == ',') {
8027 	    NEXT
8028 	    max = xmlExpParseNumber(ctxt);
8029 	    SKIP_BLANKS
8030 	} else
8031 	    max = min;
8032 	if (CUR != '}') {
8033 	    xmlExpFree(ctxt, ret);
8034 	    return(NULL);
8035 	}
8036         NEXT
8037 	ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
8038 	                         min, max);
8039 	SKIP_BLANKS
8040     } else if (CUR == '?') {
8041         NEXT
8042 	ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
8043 	                         0, 1);
8044 	SKIP_BLANKS
8045     } else if (CUR == '+') {
8046         NEXT
8047 	ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
8048 	                         1, -1);
8049 	SKIP_BLANKS
8050     } else if (CUR == '*') {
8051         NEXT
8052 	ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
8053 	                         0, -1);
8054 	SKIP_BLANKS
8055     }
8056     return(ret);
8057 }
8058 
8059 
8060 static xmlExpNodePtr
xmlExpParseSeq(xmlExpCtxtPtr ctxt)8061 xmlExpParseSeq(xmlExpCtxtPtr ctxt) {
8062     xmlExpNodePtr ret, right;
8063 
8064     ret = xmlExpParseOr(ctxt);
8065     SKIP_BLANKS
8066     while (CUR == '|') {
8067         NEXT
8068 	right = xmlExpParseOr(ctxt);
8069 	if (right == NULL) {
8070 	    xmlExpFree(ctxt, ret);
8071 	    return(NULL);
8072 	}
8073 	ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0);
8074 	if (ret == NULL)
8075 	    return(NULL);
8076     }
8077     return(ret);
8078 }
8079 
8080 static xmlExpNodePtr
xmlExpParseExpr(xmlExpCtxtPtr ctxt)8081 xmlExpParseExpr(xmlExpCtxtPtr ctxt) {
8082     xmlExpNodePtr ret, right;
8083 
8084     ret = xmlExpParseSeq(ctxt);
8085     SKIP_BLANKS
8086     while (CUR == ',') {
8087         NEXT
8088 	right = xmlExpParseSeq(ctxt);
8089 	if (right == NULL) {
8090 	    xmlExpFree(ctxt, ret);
8091 	    return(NULL);
8092 	}
8093 	ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0);
8094 	if (ret == NULL)
8095 	    return(NULL);
8096     }
8097     return(ret);
8098 }
8099 
8100 /**
8101  * xmlExpParse:
8102  * @ctxt: the expressions context
8103  * @expr: the 0 terminated string
8104  *
8105  * Minimal parser for regexps, it understand the following constructs
8106  *  - string terminals
8107  *  - choice operator |
8108  *  - sequence operator ,
8109  *  - subexpressions (...)
8110  *  - usual cardinality operators + * and ?
8111  *  - finite sequences  { min, max }
8112  *  - infinite sequences { min, * }
8113  * There is minimal checkings made especially no checking on strings values
8114  *
8115  * Returns a new expression or NULL in case of failure
8116  */
8117 xmlExpNodePtr
xmlExpParse(xmlExpCtxtPtr ctxt,const char * expr)8118 xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) {
8119     xmlExpNodePtr ret;
8120 
8121     ctxt->expr = expr;
8122     ctxt->cur = expr;
8123 
8124     ret = xmlExpParseExpr(ctxt);
8125     SKIP_BLANKS
8126     if (*ctxt->cur != 0) {
8127         xmlExpFree(ctxt, ret);
8128         return(NULL);
8129     }
8130     return(ret);
8131 }
8132 
8133 static void
xmlExpDumpInt(xmlBufferPtr buf,xmlExpNodePtr expr,int glob)8134 xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) {
8135     xmlExpNodePtr c;
8136 
8137     if (expr == NULL) return;
8138     if (glob) xmlBufferWriteChar(buf, "(");
8139     switch (expr->type) {
8140         case XML_EXP_EMPTY:
8141 	    xmlBufferWriteChar(buf, "empty");
8142 	    break;
8143         case XML_EXP_FORBID:
8144 	    xmlBufferWriteChar(buf, "forbidden");
8145 	    break;
8146         case XML_EXP_ATOM:
8147 	    xmlBufferWriteCHAR(buf, expr->exp_str);
8148 	    break;
8149         case XML_EXP_SEQ:
8150 	    c = expr->exp_left;
8151 	    if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8152 	        xmlExpDumpInt(buf, c, 1);
8153 	    else
8154 	        xmlExpDumpInt(buf, c, 0);
8155 	    xmlBufferWriteChar(buf, " , ");
8156 	    c = expr->exp_right;
8157 	    if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8158 	        xmlExpDumpInt(buf, c, 1);
8159 	    else
8160 	        xmlExpDumpInt(buf, c, 0);
8161             break;
8162         case XML_EXP_OR:
8163 	    c = expr->exp_left;
8164 	    if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8165 	        xmlExpDumpInt(buf, c, 1);
8166 	    else
8167 	        xmlExpDumpInt(buf, c, 0);
8168 	    xmlBufferWriteChar(buf, " | ");
8169 	    c = expr->exp_right;
8170 	    if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8171 	        xmlExpDumpInt(buf, c, 1);
8172 	    else
8173 	        xmlExpDumpInt(buf, c, 0);
8174             break;
8175         case XML_EXP_COUNT: {
8176 	    char rep[40];
8177 
8178 	    c = expr->exp_left;
8179 	    if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8180 	        xmlExpDumpInt(buf, c, 1);
8181 	    else
8182 	        xmlExpDumpInt(buf, c, 0);
8183 	    if ((expr->exp_min == 0) && (expr->exp_max == 1)) {
8184 		rep[0] = '?';
8185 		rep[1] = 0;
8186 	    } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) {
8187 		rep[0] = '*';
8188 		rep[1] = 0;
8189 	    } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) {
8190 		rep[0] = '+';
8191 		rep[1] = 0;
8192 	    } else if (expr->exp_max == expr->exp_min) {
8193 	        snprintf(rep, 39, "{%d}", expr->exp_min);
8194 	    } else if (expr->exp_max < 0) {
8195 	        snprintf(rep, 39, "{%d,inf}", expr->exp_min);
8196 	    } else {
8197 	        snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max);
8198 	    }
8199 	    rep[39] = 0;
8200 	    xmlBufferWriteChar(buf, rep);
8201 	    break;
8202 	}
8203 	default:
8204 	    fprintf(stderr, "Error in tree\n");
8205     }
8206     if (glob)
8207         xmlBufferWriteChar(buf, ")");
8208 }
8209 /**
8210  * xmlExpDump:
8211  * @buf:  a buffer to receive the output
8212  * @expr:  the compiled expression
8213  *
8214  * Serialize the expression as compiled to the buffer
8215  */
8216 void
xmlExpDump(xmlBufferPtr buf,xmlExpNodePtr expr)8217 xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) {
8218     if ((buf == NULL) || (expr == NULL))
8219         return;
8220     xmlExpDumpInt(buf, expr, 0);
8221 }
8222 
8223 /**
8224  * xmlExpMaxToken:
8225  * @expr: a compiled expression
8226  *
8227  * Indicate the maximum number of input a expression can accept
8228  *
8229  * Returns the maximum length or -1 in case of error
8230  */
8231 int
xmlExpMaxToken(xmlExpNodePtr expr)8232 xmlExpMaxToken(xmlExpNodePtr expr) {
8233     if (expr == NULL)
8234         return(-1);
8235     return(expr->c_max);
8236 }
8237 
8238 /**
8239  * xmlExpCtxtNbNodes:
8240  * @ctxt: an expression context
8241  *
8242  * Debugging facility provides the number of allocated nodes at a that point
8243  *
8244  * Returns the number of nodes in use or -1 in case of error
8245  */
8246 int
xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt)8247 xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) {
8248     if (ctxt == NULL)
8249         return(-1);
8250     return(ctxt->nb_nodes);
8251 }
8252 
8253 /**
8254  * xmlExpCtxtNbCons:
8255  * @ctxt: an expression context
8256  *
8257  * Debugging facility provides the number of allocated nodes over lifetime
8258  *
8259  * Returns the number of nodes ever allocated or -1 in case of error
8260  */
8261 int
xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt)8262 xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) {
8263     if (ctxt == NULL)
8264         return(-1);
8265     return(ctxt->nb_cons);
8266 }
8267 
8268 #endif /* LIBXML_EXPR_ENABLED */
8269 #define bottom_xmlregexp
8270 #include "elfgcchack.h"
8271 #endif /* LIBXML_REGEXP_ENABLED */
8272