1 /*
2 * regexp.c: generic and extensible Regular Expression engine
3 *
4 * Basically designed with the purpose of compiling regexps for
5 * the variety of validation/schemas mechanisms now available in
6 * XML related specifications these include:
7 * - XML-1.0 DTD validation
8 * - XML Schemas structure part 1
9 * - XML Schemas Datatypes part 2 especially Appendix F
10 * - RELAX-NG/TREX i.e. the counter proposal
11 *
12 * See Copyright for the status of this software.
13 *
14 * Daniel Veillard <veillard@redhat.com>
15 */
16
17 #define IN_LIBXML
18 #include "libxml.h"
19
20 #ifdef LIBXML_REGEXP_ENABLED
21
22 /* #define DEBUG_ERR */
23
24 #include <stdio.h>
25 #include <string.h>
26 #ifdef HAVE_LIMITS_H
27 #include <limits.h>
28 #endif
29 #ifdef HAVE_STDINT_H
30 #include <stdint.h>
31 #endif
32
33 #include <libxml/tree.h>
34 #include <libxml/parserInternals.h>
35 #include <libxml/xmlregexp.h>
36 #include <libxml/xmlautomata.h>
37 #include <libxml/xmlunicode.h>
38
39 #ifndef INT_MAX
40 #define INT_MAX 123456789 /* easy to flag and big enough for our needs */
41 #endif
42 #ifndef SIZE_MAX
43 #define SIZE_MAX ((size_t) -1)
44 #endif
45
46 /* #define DEBUG_REGEXP_GRAPH */
47 /* #define DEBUG_REGEXP_EXEC */
48 /* #define DEBUG_PUSH */
49 /* #define DEBUG_COMPACTION */
50
51 #define MAX_PUSH 10000000
52
53 #ifdef ERROR
54 #undef ERROR
55 #endif
56 #define ERROR(str) \
57 ctxt->error = XML_REGEXP_COMPILE_ERROR; \
58 xmlRegexpErrCompile(ctxt, str);
59 #define NEXT ctxt->cur++
60 #define CUR (*(ctxt->cur))
61 #define NXT(index) (ctxt->cur[index])
62
63 #define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
64 #define NEXTL(l) ctxt->cur += l;
65 #define XML_REG_STRING_SEPARATOR '|'
66 /*
67 * Need PREV to check on a '-' within a Character Group. May only be used
68 * when it's guaranteed that cur is not at the beginning of ctxt->string!
69 */
70 #define PREV (ctxt->cur[-1])
71
72 /**
73 * TODO:
74 *
75 * macro to flag unimplemented blocks
76 */
77 #define TODO \
78 xmlGenericError(xmlGenericErrorContext, \
79 "Unimplemented block at %s:%d\n", \
80 __FILE__, __LINE__);
81
82 /************************************************************************
83 * *
84 * Datatypes and structures *
85 * *
86 ************************************************************************/
87
88 /*
89 * Note: the order of the enums below is significant, do not shuffle
90 */
91 typedef enum {
92 XML_REGEXP_EPSILON = 1,
93 XML_REGEXP_CHARVAL,
94 XML_REGEXP_RANGES,
95 XML_REGEXP_SUBREG, /* used for () sub regexps */
96 XML_REGEXP_STRING,
97 XML_REGEXP_ANYCHAR, /* . */
98 XML_REGEXP_ANYSPACE, /* \s */
99 XML_REGEXP_NOTSPACE, /* \S */
100 XML_REGEXP_INITNAME, /* \l */
101 XML_REGEXP_NOTINITNAME, /* \L */
102 XML_REGEXP_NAMECHAR, /* \c */
103 XML_REGEXP_NOTNAMECHAR, /* \C */
104 XML_REGEXP_DECIMAL, /* \d */
105 XML_REGEXP_NOTDECIMAL, /* \D */
106 XML_REGEXP_REALCHAR, /* \w */
107 XML_REGEXP_NOTREALCHAR, /* \W */
108 XML_REGEXP_LETTER = 100,
109 XML_REGEXP_LETTER_UPPERCASE,
110 XML_REGEXP_LETTER_LOWERCASE,
111 XML_REGEXP_LETTER_TITLECASE,
112 XML_REGEXP_LETTER_MODIFIER,
113 XML_REGEXP_LETTER_OTHERS,
114 XML_REGEXP_MARK,
115 XML_REGEXP_MARK_NONSPACING,
116 XML_REGEXP_MARK_SPACECOMBINING,
117 XML_REGEXP_MARK_ENCLOSING,
118 XML_REGEXP_NUMBER,
119 XML_REGEXP_NUMBER_DECIMAL,
120 XML_REGEXP_NUMBER_LETTER,
121 XML_REGEXP_NUMBER_OTHERS,
122 XML_REGEXP_PUNCT,
123 XML_REGEXP_PUNCT_CONNECTOR,
124 XML_REGEXP_PUNCT_DASH,
125 XML_REGEXP_PUNCT_OPEN,
126 XML_REGEXP_PUNCT_CLOSE,
127 XML_REGEXP_PUNCT_INITQUOTE,
128 XML_REGEXP_PUNCT_FINQUOTE,
129 XML_REGEXP_PUNCT_OTHERS,
130 XML_REGEXP_SEPAR,
131 XML_REGEXP_SEPAR_SPACE,
132 XML_REGEXP_SEPAR_LINE,
133 XML_REGEXP_SEPAR_PARA,
134 XML_REGEXP_SYMBOL,
135 XML_REGEXP_SYMBOL_MATH,
136 XML_REGEXP_SYMBOL_CURRENCY,
137 XML_REGEXP_SYMBOL_MODIFIER,
138 XML_REGEXP_SYMBOL_OTHERS,
139 XML_REGEXP_OTHER,
140 XML_REGEXP_OTHER_CONTROL,
141 XML_REGEXP_OTHER_FORMAT,
142 XML_REGEXP_OTHER_PRIVATE,
143 XML_REGEXP_OTHER_NA,
144 XML_REGEXP_BLOCK_NAME
145 } xmlRegAtomType;
146
147 typedef enum {
148 XML_REGEXP_QUANT_EPSILON = 1,
149 XML_REGEXP_QUANT_ONCE,
150 XML_REGEXP_QUANT_OPT,
151 XML_REGEXP_QUANT_MULT,
152 XML_REGEXP_QUANT_PLUS,
153 XML_REGEXP_QUANT_ONCEONLY,
154 XML_REGEXP_QUANT_ALL,
155 XML_REGEXP_QUANT_RANGE
156 } xmlRegQuantType;
157
158 typedef enum {
159 XML_REGEXP_START_STATE = 1,
160 XML_REGEXP_FINAL_STATE,
161 XML_REGEXP_TRANS_STATE,
162 XML_REGEXP_SINK_STATE,
163 XML_REGEXP_UNREACH_STATE
164 } xmlRegStateType;
165
166 typedef enum {
167 XML_REGEXP_MARK_NORMAL = 0,
168 XML_REGEXP_MARK_START,
169 XML_REGEXP_MARK_VISITED
170 } xmlRegMarkedType;
171
172 typedef struct _xmlRegRange xmlRegRange;
173 typedef xmlRegRange *xmlRegRangePtr;
174
175 struct _xmlRegRange {
176 int neg; /* 0 normal, 1 not, 2 exclude */
177 xmlRegAtomType type;
178 int start;
179 int end;
180 xmlChar *blockName;
181 };
182
183 typedef struct _xmlRegAtom xmlRegAtom;
184 typedef xmlRegAtom *xmlRegAtomPtr;
185
186 typedef struct _xmlAutomataState xmlRegState;
187 typedef xmlRegState *xmlRegStatePtr;
188
189 struct _xmlRegAtom {
190 int no;
191 xmlRegAtomType type;
192 xmlRegQuantType quant;
193 int min;
194 int max;
195
196 void *valuep;
197 void *valuep2;
198 int neg;
199 int codepoint;
200 xmlRegStatePtr start;
201 xmlRegStatePtr start0;
202 xmlRegStatePtr stop;
203 int maxRanges;
204 int nbRanges;
205 xmlRegRangePtr *ranges;
206 void *data;
207 };
208
209 typedef struct _xmlRegCounter xmlRegCounter;
210 typedef xmlRegCounter *xmlRegCounterPtr;
211
212 struct _xmlRegCounter {
213 int min;
214 int max;
215 };
216
217 typedef struct _xmlRegTrans xmlRegTrans;
218 typedef xmlRegTrans *xmlRegTransPtr;
219
220 struct _xmlRegTrans {
221 xmlRegAtomPtr atom;
222 int to;
223 int counter;
224 int count;
225 int nd;
226 };
227
228 struct _xmlAutomataState {
229 xmlRegStateType type;
230 xmlRegMarkedType mark;
231 xmlRegMarkedType markd;
232 xmlRegMarkedType reached;
233 int no;
234 int maxTrans;
235 int nbTrans;
236 xmlRegTrans *trans;
237 /* knowing states pointing to us can speed things up */
238 int maxTransTo;
239 int nbTransTo;
240 int *transTo;
241 };
242
243 typedef struct _xmlAutomata xmlRegParserCtxt;
244 typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;
245
246 #define AM_AUTOMATA_RNG 1
247
248 struct _xmlAutomata {
249 xmlChar *string;
250 xmlChar *cur;
251
252 int error;
253 int neg;
254
255 xmlRegStatePtr start;
256 xmlRegStatePtr end;
257 xmlRegStatePtr state;
258
259 xmlRegAtomPtr atom;
260
261 int maxAtoms;
262 int nbAtoms;
263 xmlRegAtomPtr *atoms;
264
265 int maxStates;
266 int nbStates;
267 xmlRegStatePtr *states;
268
269 int maxCounters;
270 int nbCounters;
271 xmlRegCounter *counters;
272
273 int determinist;
274 int negs;
275 int flags;
276
277 int depth;
278 };
279
280 struct _xmlRegexp {
281 xmlChar *string;
282 int nbStates;
283 xmlRegStatePtr *states;
284 int nbAtoms;
285 xmlRegAtomPtr *atoms;
286 int nbCounters;
287 xmlRegCounter *counters;
288 int determinist;
289 int flags;
290 /*
291 * That's the compact form for determinists automatas
292 */
293 int nbstates;
294 int *compact;
295 void **transdata;
296 int nbstrings;
297 xmlChar **stringMap;
298 };
299
300 typedef struct _xmlRegExecRollback xmlRegExecRollback;
301 typedef xmlRegExecRollback *xmlRegExecRollbackPtr;
302
303 struct _xmlRegExecRollback {
304 xmlRegStatePtr state;/* the current state */
305 int index; /* the index in the input stack */
306 int nextbranch; /* the next transition to explore in that state */
307 int *counts; /* save the automata state if it has some */
308 };
309
310 typedef struct _xmlRegInputToken xmlRegInputToken;
311 typedef xmlRegInputToken *xmlRegInputTokenPtr;
312
313 struct _xmlRegInputToken {
314 xmlChar *value;
315 void *data;
316 };
317
318 struct _xmlRegExecCtxt {
319 int status; /* execution status != 0 indicate an error */
320 int determinist; /* did we find an indeterministic behaviour */
321 xmlRegexpPtr comp; /* the compiled regexp */
322 xmlRegExecCallbacks callback;
323 void *data;
324
325 xmlRegStatePtr state;/* the current state */
326 int transno; /* the current transition on that state */
327 int transcount; /* the number of chars in char counted transitions */
328
329 /*
330 * A stack of rollback states
331 */
332 int maxRollbacks;
333 int nbRollbacks;
334 xmlRegExecRollback *rollbacks;
335
336 /*
337 * The state of the automata if any
338 */
339 int *counts;
340
341 /*
342 * The input stack
343 */
344 int inputStackMax;
345 int inputStackNr;
346 int index;
347 int *charStack;
348 const xmlChar *inputString; /* when operating on characters */
349 xmlRegInputTokenPtr inputStack;/* when operating on strings */
350
351 /*
352 * error handling
353 */
354 int errStateNo; /* the error state number */
355 xmlRegStatePtr errState; /* the error state */
356 xmlChar *errString; /* the string raising the error */
357 int *errCounts; /* counters at the error state */
358 int nbPush;
359 };
360
361 #define REGEXP_ALL_COUNTER 0x123456
362 #define REGEXP_ALL_LAX_COUNTER 0x123457
363
364 static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
365 static void xmlRegFreeState(xmlRegStatePtr state);
366 static void xmlRegFreeAtom(xmlRegAtomPtr atom);
367 static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr);
368 static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint);
369 static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint,
370 int neg, int start, int end, const xmlChar *blockName);
371
372 void xmlAutomataSetFlags(xmlAutomataPtr am, int flags);
373
374 /************************************************************************
375 * *
376 * Regexp memory error handler *
377 * *
378 ************************************************************************/
379 /**
380 * xmlRegexpErrMemory:
381 * @extra: extra information
382 *
383 * Handle an out of memory condition
384 */
385 static void
xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt,const char * extra)386 xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra)
387 {
388 const char *regexp = NULL;
389 if (ctxt != NULL) {
390 regexp = (const char *) ctxt->string;
391 ctxt->error = XML_ERR_NO_MEMORY;
392 }
393 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
394 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
395 regexp, NULL, 0, 0,
396 "Memory allocation failed : %s\n", extra);
397 }
398
399 /**
400 * xmlRegexpErrCompile:
401 * @extra: extra information
402 *
403 * Handle a compilation failure
404 */
405 static void
xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt,const char * extra)406 xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra)
407 {
408 const char *regexp = NULL;
409 int idx = 0;
410
411 if (ctxt != NULL) {
412 regexp = (const char *) ctxt->string;
413 idx = ctxt->cur - ctxt->string;
414 ctxt->error = XML_REGEXP_COMPILE_ERROR;
415 }
416 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
417 XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra,
418 regexp, NULL, idx, 0,
419 "failed to compile: %s\n", extra);
420 }
421
422 /************************************************************************
423 * *
424 * Allocation/Deallocation *
425 * *
426 ************************************************************************/
427
428 static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
429
430 /**
431 * xmlRegCalloc2:
432 * @dim1: size of first dimension
433 * @dim2: size of second dimension
434 * @elemSize: size of element
435 *
436 * Allocate a two-dimensional array and set all elements to zero.
437 *
438 * Returns the new array or NULL in case of error.
439 */
440 static void*
xmlRegCalloc2(size_t dim1,size_t dim2,size_t elemSize)441 xmlRegCalloc2(size_t dim1, size_t dim2, size_t elemSize) {
442 size_t totalSize;
443 void *ret;
444
445 /* Check for overflow */
446 if (dim1 > SIZE_MAX / dim2 / elemSize)
447 return (NULL);
448 totalSize = dim1 * dim2 * elemSize;
449 ret = xmlMalloc(totalSize);
450 if (ret != NULL)
451 memset(ret, 0, totalSize);
452 return (ret);
453 }
454
455 /**
456 * xmlRegEpxFromParse:
457 * @ctxt: the parser context used to build it
458 *
459 * Allocate a new regexp and fill it with the result from the parser
460 *
461 * Returns the new regexp or NULL in case of error
462 */
463 static xmlRegexpPtr
xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt)464 xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
465 xmlRegexpPtr ret;
466
467 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
468 if (ret == NULL) {
469 xmlRegexpErrMemory(ctxt, "compiling regexp");
470 return(NULL);
471 }
472 memset(ret, 0, sizeof(xmlRegexp));
473 ret->string = ctxt->string;
474 ret->nbStates = ctxt->nbStates;
475 ret->states = ctxt->states;
476 ret->nbAtoms = ctxt->nbAtoms;
477 ret->atoms = ctxt->atoms;
478 ret->nbCounters = ctxt->nbCounters;
479 ret->counters = ctxt->counters;
480 ret->determinist = ctxt->determinist;
481 ret->flags = ctxt->flags;
482 if (ret->determinist == -1) {
483 xmlRegexpIsDeterminist(ret);
484 }
485
486 if ((ret->determinist != 0) &&
487 (ret->nbCounters == 0) &&
488 (ctxt->negs == 0) &&
489 (ret->atoms != NULL) &&
490 (ret->atoms[0] != NULL) &&
491 (ret->atoms[0]->type == XML_REGEXP_STRING)) {
492 int i, j, nbstates = 0, nbatoms = 0;
493 int *stateRemap;
494 int *stringRemap;
495 int *transitions;
496 void **transdata;
497 xmlChar **stringMap;
498 xmlChar *value;
499
500 /*
501 * Switch to a compact representation
502 * 1/ counting the effective number of states left
503 * 2/ counting the unique number of atoms, and check that
504 * they are all of the string type
505 * 3/ build a table state x atom for the transitions
506 */
507
508 stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
509 if (stateRemap == NULL) {
510 xmlRegexpErrMemory(ctxt, "compiling regexp");
511 xmlFree(ret);
512 return(NULL);
513 }
514 for (i = 0;i < ret->nbStates;i++) {
515 if (ret->states[i] != NULL) {
516 stateRemap[i] = nbstates;
517 nbstates++;
518 } else {
519 stateRemap[i] = -1;
520 }
521 }
522 #ifdef DEBUG_COMPACTION
523 printf("Final: %d states\n", nbstates);
524 #endif
525 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
526 if (stringMap == NULL) {
527 xmlRegexpErrMemory(ctxt, "compiling regexp");
528 xmlFree(stateRemap);
529 xmlFree(ret);
530 return(NULL);
531 }
532 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
533 if (stringRemap == NULL) {
534 xmlRegexpErrMemory(ctxt, "compiling regexp");
535 xmlFree(stringMap);
536 xmlFree(stateRemap);
537 xmlFree(ret);
538 return(NULL);
539 }
540 for (i = 0;i < ret->nbAtoms;i++) {
541 if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
542 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
543 value = ret->atoms[i]->valuep;
544 for (j = 0;j < nbatoms;j++) {
545 if (xmlStrEqual(stringMap[j], value)) {
546 stringRemap[i] = j;
547 break;
548 }
549 }
550 if (j >= nbatoms) {
551 stringRemap[i] = nbatoms;
552 stringMap[nbatoms] = xmlStrdup(value);
553 if (stringMap[nbatoms] == NULL) {
554 for (i = 0;i < nbatoms;i++)
555 xmlFree(stringMap[i]);
556 xmlFree(stringRemap);
557 xmlFree(stringMap);
558 xmlFree(stateRemap);
559 xmlFree(ret);
560 return(NULL);
561 }
562 nbatoms++;
563 }
564 } else {
565 xmlFree(stateRemap);
566 xmlFree(stringRemap);
567 for (i = 0;i < nbatoms;i++)
568 xmlFree(stringMap[i]);
569 xmlFree(stringMap);
570 xmlFree(ret);
571 return(NULL);
572 }
573 }
574 #ifdef DEBUG_COMPACTION
575 printf("Final: %d atoms\n", nbatoms);
576 #endif
577 transitions = (int *) xmlRegCalloc2(nbstates + 1, nbatoms + 1,
578 sizeof(int));
579 if (transitions == NULL) {
580 xmlFree(stateRemap);
581 xmlFree(stringRemap);
582 for (i = 0;i < nbatoms;i++)
583 xmlFree(stringMap[i]);
584 xmlFree(stringMap);
585 xmlFree(ret);
586 return(NULL);
587 }
588
589 /*
590 * Allocate the transition table. The first entry for each
591 * state corresponds to the state type.
592 */
593 transdata = NULL;
594
595 for (i = 0;i < ret->nbStates;i++) {
596 int stateno, atomno, targetno, prev;
597 xmlRegStatePtr state;
598 xmlRegTransPtr trans;
599
600 stateno = stateRemap[i];
601 if (stateno == -1)
602 continue;
603 state = ret->states[i];
604
605 transitions[stateno * (nbatoms + 1)] = state->type;
606
607 for (j = 0;j < state->nbTrans;j++) {
608 trans = &(state->trans[j]);
609 if ((trans->to == -1) || (trans->atom == NULL))
610 continue;
611 atomno = stringRemap[trans->atom->no];
612 if ((trans->atom->data != NULL) && (transdata == NULL)) {
613 transdata = (void **) xmlRegCalloc2(nbstates, nbatoms,
614 sizeof(void *));
615 if (transdata == NULL) {
616 xmlRegexpErrMemory(ctxt, "compiling regexp");
617 break;
618 }
619 }
620 targetno = stateRemap[trans->to];
621 /*
622 * if the same atom can generate transitions to 2 different
623 * states then it means the automata is not deterministic and
624 * the compact form can't be used !
625 */
626 prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
627 if (prev != 0) {
628 if (prev != targetno + 1) {
629 ret->determinist = 0;
630 #ifdef DEBUG_COMPACTION
631 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
632 i, j, trans->atom->no, trans->to, atomno, targetno);
633 printf(" previous to is %d\n", prev);
634 #endif
635 if (transdata != NULL)
636 xmlFree(transdata);
637 xmlFree(transitions);
638 xmlFree(stateRemap);
639 xmlFree(stringRemap);
640 for (i = 0;i < nbatoms;i++)
641 xmlFree(stringMap[i]);
642 xmlFree(stringMap);
643 goto not_determ;
644 }
645 } else {
646 #if 0
647 printf("State %d trans %d: atom %d to %d : %d to %d\n",
648 i, j, trans->atom->no, trans->to, atomno, targetno);
649 #endif
650 transitions[stateno * (nbatoms + 1) + atomno + 1] =
651 targetno + 1; /* to avoid 0 */
652 if (transdata != NULL)
653 transdata[stateno * nbatoms + atomno] =
654 trans->atom->data;
655 }
656 }
657 }
658 ret->determinist = 1;
659 #ifdef DEBUG_COMPACTION
660 /*
661 * Debug
662 */
663 for (i = 0;i < nbstates;i++) {
664 for (j = 0;j < nbatoms + 1;j++) {
665 printf("%02d ", transitions[i * (nbatoms + 1) + j]);
666 }
667 printf("\n");
668 }
669 printf("\n");
670 #endif
671 /*
672 * Cleanup of the old data
673 */
674 if (ret->states != NULL) {
675 for (i = 0;i < ret->nbStates;i++)
676 xmlRegFreeState(ret->states[i]);
677 xmlFree(ret->states);
678 }
679 ret->states = NULL;
680 ret->nbStates = 0;
681 if (ret->atoms != NULL) {
682 for (i = 0;i < ret->nbAtoms;i++)
683 xmlRegFreeAtom(ret->atoms[i]);
684 xmlFree(ret->atoms);
685 }
686 ret->atoms = NULL;
687 ret->nbAtoms = 0;
688
689 ret->compact = transitions;
690 ret->transdata = transdata;
691 ret->stringMap = stringMap;
692 ret->nbstrings = nbatoms;
693 ret->nbstates = nbstates;
694 xmlFree(stateRemap);
695 xmlFree(stringRemap);
696 }
697 not_determ:
698 ctxt->string = NULL;
699 ctxt->nbStates = 0;
700 ctxt->states = NULL;
701 ctxt->nbAtoms = 0;
702 ctxt->atoms = NULL;
703 ctxt->nbCounters = 0;
704 ctxt->counters = NULL;
705 return(ret);
706 }
707
708 /**
709 * xmlRegNewParserCtxt:
710 * @string: the string to parse
711 *
712 * Allocate a new regexp parser context
713 *
714 * Returns the new context or NULL in case of error
715 */
716 static xmlRegParserCtxtPtr
xmlRegNewParserCtxt(const xmlChar * string)717 xmlRegNewParserCtxt(const xmlChar *string) {
718 xmlRegParserCtxtPtr ret;
719
720 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
721 if (ret == NULL)
722 return(NULL);
723 memset(ret, 0, sizeof(xmlRegParserCtxt));
724 if (string != NULL)
725 ret->string = xmlStrdup(string);
726 ret->cur = ret->string;
727 ret->neg = 0;
728 ret->negs = 0;
729 ret->error = 0;
730 ret->determinist = -1;
731 return(ret);
732 }
733
734 /**
735 * xmlRegNewRange:
736 * @ctxt: the regexp parser context
737 * @neg: is that negative
738 * @type: the type of range
739 * @start: the start codepoint
740 * @end: the end codepoint
741 *
742 * Allocate a new regexp range
743 *
744 * Returns the new range or NULL in case of error
745 */
746 static xmlRegRangePtr
xmlRegNewRange(xmlRegParserCtxtPtr ctxt,int neg,xmlRegAtomType type,int start,int end)747 xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
748 int neg, xmlRegAtomType type, int start, int end) {
749 xmlRegRangePtr ret;
750
751 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
752 if (ret == NULL) {
753 xmlRegexpErrMemory(ctxt, "allocating range");
754 return(NULL);
755 }
756 ret->neg = neg;
757 ret->type = type;
758 ret->start = start;
759 ret->end = end;
760 return(ret);
761 }
762
763 /**
764 * xmlRegFreeRange:
765 * @range: the regexp range
766 *
767 * Free a regexp range
768 */
769 static void
xmlRegFreeRange(xmlRegRangePtr range)770 xmlRegFreeRange(xmlRegRangePtr range) {
771 if (range == NULL)
772 return;
773
774 if (range->blockName != NULL)
775 xmlFree(range->blockName);
776 xmlFree(range);
777 }
778
779 /**
780 * xmlRegCopyRange:
781 * @range: the regexp range
782 *
783 * Copy a regexp range
784 *
785 * Returns the new copy or NULL in case of error.
786 */
787 static xmlRegRangePtr
xmlRegCopyRange(xmlRegParserCtxtPtr ctxt,xmlRegRangePtr range)788 xmlRegCopyRange(xmlRegParserCtxtPtr ctxt, xmlRegRangePtr range) {
789 xmlRegRangePtr ret;
790
791 if (range == NULL)
792 return(NULL);
793
794 ret = xmlRegNewRange(ctxt, range->neg, range->type, range->start,
795 range->end);
796 if (ret == NULL)
797 return(NULL);
798 if (range->blockName != NULL) {
799 ret->blockName = xmlStrdup(range->blockName);
800 if (ret->blockName == NULL) {
801 xmlRegexpErrMemory(ctxt, "allocating range");
802 xmlRegFreeRange(ret);
803 return(NULL);
804 }
805 }
806 return(ret);
807 }
808
809 /**
810 * xmlRegNewAtom:
811 * @ctxt: the regexp parser context
812 * @type: the type of atom
813 *
814 * Allocate a new atom
815 *
816 * Returns the new atom or NULL in case of error
817 */
818 static xmlRegAtomPtr
xmlRegNewAtom(xmlRegParserCtxtPtr ctxt,xmlRegAtomType type)819 xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
820 xmlRegAtomPtr ret;
821
822 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
823 if (ret == NULL) {
824 xmlRegexpErrMemory(ctxt, "allocating atom");
825 return(NULL);
826 }
827 memset(ret, 0, sizeof(xmlRegAtom));
828 ret->type = type;
829 ret->quant = XML_REGEXP_QUANT_ONCE;
830 ret->min = 0;
831 ret->max = 0;
832 return(ret);
833 }
834
835 /**
836 * xmlRegFreeAtom:
837 * @atom: the regexp atom
838 *
839 * Free a regexp atom
840 */
841 static void
xmlRegFreeAtom(xmlRegAtomPtr atom)842 xmlRegFreeAtom(xmlRegAtomPtr atom) {
843 int i;
844
845 if (atom == NULL)
846 return;
847
848 for (i = 0;i < atom->nbRanges;i++)
849 xmlRegFreeRange(atom->ranges[i]);
850 if (atom->ranges != NULL)
851 xmlFree(atom->ranges);
852 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL))
853 xmlFree(atom->valuep);
854 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL))
855 xmlFree(atom->valuep2);
856 if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL))
857 xmlFree(atom->valuep);
858 xmlFree(atom);
859 }
860
861 /**
862 * xmlRegCopyAtom:
863 * @ctxt: the regexp parser context
864 * @atom: the original atom
865 *
866 * Allocate a new regexp range
867 *
868 * Returns the new atom or NULL in case of error
869 */
870 static xmlRegAtomPtr
xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt,xmlRegAtomPtr atom)871 xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
872 xmlRegAtomPtr ret;
873
874 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
875 if (ret == NULL) {
876 xmlRegexpErrMemory(ctxt, "copying atom");
877 return(NULL);
878 }
879 memset(ret, 0, sizeof(xmlRegAtom));
880 ret->type = atom->type;
881 ret->quant = atom->quant;
882 ret->min = atom->min;
883 ret->max = atom->max;
884 if (atom->nbRanges > 0) {
885 int i;
886
887 ret->ranges = (xmlRegRangePtr *) xmlMalloc(sizeof(xmlRegRangePtr) *
888 atom->nbRanges);
889 if (ret->ranges == NULL) {
890 xmlRegexpErrMemory(ctxt, "copying atom");
891 goto error;
892 }
893 for (i = 0;i < atom->nbRanges;i++) {
894 ret->ranges[i] = xmlRegCopyRange(ctxt, atom->ranges[i]);
895 if (ret->ranges[i] == NULL)
896 goto error;
897 ret->nbRanges = i + 1;
898 }
899 }
900 return(ret);
901
902 error:
903 xmlRegFreeAtom(ret);
904 return(NULL);
905 }
906
907 static xmlRegStatePtr
xmlRegNewState(xmlRegParserCtxtPtr ctxt)908 xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
909 xmlRegStatePtr ret;
910
911 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
912 if (ret == NULL) {
913 xmlRegexpErrMemory(ctxt, "allocating state");
914 return(NULL);
915 }
916 memset(ret, 0, sizeof(xmlRegState));
917 ret->type = XML_REGEXP_TRANS_STATE;
918 ret->mark = XML_REGEXP_MARK_NORMAL;
919 return(ret);
920 }
921
922 /**
923 * xmlRegFreeState:
924 * @state: the regexp state
925 *
926 * Free a regexp state
927 */
928 static void
xmlRegFreeState(xmlRegStatePtr state)929 xmlRegFreeState(xmlRegStatePtr state) {
930 if (state == NULL)
931 return;
932
933 if (state->trans != NULL)
934 xmlFree(state->trans);
935 if (state->transTo != NULL)
936 xmlFree(state->transTo);
937 xmlFree(state);
938 }
939
940 /**
941 * xmlRegFreeParserCtxt:
942 * @ctxt: the regexp parser context
943 *
944 * Free a regexp parser context
945 */
946 static void
xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt)947 xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
948 int i;
949 if (ctxt == NULL)
950 return;
951
952 if (ctxt->string != NULL)
953 xmlFree(ctxt->string);
954 if (ctxt->states != NULL) {
955 for (i = 0;i < ctxt->nbStates;i++)
956 xmlRegFreeState(ctxt->states[i]);
957 xmlFree(ctxt->states);
958 }
959 if (ctxt->atoms != NULL) {
960 for (i = 0;i < ctxt->nbAtoms;i++)
961 xmlRegFreeAtom(ctxt->atoms[i]);
962 xmlFree(ctxt->atoms);
963 }
964 if (ctxt->counters != NULL)
965 xmlFree(ctxt->counters);
966 xmlFree(ctxt);
967 }
968
969 /************************************************************************
970 * *
971 * Display of Data structures *
972 * *
973 ************************************************************************/
974
975 static void
xmlRegPrintAtomType(FILE * output,xmlRegAtomType type)976 xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
977 switch (type) {
978 case XML_REGEXP_EPSILON:
979 fprintf(output, "epsilon "); break;
980 case XML_REGEXP_CHARVAL:
981 fprintf(output, "charval "); break;
982 case XML_REGEXP_RANGES:
983 fprintf(output, "ranges "); break;
984 case XML_REGEXP_SUBREG:
985 fprintf(output, "subexpr "); break;
986 case XML_REGEXP_STRING:
987 fprintf(output, "string "); break;
988 case XML_REGEXP_ANYCHAR:
989 fprintf(output, "anychar "); break;
990 case XML_REGEXP_ANYSPACE:
991 fprintf(output, "anyspace "); break;
992 case XML_REGEXP_NOTSPACE:
993 fprintf(output, "notspace "); break;
994 case XML_REGEXP_INITNAME:
995 fprintf(output, "initname "); break;
996 case XML_REGEXP_NOTINITNAME:
997 fprintf(output, "notinitname "); break;
998 case XML_REGEXP_NAMECHAR:
999 fprintf(output, "namechar "); break;
1000 case XML_REGEXP_NOTNAMECHAR:
1001 fprintf(output, "notnamechar "); break;
1002 case XML_REGEXP_DECIMAL:
1003 fprintf(output, "decimal "); break;
1004 case XML_REGEXP_NOTDECIMAL:
1005 fprintf(output, "notdecimal "); break;
1006 case XML_REGEXP_REALCHAR:
1007 fprintf(output, "realchar "); break;
1008 case XML_REGEXP_NOTREALCHAR:
1009 fprintf(output, "notrealchar "); break;
1010 case XML_REGEXP_LETTER:
1011 fprintf(output, "LETTER "); break;
1012 case XML_REGEXP_LETTER_UPPERCASE:
1013 fprintf(output, "LETTER_UPPERCASE "); break;
1014 case XML_REGEXP_LETTER_LOWERCASE:
1015 fprintf(output, "LETTER_LOWERCASE "); break;
1016 case XML_REGEXP_LETTER_TITLECASE:
1017 fprintf(output, "LETTER_TITLECASE "); break;
1018 case XML_REGEXP_LETTER_MODIFIER:
1019 fprintf(output, "LETTER_MODIFIER "); break;
1020 case XML_REGEXP_LETTER_OTHERS:
1021 fprintf(output, "LETTER_OTHERS "); break;
1022 case XML_REGEXP_MARK:
1023 fprintf(output, "MARK "); break;
1024 case XML_REGEXP_MARK_NONSPACING:
1025 fprintf(output, "MARK_NONSPACING "); break;
1026 case XML_REGEXP_MARK_SPACECOMBINING:
1027 fprintf(output, "MARK_SPACECOMBINING "); break;
1028 case XML_REGEXP_MARK_ENCLOSING:
1029 fprintf(output, "MARK_ENCLOSING "); break;
1030 case XML_REGEXP_NUMBER:
1031 fprintf(output, "NUMBER "); break;
1032 case XML_REGEXP_NUMBER_DECIMAL:
1033 fprintf(output, "NUMBER_DECIMAL "); break;
1034 case XML_REGEXP_NUMBER_LETTER:
1035 fprintf(output, "NUMBER_LETTER "); break;
1036 case XML_REGEXP_NUMBER_OTHERS:
1037 fprintf(output, "NUMBER_OTHERS "); break;
1038 case XML_REGEXP_PUNCT:
1039 fprintf(output, "PUNCT "); break;
1040 case XML_REGEXP_PUNCT_CONNECTOR:
1041 fprintf(output, "PUNCT_CONNECTOR "); break;
1042 case XML_REGEXP_PUNCT_DASH:
1043 fprintf(output, "PUNCT_DASH "); break;
1044 case XML_REGEXP_PUNCT_OPEN:
1045 fprintf(output, "PUNCT_OPEN "); break;
1046 case XML_REGEXP_PUNCT_CLOSE:
1047 fprintf(output, "PUNCT_CLOSE "); break;
1048 case XML_REGEXP_PUNCT_INITQUOTE:
1049 fprintf(output, "PUNCT_INITQUOTE "); break;
1050 case XML_REGEXP_PUNCT_FINQUOTE:
1051 fprintf(output, "PUNCT_FINQUOTE "); break;
1052 case XML_REGEXP_PUNCT_OTHERS:
1053 fprintf(output, "PUNCT_OTHERS "); break;
1054 case XML_REGEXP_SEPAR:
1055 fprintf(output, "SEPAR "); break;
1056 case XML_REGEXP_SEPAR_SPACE:
1057 fprintf(output, "SEPAR_SPACE "); break;
1058 case XML_REGEXP_SEPAR_LINE:
1059 fprintf(output, "SEPAR_LINE "); break;
1060 case XML_REGEXP_SEPAR_PARA:
1061 fprintf(output, "SEPAR_PARA "); break;
1062 case XML_REGEXP_SYMBOL:
1063 fprintf(output, "SYMBOL "); break;
1064 case XML_REGEXP_SYMBOL_MATH:
1065 fprintf(output, "SYMBOL_MATH "); break;
1066 case XML_REGEXP_SYMBOL_CURRENCY:
1067 fprintf(output, "SYMBOL_CURRENCY "); break;
1068 case XML_REGEXP_SYMBOL_MODIFIER:
1069 fprintf(output, "SYMBOL_MODIFIER "); break;
1070 case XML_REGEXP_SYMBOL_OTHERS:
1071 fprintf(output, "SYMBOL_OTHERS "); break;
1072 case XML_REGEXP_OTHER:
1073 fprintf(output, "OTHER "); break;
1074 case XML_REGEXP_OTHER_CONTROL:
1075 fprintf(output, "OTHER_CONTROL "); break;
1076 case XML_REGEXP_OTHER_FORMAT:
1077 fprintf(output, "OTHER_FORMAT "); break;
1078 case XML_REGEXP_OTHER_PRIVATE:
1079 fprintf(output, "OTHER_PRIVATE "); break;
1080 case XML_REGEXP_OTHER_NA:
1081 fprintf(output, "OTHER_NA "); break;
1082 case XML_REGEXP_BLOCK_NAME:
1083 fprintf(output, "BLOCK "); break;
1084 }
1085 }
1086
1087 static void
xmlRegPrintQuantType(FILE * output,xmlRegQuantType type)1088 xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
1089 switch (type) {
1090 case XML_REGEXP_QUANT_EPSILON:
1091 fprintf(output, "epsilon "); break;
1092 case XML_REGEXP_QUANT_ONCE:
1093 fprintf(output, "once "); break;
1094 case XML_REGEXP_QUANT_OPT:
1095 fprintf(output, "? "); break;
1096 case XML_REGEXP_QUANT_MULT:
1097 fprintf(output, "* "); break;
1098 case XML_REGEXP_QUANT_PLUS:
1099 fprintf(output, "+ "); break;
1100 case XML_REGEXP_QUANT_RANGE:
1101 fprintf(output, "range "); break;
1102 case XML_REGEXP_QUANT_ONCEONLY:
1103 fprintf(output, "onceonly "); break;
1104 case XML_REGEXP_QUANT_ALL:
1105 fprintf(output, "all "); break;
1106 }
1107 }
1108 static void
xmlRegPrintRange(FILE * output,xmlRegRangePtr range)1109 xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
1110 fprintf(output, " range: ");
1111 if (range->neg)
1112 fprintf(output, "negative ");
1113 xmlRegPrintAtomType(output, range->type);
1114 fprintf(output, "%c - %c\n", range->start, range->end);
1115 }
1116
1117 static void
xmlRegPrintAtom(FILE * output,xmlRegAtomPtr atom)1118 xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
1119 fprintf(output, " atom: ");
1120 if (atom == NULL) {
1121 fprintf(output, "NULL\n");
1122 return;
1123 }
1124 if (atom->neg)
1125 fprintf(output, "not ");
1126 xmlRegPrintAtomType(output, atom->type);
1127 xmlRegPrintQuantType(output, atom->quant);
1128 if (atom->quant == XML_REGEXP_QUANT_RANGE)
1129 fprintf(output, "%d-%d ", atom->min, atom->max);
1130 if (atom->type == XML_REGEXP_STRING)
1131 fprintf(output, "'%s' ", (char *) atom->valuep);
1132 if (atom->type == XML_REGEXP_CHARVAL)
1133 fprintf(output, "char %c\n", atom->codepoint);
1134 else if (atom->type == XML_REGEXP_RANGES) {
1135 int i;
1136 fprintf(output, "%d entries\n", atom->nbRanges);
1137 for (i = 0; i < atom->nbRanges;i++)
1138 xmlRegPrintRange(output, atom->ranges[i]);
1139 } else if (atom->type == XML_REGEXP_SUBREG) {
1140 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
1141 } else {
1142 fprintf(output, "\n");
1143 }
1144 }
1145
1146 static void
xmlRegPrintTrans(FILE * output,xmlRegTransPtr trans)1147 xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
1148 fprintf(output, " trans: ");
1149 if (trans == NULL) {
1150 fprintf(output, "NULL\n");
1151 return;
1152 }
1153 if (trans->to < 0) {
1154 fprintf(output, "removed\n");
1155 return;
1156 }
1157 if (trans->nd != 0) {
1158 if (trans->nd == 2)
1159 fprintf(output, "last not determinist, ");
1160 else
1161 fprintf(output, "not determinist, ");
1162 }
1163 if (trans->counter >= 0) {
1164 fprintf(output, "counted %d, ", trans->counter);
1165 }
1166 if (trans->count == REGEXP_ALL_COUNTER) {
1167 fprintf(output, "all transition, ");
1168 } else if (trans->count >= 0) {
1169 fprintf(output, "count based %d, ", trans->count);
1170 }
1171 if (trans->atom == NULL) {
1172 fprintf(output, "epsilon to %d\n", trans->to);
1173 return;
1174 }
1175 if (trans->atom->type == XML_REGEXP_CHARVAL)
1176 fprintf(output, "char %c ", trans->atom->codepoint);
1177 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
1178 }
1179
1180 static void
xmlRegPrintState(FILE * output,xmlRegStatePtr state)1181 xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
1182 int i;
1183
1184 fprintf(output, " state: ");
1185 if (state == NULL) {
1186 fprintf(output, "NULL\n");
1187 return;
1188 }
1189 if (state->type == XML_REGEXP_START_STATE)
1190 fprintf(output, "START ");
1191 if (state->type == XML_REGEXP_FINAL_STATE)
1192 fprintf(output, "FINAL ");
1193
1194 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
1195 for (i = 0;i < state->nbTrans; i++) {
1196 xmlRegPrintTrans(output, &(state->trans[i]));
1197 }
1198 }
1199
1200 #ifdef DEBUG_REGEXP_GRAPH
1201 static void
xmlRegPrintCtxt(FILE * output,xmlRegParserCtxtPtr ctxt)1202 xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
1203 int i;
1204
1205 fprintf(output, " ctxt: ");
1206 if (ctxt == NULL) {
1207 fprintf(output, "NULL\n");
1208 return;
1209 }
1210 fprintf(output, "'%s' ", ctxt->string);
1211 if (ctxt->error)
1212 fprintf(output, "error ");
1213 if (ctxt->neg)
1214 fprintf(output, "neg ");
1215 fprintf(output, "\n");
1216 fprintf(output, "%d atoms:\n", ctxt->nbAtoms);
1217 for (i = 0;i < ctxt->nbAtoms; i++) {
1218 fprintf(output, " %02d ", i);
1219 xmlRegPrintAtom(output, ctxt->atoms[i]);
1220 }
1221 if (ctxt->atom != NULL) {
1222 fprintf(output, "current atom:\n");
1223 xmlRegPrintAtom(output, ctxt->atom);
1224 }
1225 fprintf(output, "%d states:", ctxt->nbStates);
1226 if (ctxt->start != NULL)
1227 fprintf(output, " start: %d", ctxt->start->no);
1228 if (ctxt->end != NULL)
1229 fprintf(output, " end: %d", ctxt->end->no);
1230 fprintf(output, "\n");
1231 for (i = 0;i < ctxt->nbStates; i++) {
1232 xmlRegPrintState(output, ctxt->states[i]);
1233 }
1234 fprintf(output, "%d counters:\n", ctxt->nbCounters);
1235 for (i = 0;i < ctxt->nbCounters; i++) {
1236 fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min,
1237 ctxt->counters[i].max);
1238 }
1239 }
1240 #endif
1241
1242 /************************************************************************
1243 * *
1244 * Finite Automata structures manipulations *
1245 * *
1246 ************************************************************************/
1247
1248 static void
xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt,xmlRegAtomPtr atom,int neg,xmlRegAtomType type,int start,int end,xmlChar * blockName)1249 xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
1250 int neg, xmlRegAtomType type, int start, int end,
1251 xmlChar *blockName) {
1252 xmlRegRangePtr range;
1253
1254 if (atom == NULL) {
1255 ERROR("add range: atom is NULL");
1256 return;
1257 }
1258 if (atom->type != XML_REGEXP_RANGES) {
1259 ERROR("add range: atom is not ranges");
1260 return;
1261 }
1262 if (atom->maxRanges == 0) {
1263 atom->maxRanges = 4;
1264 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges *
1265 sizeof(xmlRegRangePtr));
1266 if (atom->ranges == NULL) {
1267 xmlRegexpErrMemory(ctxt, "adding ranges");
1268 atom->maxRanges = 0;
1269 return;
1270 }
1271 } else if (atom->nbRanges >= atom->maxRanges) {
1272 xmlRegRangePtr *tmp;
1273 atom->maxRanges *= 2;
1274 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges *
1275 sizeof(xmlRegRangePtr));
1276 if (tmp == NULL) {
1277 xmlRegexpErrMemory(ctxt, "adding ranges");
1278 atom->maxRanges /= 2;
1279 return;
1280 }
1281 atom->ranges = tmp;
1282 }
1283 range = xmlRegNewRange(ctxt, neg, type, start, end);
1284 if (range == NULL)
1285 return;
1286 range->blockName = blockName;
1287 atom->ranges[atom->nbRanges++] = range;
1288
1289 }
1290
1291 static int
xmlRegGetCounter(xmlRegParserCtxtPtr ctxt)1292 xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
1293 if (ctxt->maxCounters == 0) {
1294 ctxt->maxCounters = 4;
1295 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters *
1296 sizeof(xmlRegCounter));
1297 if (ctxt->counters == NULL) {
1298 xmlRegexpErrMemory(ctxt, "allocating counter");
1299 ctxt->maxCounters = 0;
1300 return(-1);
1301 }
1302 } else if (ctxt->nbCounters >= ctxt->maxCounters) {
1303 xmlRegCounter *tmp;
1304 ctxt->maxCounters *= 2;
1305 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters *
1306 sizeof(xmlRegCounter));
1307 if (tmp == NULL) {
1308 xmlRegexpErrMemory(ctxt, "allocating counter");
1309 ctxt->maxCounters /= 2;
1310 return(-1);
1311 }
1312 ctxt->counters = tmp;
1313 }
1314 ctxt->counters[ctxt->nbCounters].min = -1;
1315 ctxt->counters[ctxt->nbCounters].max = -1;
1316 return(ctxt->nbCounters++);
1317 }
1318
1319 static int
xmlRegAtomPush(xmlRegParserCtxtPtr ctxt,xmlRegAtomPtr atom)1320 xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
1321 if (atom == NULL) {
1322 ERROR("atom push: atom is NULL");
1323 return(-1);
1324 }
1325 if (ctxt->maxAtoms == 0) {
1326 ctxt->maxAtoms = 4;
1327 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms *
1328 sizeof(xmlRegAtomPtr));
1329 if (ctxt->atoms == NULL) {
1330 xmlRegexpErrMemory(ctxt, "pushing atom");
1331 ctxt->maxAtoms = 0;
1332 return(-1);
1333 }
1334 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) {
1335 xmlRegAtomPtr *tmp;
1336 ctxt->maxAtoms *= 2;
1337 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms *
1338 sizeof(xmlRegAtomPtr));
1339 if (tmp == NULL) {
1340 xmlRegexpErrMemory(ctxt, "allocating counter");
1341 ctxt->maxAtoms /= 2;
1342 return(-1);
1343 }
1344 ctxt->atoms = tmp;
1345 }
1346 atom->no = ctxt->nbAtoms;
1347 ctxt->atoms[ctxt->nbAtoms++] = atom;
1348 return(0);
1349 }
1350
1351 static void
xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr target,int from)1352 xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target,
1353 int from) {
1354 if (target->maxTransTo == 0) {
1355 target->maxTransTo = 8;
1356 target->transTo = (int *) xmlMalloc(target->maxTransTo *
1357 sizeof(int));
1358 if (target->transTo == NULL) {
1359 xmlRegexpErrMemory(ctxt, "adding transition");
1360 target->maxTransTo = 0;
1361 return;
1362 }
1363 } else if (target->nbTransTo >= target->maxTransTo) {
1364 int *tmp;
1365 target->maxTransTo *= 2;
1366 tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo *
1367 sizeof(int));
1368 if (tmp == NULL) {
1369 xmlRegexpErrMemory(ctxt, "adding transition");
1370 target->maxTransTo /= 2;
1371 return;
1372 }
1373 target->transTo = tmp;
1374 }
1375 target->transTo[target->nbTransTo] = from;
1376 target->nbTransTo++;
1377 }
1378
1379 static void
xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr state,xmlRegAtomPtr atom,xmlRegStatePtr target,int counter,int count)1380 xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1381 xmlRegAtomPtr atom, xmlRegStatePtr target,
1382 int counter, int count) {
1383
1384 int nrtrans;
1385
1386 if (state == NULL) {
1387 ERROR("add state: state is NULL");
1388 return;
1389 }
1390 if (target == NULL) {
1391 ERROR("add state: target is NULL");
1392 return;
1393 }
1394 /*
1395 * Other routines follow the philosophy 'When in doubt, add a transition'
1396 * so we check here whether such a transition is already present and, if
1397 * so, silently ignore this request.
1398 */
1399
1400 for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) {
1401 xmlRegTransPtr trans = &(state->trans[nrtrans]);
1402 if ((trans->atom == atom) &&
1403 (trans->to == target->no) &&
1404 (trans->counter == counter) &&
1405 (trans->count == count)) {
1406 #ifdef DEBUG_REGEXP_GRAPH
1407 printf("Ignoring duplicate transition from %d to %d\n",
1408 state->no, target->no);
1409 #endif
1410 return;
1411 }
1412 }
1413
1414 if (state->maxTrans == 0) {
1415 state->maxTrans = 8;
1416 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
1417 sizeof(xmlRegTrans));
1418 if (state->trans == NULL) {
1419 xmlRegexpErrMemory(ctxt, "adding transition");
1420 state->maxTrans = 0;
1421 return;
1422 }
1423 } else if (state->nbTrans >= state->maxTrans) {
1424 xmlRegTrans *tmp;
1425 state->maxTrans *= 2;
1426 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans *
1427 sizeof(xmlRegTrans));
1428 if (tmp == NULL) {
1429 xmlRegexpErrMemory(ctxt, "adding transition");
1430 state->maxTrans /= 2;
1431 return;
1432 }
1433 state->trans = tmp;
1434 }
1435 #ifdef DEBUG_REGEXP_GRAPH
1436 printf("Add trans from %d to %d ", state->no, target->no);
1437 if (count == REGEXP_ALL_COUNTER)
1438 printf("all transition\n");
1439 else if (count >= 0)
1440 printf("count based %d\n", count);
1441 else if (counter >= 0)
1442 printf("counted %d\n", counter);
1443 else if (atom == NULL)
1444 printf("epsilon transition\n");
1445 else if (atom != NULL)
1446 xmlRegPrintAtom(stdout, atom);
1447 #endif
1448
1449 state->trans[state->nbTrans].atom = atom;
1450 state->trans[state->nbTrans].to = target->no;
1451 state->trans[state->nbTrans].counter = counter;
1452 state->trans[state->nbTrans].count = count;
1453 state->trans[state->nbTrans].nd = 0;
1454 state->nbTrans++;
1455 xmlRegStateAddTransTo(ctxt, target, state->no);
1456 }
1457
1458 static int
xmlRegStatePush(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr state)1459 xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
1460 if (state == NULL) return(-1);
1461 if (ctxt->maxStates == 0) {
1462 ctxt->maxStates = 4;
1463 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates *
1464 sizeof(xmlRegStatePtr));
1465 if (ctxt->states == NULL) {
1466 xmlRegexpErrMemory(ctxt, "adding state");
1467 ctxt->maxStates = 0;
1468 return(-1);
1469 }
1470 } else if (ctxt->nbStates >= ctxt->maxStates) {
1471 xmlRegStatePtr *tmp;
1472 ctxt->maxStates *= 2;
1473 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates *
1474 sizeof(xmlRegStatePtr));
1475 if (tmp == NULL) {
1476 xmlRegexpErrMemory(ctxt, "adding state");
1477 ctxt->maxStates /= 2;
1478 return(-1);
1479 }
1480 ctxt->states = tmp;
1481 }
1482 state->no = ctxt->nbStates;
1483 ctxt->states[ctxt->nbStates++] = state;
1484 return(0);
1485 }
1486
1487 /**
1488 * xmlFAGenerateAllTransition:
1489 * @ctxt: a regexp parser context
1490 * @from: the from state
1491 * @to: the target state or NULL for building a new one
1492 * @lax:
1493 *
1494 */
1495 static void
xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,int lax)1496 xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
1497 xmlRegStatePtr from, xmlRegStatePtr to,
1498 int lax) {
1499 if (to == NULL) {
1500 to = xmlRegNewState(ctxt);
1501 xmlRegStatePush(ctxt, to);
1502 ctxt->state = to;
1503 }
1504 if (lax)
1505 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
1506 else
1507 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
1508 }
1509
1510 /**
1511 * xmlFAGenerateEpsilonTransition:
1512 * @ctxt: a regexp parser context
1513 * @from: the from state
1514 * @to: the target state or NULL for building a new one
1515 *
1516 */
1517 static void
xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to)1518 xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1519 xmlRegStatePtr from, xmlRegStatePtr to) {
1520 if (to == NULL) {
1521 to = xmlRegNewState(ctxt);
1522 xmlRegStatePush(ctxt, to);
1523 ctxt->state = to;
1524 }
1525 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
1526 }
1527
1528 /**
1529 * xmlFAGenerateCountedEpsilonTransition:
1530 * @ctxt: a regexp parser context
1531 * @from: the from state
1532 * @to: the target state or NULL for building a new one
1533 * counter: the counter for that transition
1534 *
1535 */
1536 static void
xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,int counter)1537 xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1538 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1539 if (to == NULL) {
1540 to = xmlRegNewState(ctxt);
1541 xmlRegStatePush(ctxt, to);
1542 ctxt->state = to;
1543 }
1544 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
1545 }
1546
1547 /**
1548 * xmlFAGenerateCountedTransition:
1549 * @ctxt: a regexp parser context
1550 * @from: the from state
1551 * @to: the target state or NULL for building a new one
1552 * counter: the counter for that transition
1553 *
1554 */
1555 static void
xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,int counter)1556 xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
1557 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1558 if (to == NULL) {
1559 to = xmlRegNewState(ctxt);
1560 xmlRegStatePush(ctxt, to);
1561 ctxt->state = to;
1562 }
1563 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
1564 }
1565
1566 /**
1567 * xmlFAGenerateTransitions:
1568 * @ctxt: a regexp parser context
1569 * @from: the from state
1570 * @to: the target state or NULL for building a new one
1571 * @atom: the atom generating the transition
1572 *
1573 * Returns 0 if success and -1 in case of error.
1574 */
1575 static int
xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,xmlRegAtomPtr atom)1576 xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
1577 xmlRegStatePtr to, xmlRegAtomPtr atom) {
1578 xmlRegStatePtr end;
1579 int nullable = 0;
1580
1581 if (atom == NULL) {
1582 ERROR("generate transition: atom == NULL");
1583 return(-1);
1584 }
1585 if (atom->type == XML_REGEXP_SUBREG) {
1586 /*
1587 * this is a subexpression handling one should not need to
1588 * create a new node except for XML_REGEXP_QUANT_RANGE.
1589 */
1590 if (xmlRegAtomPush(ctxt, atom) < 0) {
1591 return(-1);
1592 }
1593 if ((to != NULL) && (atom->stop != to) &&
1594 (atom->quant != XML_REGEXP_QUANT_RANGE)) {
1595 /*
1596 * Generate an epsilon transition to link to the target
1597 */
1598 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1599 #ifdef DV
1600 } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) &&
1601 (atom->quant != XML_REGEXP_QUANT_ONCE)) {
1602 to = xmlRegNewState(ctxt);
1603 xmlRegStatePush(ctxt, to);
1604 ctxt->state = to;
1605 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1606 #endif
1607 }
1608 switch (atom->quant) {
1609 case XML_REGEXP_QUANT_OPT:
1610 atom->quant = XML_REGEXP_QUANT_ONCE;
1611 /*
1612 * transition done to the state after end of atom.
1613 * 1. set transition from atom start to new state
1614 * 2. set transition from atom end to this state.
1615 */
1616 if (to == NULL) {
1617 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0);
1618 xmlFAGenerateEpsilonTransition(ctxt, atom->stop,
1619 ctxt->state);
1620 } else {
1621 xmlFAGenerateEpsilonTransition(ctxt, atom->start, to);
1622 }
1623 break;
1624 case XML_REGEXP_QUANT_MULT:
1625 atom->quant = XML_REGEXP_QUANT_ONCE;
1626 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1627 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1628 break;
1629 case XML_REGEXP_QUANT_PLUS:
1630 atom->quant = XML_REGEXP_QUANT_ONCE;
1631 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1632 break;
1633 case XML_REGEXP_QUANT_RANGE: {
1634 int counter;
1635 xmlRegStatePtr inter, newstate;
1636
1637 /*
1638 * create the final state now if needed
1639 */
1640 if (to != NULL) {
1641 newstate = to;
1642 } else {
1643 newstate = xmlRegNewState(ctxt);
1644 xmlRegStatePush(ctxt, newstate);
1645 }
1646
1647 /*
1648 * The principle here is to use counted transition
1649 * to avoid explosion in the number of states in the
1650 * graph. This is clearly more complex but should not
1651 * be exploitable at runtime.
1652 */
1653 if ((atom->min == 0) && (atom->start0 == NULL)) {
1654 xmlRegAtomPtr copy;
1655 /*
1656 * duplicate a transition based on atom to count next
1657 * occurrences after 1. We cannot loop to atom->start
1658 * directly because we need an epsilon transition to
1659 * newstate.
1660 */
1661 /* ???? For some reason it seems we never reach that
1662 case, I suppose this got optimized out before when
1663 building the automata */
1664 copy = xmlRegCopyAtom(ctxt, atom);
1665 if (copy == NULL)
1666 return(-1);
1667 copy->quant = XML_REGEXP_QUANT_ONCE;
1668 copy->min = 0;
1669 copy->max = 0;
1670
1671 if (xmlFAGenerateTransitions(ctxt, atom->start, NULL, copy)
1672 < 0)
1673 return(-1);
1674 inter = ctxt->state;
1675 counter = xmlRegGetCounter(ctxt);
1676 ctxt->counters[counter].min = atom->min - 1;
1677 ctxt->counters[counter].max = atom->max - 1;
1678 /* count the number of times we see it again */
1679 xmlFAGenerateCountedEpsilonTransition(ctxt, inter,
1680 atom->stop, counter);
1681 /* allow a way out based on the count */
1682 xmlFAGenerateCountedTransition(ctxt, inter,
1683 newstate, counter);
1684 /* and also allow a direct exit for 0 */
1685 xmlFAGenerateEpsilonTransition(ctxt, atom->start,
1686 newstate);
1687 } else {
1688 /*
1689 * either we need the atom at least once or there
1690 * is an atom->start0 allowing to easily plug the
1691 * epsilon transition.
1692 */
1693 counter = xmlRegGetCounter(ctxt);
1694 ctxt->counters[counter].min = atom->min - 1;
1695 ctxt->counters[counter].max = atom->max - 1;
1696 /* count the number of times we see it again */
1697 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
1698 atom->start, counter);
1699 /* allow a way out based on the count */
1700 xmlFAGenerateCountedTransition(ctxt, atom->stop,
1701 newstate, counter);
1702 /* and if needed allow a direct exit for 0 */
1703 if (atom->min == 0)
1704 xmlFAGenerateEpsilonTransition(ctxt, atom->start0,
1705 newstate);
1706
1707 }
1708 atom->min = 0;
1709 atom->max = 0;
1710 atom->quant = XML_REGEXP_QUANT_ONCE;
1711 ctxt->state = newstate;
1712 }
1713 default:
1714 break;
1715 }
1716 return(0);
1717 }
1718 if ((atom->min == 0) && (atom->max == 0) &&
1719 (atom->quant == XML_REGEXP_QUANT_RANGE)) {
1720 /*
1721 * we can discard the atom and generate an epsilon transition instead
1722 */
1723 if (to == NULL) {
1724 to = xmlRegNewState(ctxt);
1725 if (to != NULL)
1726 xmlRegStatePush(ctxt, to);
1727 else {
1728 return(-1);
1729 }
1730 }
1731 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1732 ctxt->state = to;
1733 xmlRegFreeAtom(atom);
1734 return(0);
1735 }
1736 if (to == NULL) {
1737 to = xmlRegNewState(ctxt);
1738 if (to != NULL)
1739 xmlRegStatePush(ctxt, to);
1740 else {
1741 return(-1);
1742 }
1743 }
1744 end = to;
1745 if ((atom->quant == XML_REGEXP_QUANT_MULT) ||
1746 (atom->quant == XML_REGEXP_QUANT_PLUS)) {
1747 /*
1748 * Do not pollute the target state by adding transitions from
1749 * it as it is likely to be the shared target of multiple branches.
1750 * So isolate with an epsilon transition.
1751 */
1752 xmlRegStatePtr tmp;
1753
1754 tmp = xmlRegNewState(ctxt);
1755 if (tmp != NULL)
1756 xmlRegStatePush(ctxt, tmp);
1757 else {
1758 return(-1);
1759 }
1760 xmlFAGenerateEpsilonTransition(ctxt, tmp, to);
1761 to = tmp;
1762 }
1763 if (xmlRegAtomPush(ctxt, atom) < 0) {
1764 return(-1);
1765 }
1766 if ((atom->quant == XML_REGEXP_QUANT_RANGE) &&
1767 (atom->min == 0) && (atom->max > 0)) {
1768 nullable = 1;
1769 atom->min = 1;
1770 if (atom->max == 1)
1771 atom->quant = XML_REGEXP_QUANT_OPT;
1772 }
1773 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
1774 ctxt->state = end;
1775 switch (atom->quant) {
1776 case XML_REGEXP_QUANT_OPT:
1777 atom->quant = XML_REGEXP_QUANT_ONCE;
1778 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1779 break;
1780 case XML_REGEXP_QUANT_MULT:
1781 atom->quant = XML_REGEXP_QUANT_ONCE;
1782 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1783 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1784 break;
1785 case XML_REGEXP_QUANT_PLUS:
1786 atom->quant = XML_REGEXP_QUANT_ONCE;
1787 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1788 break;
1789 case XML_REGEXP_QUANT_RANGE:
1790 if (nullable)
1791 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1792 break;
1793 default:
1794 break;
1795 }
1796 return(0);
1797 }
1798
1799 /**
1800 * xmlFAReduceEpsilonTransitions:
1801 * @ctxt: a regexp parser context
1802 * @fromnr: the from state
1803 * @tonr: the to state
1804 * @counter: should that transition be associated to a counted
1805 *
1806 */
1807 static void
xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt,int fromnr,int tonr,int counter)1808 xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
1809 int tonr, int counter) {
1810 int transnr;
1811 xmlRegStatePtr from;
1812 xmlRegStatePtr to;
1813
1814 #ifdef DEBUG_REGEXP_GRAPH
1815 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr);
1816 #endif
1817 from = ctxt->states[fromnr];
1818 if (from == NULL)
1819 return;
1820 to = ctxt->states[tonr];
1821 if (to == NULL)
1822 return;
1823 if ((to->mark == XML_REGEXP_MARK_START) ||
1824 (to->mark == XML_REGEXP_MARK_VISITED))
1825 return;
1826
1827 to->mark = XML_REGEXP_MARK_VISITED;
1828 if (to->type == XML_REGEXP_FINAL_STATE) {
1829 #ifdef DEBUG_REGEXP_GRAPH
1830 printf("State %d is final, so %d becomes final\n", tonr, fromnr);
1831 #endif
1832 from->type = XML_REGEXP_FINAL_STATE;
1833 }
1834 for (transnr = 0;transnr < to->nbTrans;transnr++) {
1835 if (to->trans[transnr].to < 0)
1836 continue;
1837 if (to->trans[transnr].atom == NULL) {
1838 /*
1839 * Don't remove counted transitions
1840 * Don't loop either
1841 */
1842 if (to->trans[transnr].to != fromnr) {
1843 if (to->trans[transnr].count >= 0) {
1844 int newto = to->trans[transnr].to;
1845
1846 xmlRegStateAddTrans(ctxt, from, NULL,
1847 ctxt->states[newto],
1848 -1, to->trans[transnr].count);
1849 } else {
1850 #ifdef DEBUG_REGEXP_GRAPH
1851 printf("Found epsilon trans %d from %d to %d\n",
1852 transnr, tonr, to->trans[transnr].to);
1853 #endif
1854 if (to->trans[transnr].counter >= 0) {
1855 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1856 to->trans[transnr].to,
1857 to->trans[transnr].counter);
1858 } else {
1859 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1860 to->trans[transnr].to,
1861 counter);
1862 }
1863 }
1864 }
1865 } else {
1866 int newto = to->trans[transnr].to;
1867
1868 if (to->trans[transnr].counter >= 0) {
1869 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1870 ctxt->states[newto],
1871 to->trans[transnr].counter, -1);
1872 } else {
1873 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1874 ctxt->states[newto], counter, -1);
1875 }
1876 }
1877 }
1878 to->mark = XML_REGEXP_MARK_NORMAL;
1879 }
1880
1881 /**
1882 * xmlFAEliminateSimpleEpsilonTransitions:
1883 * @ctxt: a regexp parser context
1884 *
1885 * Eliminating general epsilon transitions can get costly in the general
1886 * algorithm due to the large amount of generated new transitions and
1887 * associated comparisons. However for simple epsilon transition used just
1888 * to separate building blocks when generating the automata this can be
1889 * reduced to state elimination:
1890 * - if there exists an epsilon from X to Y
1891 * - if there is no other transition from X
1892 * then X and Y are semantically equivalent and X can be eliminated
1893 * If X is the start state then make Y the start state, else replace the
1894 * target of all transitions to X by transitions to Y.
1895 */
1896 static void
xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt)1897 xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1898 int statenr, i, j, newto;
1899 xmlRegStatePtr state, tmp;
1900
1901 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1902 state = ctxt->states[statenr];
1903 if (state == NULL)
1904 continue;
1905 if (state->nbTrans != 1)
1906 continue;
1907 if (state->type == XML_REGEXP_UNREACH_STATE)
1908 continue;
1909 /* is the only transition out a basic transition */
1910 if ((state->trans[0].atom == NULL) &&
1911 (state->trans[0].to >= 0) &&
1912 (state->trans[0].to != statenr) &&
1913 (state->trans[0].counter < 0) &&
1914 (state->trans[0].count < 0)) {
1915 newto = state->trans[0].to;
1916
1917 if (state->type == XML_REGEXP_START_STATE) {
1918 #ifdef DEBUG_REGEXP_GRAPH
1919 printf("Found simple epsilon trans from start %d to %d\n",
1920 statenr, newto);
1921 #endif
1922 } else {
1923 #ifdef DEBUG_REGEXP_GRAPH
1924 printf("Found simple epsilon trans from %d to %d\n",
1925 statenr, newto);
1926 #endif
1927 for (i = 0;i < state->nbTransTo;i++) {
1928 tmp = ctxt->states[state->transTo[i]];
1929 for (j = 0;j < tmp->nbTrans;j++) {
1930 if (tmp->trans[j].to == statenr) {
1931 #ifdef DEBUG_REGEXP_GRAPH
1932 printf("Changed transition %d on %d to go to %d\n",
1933 j, tmp->no, newto);
1934 #endif
1935 tmp->trans[j].to = -1;
1936 xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom,
1937 ctxt->states[newto],
1938 tmp->trans[j].counter,
1939 tmp->trans[j].count);
1940 }
1941 }
1942 }
1943 if (state->type == XML_REGEXP_FINAL_STATE)
1944 ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE;
1945 /* eliminate the transition completely */
1946 state->nbTrans = 0;
1947
1948 state->type = XML_REGEXP_UNREACH_STATE;
1949
1950 }
1951
1952 }
1953 }
1954 }
1955 /**
1956 * xmlFAEliminateEpsilonTransitions:
1957 * @ctxt: a regexp parser context
1958 *
1959 */
1960 static void
xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt)1961 xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1962 int statenr, transnr;
1963 xmlRegStatePtr state;
1964 int has_epsilon;
1965
1966 if (ctxt->states == NULL) return;
1967
1968 /*
1969 * Eliminate simple epsilon transition and the associated unreachable
1970 * states.
1971 */
1972 xmlFAEliminateSimpleEpsilonTransitions(ctxt);
1973 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1974 state = ctxt->states[statenr];
1975 if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) {
1976 #ifdef DEBUG_REGEXP_GRAPH
1977 printf("Removed unreachable state %d\n", statenr);
1978 #endif
1979 xmlRegFreeState(state);
1980 ctxt->states[statenr] = NULL;
1981 }
1982 }
1983
1984 has_epsilon = 0;
1985
1986 /*
1987 * Build the completed transitions bypassing the epsilons
1988 * Use a marking algorithm to avoid loops
1989 * Mark sink states too.
1990 * Process from the latest states backward to the start when
1991 * there is long cascading epsilon chains this minimize the
1992 * recursions and transition compares when adding the new ones
1993 */
1994 for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) {
1995 state = ctxt->states[statenr];
1996 if (state == NULL)
1997 continue;
1998 if ((state->nbTrans == 0) &&
1999 (state->type != XML_REGEXP_FINAL_STATE)) {
2000 state->type = XML_REGEXP_SINK_STATE;
2001 }
2002 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2003 if ((state->trans[transnr].atom == NULL) &&
2004 (state->trans[transnr].to >= 0)) {
2005 if (state->trans[transnr].to == statenr) {
2006 state->trans[transnr].to = -1;
2007 #ifdef DEBUG_REGEXP_GRAPH
2008 printf("Removed loopback epsilon trans %d on %d\n",
2009 transnr, statenr);
2010 #endif
2011 } else if (state->trans[transnr].count < 0) {
2012 int newto = state->trans[transnr].to;
2013
2014 #ifdef DEBUG_REGEXP_GRAPH
2015 printf("Found epsilon trans %d from %d to %d\n",
2016 transnr, statenr, newto);
2017 #endif
2018 has_epsilon = 1;
2019 state->trans[transnr].to = -2;
2020 state->mark = XML_REGEXP_MARK_START;
2021 xmlFAReduceEpsilonTransitions(ctxt, statenr,
2022 newto, state->trans[transnr].counter);
2023 state->mark = XML_REGEXP_MARK_NORMAL;
2024 #ifdef DEBUG_REGEXP_GRAPH
2025 } else {
2026 printf("Found counted transition %d on %d\n",
2027 transnr, statenr);
2028 #endif
2029 }
2030 }
2031 }
2032 }
2033 /*
2034 * Eliminate the epsilon transitions
2035 */
2036 if (has_epsilon) {
2037 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2038 state = ctxt->states[statenr];
2039 if (state == NULL)
2040 continue;
2041 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2042 xmlRegTransPtr trans = &(state->trans[transnr]);
2043 if ((trans->atom == NULL) &&
2044 (trans->count < 0) &&
2045 (trans->to >= 0)) {
2046 trans->to = -1;
2047 }
2048 }
2049 }
2050 }
2051
2052 /*
2053 * Use this pass to detect unreachable states too
2054 */
2055 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2056 state = ctxt->states[statenr];
2057 if (state != NULL)
2058 state->reached = XML_REGEXP_MARK_NORMAL;
2059 }
2060 state = ctxt->states[0];
2061 if (state != NULL)
2062 state->reached = XML_REGEXP_MARK_START;
2063 while (state != NULL) {
2064 xmlRegStatePtr target = NULL;
2065 state->reached = XML_REGEXP_MARK_VISITED;
2066 /*
2067 * Mark all states reachable from the current reachable state
2068 */
2069 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2070 if ((state->trans[transnr].to >= 0) &&
2071 ((state->trans[transnr].atom != NULL) ||
2072 (state->trans[transnr].count >= 0))) {
2073 int newto = state->trans[transnr].to;
2074
2075 if (ctxt->states[newto] == NULL)
2076 continue;
2077 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) {
2078 ctxt->states[newto]->reached = XML_REGEXP_MARK_START;
2079 target = ctxt->states[newto];
2080 }
2081 }
2082 }
2083
2084 /*
2085 * find the next accessible state not explored
2086 */
2087 if (target == NULL) {
2088 for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
2089 state = ctxt->states[statenr];
2090 if ((state != NULL) && (state->reached ==
2091 XML_REGEXP_MARK_START)) {
2092 target = state;
2093 break;
2094 }
2095 }
2096 }
2097 state = target;
2098 }
2099 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2100 state = ctxt->states[statenr];
2101 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) {
2102 #ifdef DEBUG_REGEXP_GRAPH
2103 printf("Removed unreachable state %d\n", statenr);
2104 #endif
2105 xmlRegFreeState(state);
2106 ctxt->states[statenr] = NULL;
2107 }
2108 }
2109
2110 }
2111
2112 static int
xmlFACompareRanges(xmlRegRangePtr range1,xmlRegRangePtr range2)2113 xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) {
2114 int ret = 0;
2115
2116 if ((range1->type == XML_REGEXP_RANGES) ||
2117 (range2->type == XML_REGEXP_RANGES) ||
2118 (range2->type == XML_REGEXP_SUBREG) ||
2119 (range1->type == XML_REGEXP_SUBREG) ||
2120 (range1->type == XML_REGEXP_STRING) ||
2121 (range2->type == XML_REGEXP_STRING))
2122 return(-1);
2123
2124 /* put them in order */
2125 if (range1->type > range2->type) {
2126 xmlRegRangePtr tmp;
2127
2128 tmp = range1;
2129 range1 = range2;
2130 range2 = tmp;
2131 }
2132 if ((range1->type == XML_REGEXP_ANYCHAR) ||
2133 (range2->type == XML_REGEXP_ANYCHAR)) {
2134 ret = 1;
2135 } else if ((range1->type == XML_REGEXP_EPSILON) ||
2136 (range2->type == XML_REGEXP_EPSILON)) {
2137 return(0);
2138 } else if (range1->type == range2->type) {
2139 if (range1->type != XML_REGEXP_CHARVAL)
2140 ret = 1;
2141 else if ((range1->end < range2->start) ||
2142 (range2->end < range1->start))
2143 ret = 0;
2144 else
2145 ret = 1;
2146 } else if (range1->type == XML_REGEXP_CHARVAL) {
2147 int codepoint;
2148 int neg = 0;
2149
2150 /*
2151 * just check all codepoints in the range for acceptance,
2152 * this is usually way cheaper since done only once at
2153 * compilation than testing over and over at runtime or
2154 * pushing too many states when evaluating.
2155 */
2156 if (((range1->neg == 0) && (range2->neg != 0)) ||
2157 ((range1->neg != 0) && (range2->neg == 0)))
2158 neg = 1;
2159
2160 for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) {
2161 ret = xmlRegCheckCharacterRange(range2->type, codepoint,
2162 0, range2->start, range2->end,
2163 range2->blockName);
2164 if (ret < 0)
2165 return(-1);
2166 if (((neg == 1) && (ret == 0)) ||
2167 ((neg == 0) && (ret == 1)))
2168 return(1);
2169 }
2170 return(0);
2171 } else if ((range1->type == XML_REGEXP_BLOCK_NAME) ||
2172 (range2->type == XML_REGEXP_BLOCK_NAME)) {
2173 if (range1->type == range2->type) {
2174 ret = xmlStrEqual(range1->blockName, range2->blockName);
2175 } else {
2176 /*
2177 * comparing a block range with anything else is way
2178 * too costly, and maintaining the table is like too much
2179 * memory too, so let's force the automata to save state
2180 * here.
2181 */
2182 return(1);
2183 }
2184 } else if ((range1->type < XML_REGEXP_LETTER) ||
2185 (range2->type < XML_REGEXP_LETTER)) {
2186 if ((range1->type == XML_REGEXP_ANYSPACE) &&
2187 (range2->type == XML_REGEXP_NOTSPACE))
2188 ret = 0;
2189 else if ((range1->type == XML_REGEXP_INITNAME) &&
2190 (range2->type == XML_REGEXP_NOTINITNAME))
2191 ret = 0;
2192 else if ((range1->type == XML_REGEXP_NAMECHAR) &&
2193 (range2->type == XML_REGEXP_NOTNAMECHAR))
2194 ret = 0;
2195 else if ((range1->type == XML_REGEXP_DECIMAL) &&
2196 (range2->type == XML_REGEXP_NOTDECIMAL))
2197 ret = 0;
2198 else if ((range1->type == XML_REGEXP_REALCHAR) &&
2199 (range2->type == XML_REGEXP_NOTREALCHAR))
2200 ret = 0;
2201 else {
2202 /* same thing to limit complexity */
2203 return(1);
2204 }
2205 } else {
2206 ret = 0;
2207 /* range1->type < range2->type here */
2208 switch (range1->type) {
2209 case XML_REGEXP_LETTER:
2210 /* all disjoint except in the subgroups */
2211 if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) ||
2212 (range2->type == XML_REGEXP_LETTER_LOWERCASE) ||
2213 (range2->type == XML_REGEXP_LETTER_TITLECASE) ||
2214 (range2->type == XML_REGEXP_LETTER_MODIFIER) ||
2215 (range2->type == XML_REGEXP_LETTER_OTHERS))
2216 ret = 1;
2217 break;
2218 case XML_REGEXP_MARK:
2219 if ((range2->type == XML_REGEXP_MARK_NONSPACING) ||
2220 (range2->type == XML_REGEXP_MARK_SPACECOMBINING) ||
2221 (range2->type == XML_REGEXP_MARK_ENCLOSING))
2222 ret = 1;
2223 break;
2224 case XML_REGEXP_NUMBER:
2225 if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) ||
2226 (range2->type == XML_REGEXP_NUMBER_LETTER) ||
2227 (range2->type == XML_REGEXP_NUMBER_OTHERS))
2228 ret = 1;
2229 break;
2230 case XML_REGEXP_PUNCT:
2231 if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) ||
2232 (range2->type == XML_REGEXP_PUNCT_DASH) ||
2233 (range2->type == XML_REGEXP_PUNCT_OPEN) ||
2234 (range2->type == XML_REGEXP_PUNCT_CLOSE) ||
2235 (range2->type == XML_REGEXP_PUNCT_INITQUOTE) ||
2236 (range2->type == XML_REGEXP_PUNCT_FINQUOTE) ||
2237 (range2->type == XML_REGEXP_PUNCT_OTHERS))
2238 ret = 1;
2239 break;
2240 case XML_REGEXP_SEPAR:
2241 if ((range2->type == XML_REGEXP_SEPAR_SPACE) ||
2242 (range2->type == XML_REGEXP_SEPAR_LINE) ||
2243 (range2->type == XML_REGEXP_SEPAR_PARA))
2244 ret = 1;
2245 break;
2246 case XML_REGEXP_SYMBOL:
2247 if ((range2->type == XML_REGEXP_SYMBOL_MATH) ||
2248 (range2->type == XML_REGEXP_SYMBOL_CURRENCY) ||
2249 (range2->type == XML_REGEXP_SYMBOL_MODIFIER) ||
2250 (range2->type == XML_REGEXP_SYMBOL_OTHERS))
2251 ret = 1;
2252 break;
2253 case XML_REGEXP_OTHER:
2254 if ((range2->type == XML_REGEXP_OTHER_CONTROL) ||
2255 (range2->type == XML_REGEXP_OTHER_FORMAT) ||
2256 (range2->type == XML_REGEXP_OTHER_PRIVATE))
2257 ret = 1;
2258 break;
2259 default:
2260 if ((range2->type >= XML_REGEXP_LETTER) &&
2261 (range2->type < XML_REGEXP_BLOCK_NAME))
2262 ret = 0;
2263 else {
2264 /* safety net ! */
2265 return(1);
2266 }
2267 }
2268 }
2269 if (((range1->neg == 0) && (range2->neg != 0)) ||
2270 ((range1->neg != 0) && (range2->neg == 0)))
2271 ret = !ret;
2272 return(ret);
2273 }
2274
2275 /**
2276 * xmlFACompareAtomTypes:
2277 * @type1: an atom type
2278 * @type2: an atom type
2279 *
2280 * Compares two atoms type to check whether they intersect in some ways,
2281 * this is used by xmlFACompareAtoms only
2282 *
2283 * Returns 1 if they may intersect and 0 otherwise
2284 */
2285 static int
xmlFACompareAtomTypes(xmlRegAtomType type1,xmlRegAtomType type2)2286 xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) {
2287 if ((type1 == XML_REGEXP_EPSILON) ||
2288 (type1 == XML_REGEXP_CHARVAL) ||
2289 (type1 == XML_REGEXP_RANGES) ||
2290 (type1 == XML_REGEXP_SUBREG) ||
2291 (type1 == XML_REGEXP_STRING) ||
2292 (type1 == XML_REGEXP_ANYCHAR))
2293 return(1);
2294 if ((type2 == XML_REGEXP_EPSILON) ||
2295 (type2 == XML_REGEXP_CHARVAL) ||
2296 (type2 == XML_REGEXP_RANGES) ||
2297 (type2 == XML_REGEXP_SUBREG) ||
2298 (type2 == XML_REGEXP_STRING) ||
2299 (type2 == XML_REGEXP_ANYCHAR))
2300 return(1);
2301
2302 if (type1 == type2) return(1);
2303
2304 /* simplify subsequent compares by making sure type1 < type2 */
2305 if (type1 > type2) {
2306 xmlRegAtomType tmp = type1;
2307 type1 = type2;
2308 type2 = tmp;
2309 }
2310 switch (type1) {
2311 case XML_REGEXP_ANYSPACE: /* \s */
2312 /* can't be a letter, number, mark, punctuation, symbol */
2313 if ((type2 == XML_REGEXP_NOTSPACE) ||
2314 ((type2 >= XML_REGEXP_LETTER) &&
2315 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2316 ((type2 >= XML_REGEXP_NUMBER) &&
2317 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2318 ((type2 >= XML_REGEXP_MARK) &&
2319 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2320 ((type2 >= XML_REGEXP_PUNCT) &&
2321 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2322 ((type2 >= XML_REGEXP_SYMBOL) &&
2323 (type2 <= XML_REGEXP_SYMBOL_OTHERS))
2324 ) return(0);
2325 break;
2326 case XML_REGEXP_NOTSPACE: /* \S */
2327 break;
2328 case XML_REGEXP_INITNAME: /* \l */
2329 /* can't be a number, mark, separator, punctuation, symbol or other */
2330 if ((type2 == XML_REGEXP_NOTINITNAME) ||
2331 ((type2 >= XML_REGEXP_NUMBER) &&
2332 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2333 ((type2 >= XML_REGEXP_MARK) &&
2334 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2335 ((type2 >= XML_REGEXP_SEPAR) &&
2336 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2337 ((type2 >= XML_REGEXP_PUNCT) &&
2338 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2339 ((type2 >= XML_REGEXP_SYMBOL) &&
2340 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2341 ((type2 >= XML_REGEXP_OTHER) &&
2342 (type2 <= XML_REGEXP_OTHER_NA))
2343 ) return(0);
2344 break;
2345 case XML_REGEXP_NOTINITNAME: /* \L */
2346 break;
2347 case XML_REGEXP_NAMECHAR: /* \c */
2348 /* can't be a mark, separator, punctuation, symbol or other */
2349 if ((type2 == XML_REGEXP_NOTNAMECHAR) ||
2350 ((type2 >= XML_REGEXP_MARK) &&
2351 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2352 ((type2 >= XML_REGEXP_PUNCT) &&
2353 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2354 ((type2 >= XML_REGEXP_SEPAR) &&
2355 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2356 ((type2 >= XML_REGEXP_SYMBOL) &&
2357 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2358 ((type2 >= XML_REGEXP_OTHER) &&
2359 (type2 <= XML_REGEXP_OTHER_NA))
2360 ) return(0);
2361 break;
2362 case XML_REGEXP_NOTNAMECHAR: /* \C */
2363 break;
2364 case XML_REGEXP_DECIMAL: /* \d */
2365 /* can't be a letter, mark, separator, punctuation, symbol or other */
2366 if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2367 (type2 == XML_REGEXP_REALCHAR) ||
2368 ((type2 >= XML_REGEXP_LETTER) &&
2369 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2370 ((type2 >= XML_REGEXP_MARK) &&
2371 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2372 ((type2 >= XML_REGEXP_PUNCT) &&
2373 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2374 ((type2 >= XML_REGEXP_SEPAR) &&
2375 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2376 ((type2 >= XML_REGEXP_SYMBOL) &&
2377 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2378 ((type2 >= XML_REGEXP_OTHER) &&
2379 (type2 <= XML_REGEXP_OTHER_NA))
2380 )return(0);
2381 break;
2382 case XML_REGEXP_NOTDECIMAL: /* \D */
2383 break;
2384 case XML_REGEXP_REALCHAR: /* \w */
2385 /* can't be a mark, separator, punctuation, symbol or other */
2386 if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2387 ((type2 >= XML_REGEXP_MARK) &&
2388 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2389 ((type2 >= XML_REGEXP_PUNCT) &&
2390 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2391 ((type2 >= XML_REGEXP_SEPAR) &&
2392 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2393 ((type2 >= XML_REGEXP_SYMBOL) &&
2394 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2395 ((type2 >= XML_REGEXP_OTHER) &&
2396 (type2 <= XML_REGEXP_OTHER_NA))
2397 )return(0);
2398 break;
2399 case XML_REGEXP_NOTREALCHAR: /* \W */
2400 break;
2401 /*
2402 * at that point we know both type 1 and type2 are from
2403 * character categories are ordered and are different,
2404 * it becomes simple because this is a partition
2405 */
2406 case XML_REGEXP_LETTER:
2407 if (type2 <= XML_REGEXP_LETTER_OTHERS)
2408 return(1);
2409 return(0);
2410 case XML_REGEXP_LETTER_UPPERCASE:
2411 case XML_REGEXP_LETTER_LOWERCASE:
2412 case XML_REGEXP_LETTER_TITLECASE:
2413 case XML_REGEXP_LETTER_MODIFIER:
2414 case XML_REGEXP_LETTER_OTHERS:
2415 return(0);
2416 case XML_REGEXP_MARK:
2417 if (type2 <= XML_REGEXP_MARK_ENCLOSING)
2418 return(1);
2419 return(0);
2420 case XML_REGEXP_MARK_NONSPACING:
2421 case XML_REGEXP_MARK_SPACECOMBINING:
2422 case XML_REGEXP_MARK_ENCLOSING:
2423 return(0);
2424 case XML_REGEXP_NUMBER:
2425 if (type2 <= XML_REGEXP_NUMBER_OTHERS)
2426 return(1);
2427 return(0);
2428 case XML_REGEXP_NUMBER_DECIMAL:
2429 case XML_REGEXP_NUMBER_LETTER:
2430 case XML_REGEXP_NUMBER_OTHERS:
2431 return(0);
2432 case XML_REGEXP_PUNCT:
2433 if (type2 <= XML_REGEXP_PUNCT_OTHERS)
2434 return(1);
2435 return(0);
2436 case XML_REGEXP_PUNCT_CONNECTOR:
2437 case XML_REGEXP_PUNCT_DASH:
2438 case XML_REGEXP_PUNCT_OPEN:
2439 case XML_REGEXP_PUNCT_CLOSE:
2440 case XML_REGEXP_PUNCT_INITQUOTE:
2441 case XML_REGEXP_PUNCT_FINQUOTE:
2442 case XML_REGEXP_PUNCT_OTHERS:
2443 return(0);
2444 case XML_REGEXP_SEPAR:
2445 if (type2 <= XML_REGEXP_SEPAR_PARA)
2446 return(1);
2447 return(0);
2448 case XML_REGEXP_SEPAR_SPACE:
2449 case XML_REGEXP_SEPAR_LINE:
2450 case XML_REGEXP_SEPAR_PARA:
2451 return(0);
2452 case XML_REGEXP_SYMBOL:
2453 if (type2 <= XML_REGEXP_SYMBOL_OTHERS)
2454 return(1);
2455 return(0);
2456 case XML_REGEXP_SYMBOL_MATH:
2457 case XML_REGEXP_SYMBOL_CURRENCY:
2458 case XML_REGEXP_SYMBOL_MODIFIER:
2459 case XML_REGEXP_SYMBOL_OTHERS:
2460 return(0);
2461 case XML_REGEXP_OTHER:
2462 if (type2 <= XML_REGEXP_OTHER_NA)
2463 return(1);
2464 return(0);
2465 case XML_REGEXP_OTHER_CONTROL:
2466 case XML_REGEXP_OTHER_FORMAT:
2467 case XML_REGEXP_OTHER_PRIVATE:
2468 case XML_REGEXP_OTHER_NA:
2469 return(0);
2470 default:
2471 break;
2472 }
2473 return(1);
2474 }
2475
2476 /**
2477 * xmlFAEqualAtoms:
2478 * @atom1: an atom
2479 * @atom2: an atom
2480 * @deep: if not set only compare string pointers
2481 *
2482 * Compares two atoms to check whether they are the same exactly
2483 * this is used to remove equivalent transitions
2484 *
2485 * Returns 1 if same and 0 otherwise
2486 */
2487 static int
xmlFAEqualAtoms(xmlRegAtomPtr atom1,xmlRegAtomPtr atom2,int deep)2488 xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
2489 int ret = 0;
2490
2491 if (atom1 == atom2)
2492 return(1);
2493 if ((atom1 == NULL) || (atom2 == NULL))
2494 return(0);
2495
2496 if (atom1->type != atom2->type)
2497 return(0);
2498 switch (atom1->type) {
2499 case XML_REGEXP_EPSILON:
2500 ret = 0;
2501 break;
2502 case XML_REGEXP_STRING:
2503 if (!deep)
2504 ret = (atom1->valuep == atom2->valuep);
2505 else
2506 ret = xmlStrEqual((xmlChar *)atom1->valuep,
2507 (xmlChar *)atom2->valuep);
2508 break;
2509 case XML_REGEXP_CHARVAL:
2510 ret = (atom1->codepoint == atom2->codepoint);
2511 break;
2512 case XML_REGEXP_RANGES:
2513 /* too hard to do in the general case */
2514 ret = 0;
2515 default:
2516 break;
2517 }
2518 return(ret);
2519 }
2520
2521 /**
2522 * xmlFACompareAtoms:
2523 * @atom1: an atom
2524 * @atom2: an atom
2525 * @deep: if not set only compare string pointers
2526 *
2527 * Compares two atoms to check whether they intersect in some ways,
2528 * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only
2529 *
2530 * Returns 1 if yes and 0 otherwise
2531 */
2532 static int
xmlFACompareAtoms(xmlRegAtomPtr atom1,xmlRegAtomPtr atom2,int deep)2533 xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
2534 int ret = 1;
2535
2536 if (atom1 == atom2)
2537 return(1);
2538 if ((atom1 == NULL) || (atom2 == NULL))
2539 return(0);
2540
2541 if ((atom1->type == XML_REGEXP_ANYCHAR) ||
2542 (atom2->type == XML_REGEXP_ANYCHAR))
2543 return(1);
2544
2545 if (atom1->type > atom2->type) {
2546 xmlRegAtomPtr tmp;
2547 tmp = atom1;
2548 atom1 = atom2;
2549 atom2 = tmp;
2550 }
2551 if (atom1->type != atom2->type) {
2552 ret = xmlFACompareAtomTypes(atom1->type, atom2->type);
2553 /* if they can't intersect at the type level break now */
2554 if (ret == 0)
2555 return(0);
2556 }
2557 switch (atom1->type) {
2558 case XML_REGEXP_STRING:
2559 if (!deep)
2560 ret = (atom1->valuep != atom2->valuep);
2561 else {
2562 xmlChar *val1 = (xmlChar *)atom1->valuep;
2563 xmlChar *val2 = (xmlChar *)atom2->valuep;
2564 int compound1 = (xmlStrchr(val1, '|') != NULL);
2565 int compound2 = (xmlStrchr(val2, '|') != NULL);
2566
2567 /* Ignore negative match flag for ##other namespaces */
2568 if (compound1 != compound2)
2569 return(0);
2570
2571 ret = xmlRegStrEqualWildcard(val1, val2);
2572 }
2573 break;
2574 case XML_REGEXP_EPSILON:
2575 goto not_determinist;
2576 case XML_REGEXP_CHARVAL:
2577 if (atom2->type == XML_REGEXP_CHARVAL) {
2578 ret = (atom1->codepoint == atom2->codepoint);
2579 } else {
2580 ret = xmlRegCheckCharacter(atom2, atom1->codepoint);
2581 if (ret < 0)
2582 ret = 1;
2583 }
2584 break;
2585 case XML_REGEXP_RANGES:
2586 if (atom2->type == XML_REGEXP_RANGES) {
2587 int i, j, res;
2588 xmlRegRangePtr r1, r2;
2589
2590 /*
2591 * need to check that none of the ranges eventually matches
2592 */
2593 for (i = 0;i < atom1->nbRanges;i++) {
2594 for (j = 0;j < atom2->nbRanges;j++) {
2595 r1 = atom1->ranges[i];
2596 r2 = atom2->ranges[j];
2597 res = xmlFACompareRanges(r1, r2);
2598 if (res == 1) {
2599 ret = 1;
2600 goto done;
2601 }
2602 }
2603 }
2604 ret = 0;
2605 }
2606 break;
2607 default:
2608 goto not_determinist;
2609 }
2610 done:
2611 if (atom1->neg != atom2->neg) {
2612 ret = !ret;
2613 }
2614 if (ret == 0)
2615 return(0);
2616 not_determinist:
2617 return(1);
2618 }
2619
2620 /**
2621 * xmlFARecurseDeterminism:
2622 * @ctxt: a regexp parser context
2623 *
2624 * Check whether the associated regexp is determinist,
2625 * should be called after xmlFAEliminateEpsilonTransitions()
2626 *
2627 */
2628 static int
xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr state,int to,xmlRegAtomPtr atom)2629 xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
2630 int to, xmlRegAtomPtr atom) {
2631 int ret = 1;
2632 int res;
2633 int transnr, nbTrans;
2634 xmlRegTransPtr t1;
2635 int deep = 1;
2636
2637 if (state == NULL)
2638 return(ret);
2639 if (state->markd == XML_REGEXP_MARK_VISITED)
2640 return(ret);
2641
2642 if (ctxt->flags & AM_AUTOMATA_RNG)
2643 deep = 0;
2644
2645 /*
2646 * don't recurse on transitions potentially added in the course of
2647 * the elimination.
2648 */
2649 nbTrans = state->nbTrans;
2650 for (transnr = 0;transnr < nbTrans;transnr++) {
2651 t1 = &(state->trans[transnr]);
2652 /*
2653 * check transitions conflicting with the one looked at
2654 */
2655 if (t1->atom == NULL) {
2656 if (t1->to < 0)
2657 continue;
2658 state->markd = XML_REGEXP_MARK_VISITED;
2659 res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2660 to, atom);
2661 if (res == 0) {
2662 ret = 0;
2663 /* t1->nd = 1; */
2664 }
2665 continue;
2666 }
2667 if (t1->to != to)
2668 continue;
2669 if (xmlFACompareAtoms(t1->atom, atom, deep)) {
2670 ret = 0;
2671 /* mark the transition as non-deterministic */
2672 t1->nd = 1;
2673 }
2674 }
2675 return(ret);
2676 }
2677
2678 /**
2679 * xmlFAFinishRecurseDeterminism:
2680 * @ctxt: a regexp parser context
2681 *
2682 * Reset flags after checking determinism.
2683 */
2684 static void
xmlFAFinishRecurseDeterminism(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr state)2685 xmlFAFinishRecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
2686 int transnr, nbTrans;
2687
2688 if (state == NULL)
2689 return;
2690 if (state->markd != XML_REGEXP_MARK_VISITED)
2691 return;
2692 state->markd = 0;
2693
2694 nbTrans = state->nbTrans;
2695 for (transnr = 0; transnr < nbTrans; transnr++) {
2696 xmlRegTransPtr t1 = &state->trans[transnr];
2697 if ((t1->atom == NULL) && (t1->to >= 0))
2698 xmlFAFinishRecurseDeterminism(ctxt, ctxt->states[t1->to]);
2699 }
2700 }
2701
2702 /**
2703 * xmlFAComputesDeterminism:
2704 * @ctxt: a regexp parser context
2705 *
2706 * Check whether the associated regexp is determinist,
2707 * should be called after xmlFAEliminateEpsilonTransitions()
2708 *
2709 */
2710 static int
xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt)2711 xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
2712 int statenr, transnr;
2713 xmlRegStatePtr state;
2714 xmlRegTransPtr t1, t2, last;
2715 int i;
2716 int ret = 1;
2717 int deep = 1;
2718
2719 #ifdef DEBUG_REGEXP_GRAPH
2720 printf("xmlFAComputesDeterminism\n");
2721 xmlRegPrintCtxt(stdout, ctxt);
2722 #endif
2723 if (ctxt->determinist != -1)
2724 return(ctxt->determinist);
2725
2726 if (ctxt->flags & AM_AUTOMATA_RNG)
2727 deep = 0;
2728
2729 /*
2730 * First cleanup the automata removing cancelled transitions
2731 */
2732 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2733 state = ctxt->states[statenr];
2734 if (state == NULL)
2735 continue;
2736 if (state->nbTrans < 2)
2737 continue;
2738 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2739 t1 = &(state->trans[transnr]);
2740 /*
2741 * Determinism checks in case of counted or all transitions
2742 * will have to be handled separately
2743 */
2744 if (t1->atom == NULL) {
2745 /* t1->nd = 1; */
2746 continue;
2747 }
2748 if (t1->to == -1) /* eliminated */
2749 continue;
2750 for (i = 0;i < transnr;i++) {
2751 t2 = &(state->trans[i]);
2752 if (t2->to == -1) /* eliminated */
2753 continue;
2754 if (t2->atom != NULL) {
2755 if (t1->to == t2->to) {
2756 /*
2757 * Here we use deep because we want to keep the
2758 * transitions which indicate a conflict
2759 */
2760 if (xmlFAEqualAtoms(t1->atom, t2->atom, deep) &&
2761 (t1->counter == t2->counter) &&
2762 (t1->count == t2->count))
2763 t2->to = -1; /* eliminated */
2764 }
2765 }
2766 }
2767 }
2768 }
2769
2770 /*
2771 * Check for all states that there aren't 2 transitions
2772 * with the same atom and a different target.
2773 */
2774 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2775 state = ctxt->states[statenr];
2776 if (state == NULL)
2777 continue;
2778 if (state->nbTrans < 2)
2779 continue;
2780 last = NULL;
2781 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2782 t1 = &(state->trans[transnr]);
2783 /*
2784 * Determinism checks in case of counted or all transitions
2785 * will have to be handled separately
2786 */
2787 if (t1->atom == NULL) {
2788 continue;
2789 }
2790 if (t1->to == -1) /* eliminated */
2791 continue;
2792 for (i = 0;i < transnr;i++) {
2793 t2 = &(state->trans[i]);
2794 if (t2->to == -1) /* eliminated */
2795 continue;
2796 if (t2->atom != NULL) {
2797 /*
2798 * But here we don't use deep because we want to
2799 * find transitions which indicate a conflict
2800 */
2801 if (xmlFACompareAtoms(t1->atom, t2->atom, 1)) {
2802 ret = 0;
2803 /* mark the transitions as non-deterministic ones */
2804 t1->nd = 1;
2805 t2->nd = 1;
2806 last = t1;
2807 }
2808 } else if (t1->to != -1) {
2809 /*
2810 * do the closure in case of remaining specific
2811 * epsilon transitions like choices or all
2812 */
2813 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2814 t2->to, t2->atom);
2815 xmlFAFinishRecurseDeterminism(ctxt, ctxt->states[t1->to]);
2816 /* don't shortcut the computation so all non deterministic
2817 transition get marked down
2818 if (ret == 0)
2819 return(0);
2820 */
2821 if (ret == 0) {
2822 t1->nd = 1;
2823 /* t2->nd = 1; */
2824 last = t1;
2825 }
2826 }
2827 }
2828 /* don't shortcut the computation so all non deterministic
2829 transition get marked down
2830 if (ret == 0)
2831 break; */
2832 }
2833
2834 /*
2835 * mark specifically the last non-deterministic transition
2836 * from a state since there is no need to set-up rollback
2837 * from it
2838 */
2839 if (last != NULL) {
2840 last->nd = 2;
2841 }
2842
2843 /* don't shortcut the computation so all non deterministic
2844 transition get marked down
2845 if (ret == 0)
2846 break; */
2847 }
2848
2849 ctxt->determinist = ret;
2850 return(ret);
2851 }
2852
2853 /************************************************************************
2854 * *
2855 * Routines to check input against transition atoms *
2856 * *
2857 ************************************************************************/
2858
2859 static int
xmlRegCheckCharacterRange(xmlRegAtomType type,int codepoint,int neg,int start,int end,const xmlChar * blockName)2860 xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
2861 int start, int end, const xmlChar *blockName) {
2862 int ret = 0;
2863
2864 switch (type) {
2865 case XML_REGEXP_STRING:
2866 case XML_REGEXP_SUBREG:
2867 case XML_REGEXP_RANGES:
2868 case XML_REGEXP_EPSILON:
2869 return(-1);
2870 case XML_REGEXP_ANYCHAR:
2871 ret = ((codepoint != '\n') && (codepoint != '\r'));
2872 break;
2873 case XML_REGEXP_CHARVAL:
2874 ret = ((codepoint >= start) && (codepoint <= end));
2875 break;
2876 case XML_REGEXP_NOTSPACE:
2877 neg = !neg;
2878 /* Falls through. */
2879 case XML_REGEXP_ANYSPACE:
2880 ret = ((codepoint == '\n') || (codepoint == '\r') ||
2881 (codepoint == '\t') || (codepoint == ' '));
2882 break;
2883 case XML_REGEXP_NOTINITNAME:
2884 neg = !neg;
2885 /* Falls through. */
2886 case XML_REGEXP_INITNAME:
2887 ret = (IS_LETTER(codepoint) ||
2888 (codepoint == '_') || (codepoint == ':'));
2889 break;
2890 case XML_REGEXP_NOTNAMECHAR:
2891 neg = !neg;
2892 /* Falls through. */
2893 case XML_REGEXP_NAMECHAR:
2894 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) ||
2895 (codepoint == '.') || (codepoint == '-') ||
2896 (codepoint == '_') || (codepoint == ':') ||
2897 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint));
2898 break;
2899 case XML_REGEXP_NOTDECIMAL:
2900 neg = !neg;
2901 /* Falls through. */
2902 case XML_REGEXP_DECIMAL:
2903 ret = xmlUCSIsCatNd(codepoint);
2904 break;
2905 case XML_REGEXP_REALCHAR:
2906 neg = !neg;
2907 /* Falls through. */
2908 case XML_REGEXP_NOTREALCHAR:
2909 ret = xmlUCSIsCatP(codepoint);
2910 if (ret == 0)
2911 ret = xmlUCSIsCatZ(codepoint);
2912 if (ret == 0)
2913 ret = xmlUCSIsCatC(codepoint);
2914 break;
2915 case XML_REGEXP_LETTER:
2916 ret = xmlUCSIsCatL(codepoint);
2917 break;
2918 case XML_REGEXP_LETTER_UPPERCASE:
2919 ret = xmlUCSIsCatLu(codepoint);
2920 break;
2921 case XML_REGEXP_LETTER_LOWERCASE:
2922 ret = xmlUCSIsCatLl(codepoint);
2923 break;
2924 case XML_REGEXP_LETTER_TITLECASE:
2925 ret = xmlUCSIsCatLt(codepoint);
2926 break;
2927 case XML_REGEXP_LETTER_MODIFIER:
2928 ret = xmlUCSIsCatLm(codepoint);
2929 break;
2930 case XML_REGEXP_LETTER_OTHERS:
2931 ret = xmlUCSIsCatLo(codepoint);
2932 break;
2933 case XML_REGEXP_MARK:
2934 ret = xmlUCSIsCatM(codepoint);
2935 break;
2936 case XML_REGEXP_MARK_NONSPACING:
2937 ret = xmlUCSIsCatMn(codepoint);
2938 break;
2939 case XML_REGEXP_MARK_SPACECOMBINING:
2940 ret = xmlUCSIsCatMc(codepoint);
2941 break;
2942 case XML_REGEXP_MARK_ENCLOSING:
2943 ret = xmlUCSIsCatMe(codepoint);
2944 break;
2945 case XML_REGEXP_NUMBER:
2946 ret = xmlUCSIsCatN(codepoint);
2947 break;
2948 case XML_REGEXP_NUMBER_DECIMAL:
2949 ret = xmlUCSIsCatNd(codepoint);
2950 break;
2951 case XML_REGEXP_NUMBER_LETTER:
2952 ret = xmlUCSIsCatNl(codepoint);
2953 break;
2954 case XML_REGEXP_NUMBER_OTHERS:
2955 ret = xmlUCSIsCatNo(codepoint);
2956 break;
2957 case XML_REGEXP_PUNCT:
2958 ret = xmlUCSIsCatP(codepoint);
2959 break;
2960 case XML_REGEXP_PUNCT_CONNECTOR:
2961 ret = xmlUCSIsCatPc(codepoint);
2962 break;
2963 case XML_REGEXP_PUNCT_DASH:
2964 ret = xmlUCSIsCatPd(codepoint);
2965 break;
2966 case XML_REGEXP_PUNCT_OPEN:
2967 ret = xmlUCSIsCatPs(codepoint);
2968 break;
2969 case XML_REGEXP_PUNCT_CLOSE:
2970 ret = xmlUCSIsCatPe(codepoint);
2971 break;
2972 case XML_REGEXP_PUNCT_INITQUOTE:
2973 ret = xmlUCSIsCatPi(codepoint);
2974 break;
2975 case XML_REGEXP_PUNCT_FINQUOTE:
2976 ret = xmlUCSIsCatPf(codepoint);
2977 break;
2978 case XML_REGEXP_PUNCT_OTHERS:
2979 ret = xmlUCSIsCatPo(codepoint);
2980 break;
2981 case XML_REGEXP_SEPAR:
2982 ret = xmlUCSIsCatZ(codepoint);
2983 break;
2984 case XML_REGEXP_SEPAR_SPACE:
2985 ret = xmlUCSIsCatZs(codepoint);
2986 break;
2987 case XML_REGEXP_SEPAR_LINE:
2988 ret = xmlUCSIsCatZl(codepoint);
2989 break;
2990 case XML_REGEXP_SEPAR_PARA:
2991 ret = xmlUCSIsCatZp(codepoint);
2992 break;
2993 case XML_REGEXP_SYMBOL:
2994 ret = xmlUCSIsCatS(codepoint);
2995 break;
2996 case XML_REGEXP_SYMBOL_MATH:
2997 ret = xmlUCSIsCatSm(codepoint);
2998 break;
2999 case XML_REGEXP_SYMBOL_CURRENCY:
3000 ret = xmlUCSIsCatSc(codepoint);
3001 break;
3002 case XML_REGEXP_SYMBOL_MODIFIER:
3003 ret = xmlUCSIsCatSk(codepoint);
3004 break;
3005 case XML_REGEXP_SYMBOL_OTHERS:
3006 ret = xmlUCSIsCatSo(codepoint);
3007 break;
3008 case XML_REGEXP_OTHER:
3009 ret = xmlUCSIsCatC(codepoint);
3010 break;
3011 case XML_REGEXP_OTHER_CONTROL:
3012 ret = xmlUCSIsCatCc(codepoint);
3013 break;
3014 case XML_REGEXP_OTHER_FORMAT:
3015 ret = xmlUCSIsCatCf(codepoint);
3016 break;
3017 case XML_REGEXP_OTHER_PRIVATE:
3018 ret = xmlUCSIsCatCo(codepoint);
3019 break;
3020 case XML_REGEXP_OTHER_NA:
3021 /* ret = xmlUCSIsCatCn(codepoint); */
3022 /* Seems it doesn't exist anymore in recent Unicode releases */
3023 ret = 0;
3024 break;
3025 case XML_REGEXP_BLOCK_NAME:
3026 ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
3027 break;
3028 }
3029 if (neg)
3030 return(!ret);
3031 return(ret);
3032 }
3033
3034 static int
xmlRegCheckCharacter(xmlRegAtomPtr atom,int codepoint)3035 xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
3036 int i, ret = 0;
3037 xmlRegRangePtr range;
3038
3039 if ((atom == NULL) || (!IS_CHAR(codepoint)))
3040 return(-1);
3041
3042 switch (atom->type) {
3043 case XML_REGEXP_SUBREG:
3044 case XML_REGEXP_EPSILON:
3045 return(-1);
3046 case XML_REGEXP_CHARVAL:
3047 return(codepoint == atom->codepoint);
3048 case XML_REGEXP_RANGES: {
3049 int accept = 0;
3050
3051 for (i = 0;i < atom->nbRanges;i++) {
3052 range = atom->ranges[i];
3053 if (range->neg == 2) {
3054 ret = xmlRegCheckCharacterRange(range->type, codepoint,
3055 0, range->start, range->end,
3056 range->blockName);
3057 if (ret != 0)
3058 return(0); /* excluded char */
3059 } else if (range->neg) {
3060 ret = xmlRegCheckCharacterRange(range->type, codepoint,
3061 0, range->start, range->end,
3062 range->blockName);
3063 if (ret == 0)
3064 accept = 1;
3065 else
3066 return(0);
3067 } else {
3068 ret = xmlRegCheckCharacterRange(range->type, codepoint,
3069 0, range->start, range->end,
3070 range->blockName);
3071 if (ret != 0)
3072 accept = 1; /* might still be excluded */
3073 }
3074 }
3075 return(accept);
3076 }
3077 case XML_REGEXP_STRING:
3078 printf("TODO: XML_REGEXP_STRING\n");
3079 return(-1);
3080 case XML_REGEXP_ANYCHAR:
3081 case XML_REGEXP_ANYSPACE:
3082 case XML_REGEXP_NOTSPACE:
3083 case XML_REGEXP_INITNAME:
3084 case XML_REGEXP_NOTINITNAME:
3085 case XML_REGEXP_NAMECHAR:
3086 case XML_REGEXP_NOTNAMECHAR:
3087 case XML_REGEXP_DECIMAL:
3088 case XML_REGEXP_NOTDECIMAL:
3089 case XML_REGEXP_REALCHAR:
3090 case XML_REGEXP_NOTREALCHAR:
3091 case XML_REGEXP_LETTER:
3092 case XML_REGEXP_LETTER_UPPERCASE:
3093 case XML_REGEXP_LETTER_LOWERCASE:
3094 case XML_REGEXP_LETTER_TITLECASE:
3095 case XML_REGEXP_LETTER_MODIFIER:
3096 case XML_REGEXP_LETTER_OTHERS:
3097 case XML_REGEXP_MARK:
3098 case XML_REGEXP_MARK_NONSPACING:
3099 case XML_REGEXP_MARK_SPACECOMBINING:
3100 case XML_REGEXP_MARK_ENCLOSING:
3101 case XML_REGEXP_NUMBER:
3102 case XML_REGEXP_NUMBER_DECIMAL:
3103 case XML_REGEXP_NUMBER_LETTER:
3104 case XML_REGEXP_NUMBER_OTHERS:
3105 case XML_REGEXP_PUNCT:
3106 case XML_REGEXP_PUNCT_CONNECTOR:
3107 case XML_REGEXP_PUNCT_DASH:
3108 case XML_REGEXP_PUNCT_OPEN:
3109 case XML_REGEXP_PUNCT_CLOSE:
3110 case XML_REGEXP_PUNCT_INITQUOTE:
3111 case XML_REGEXP_PUNCT_FINQUOTE:
3112 case XML_REGEXP_PUNCT_OTHERS:
3113 case XML_REGEXP_SEPAR:
3114 case XML_REGEXP_SEPAR_SPACE:
3115 case XML_REGEXP_SEPAR_LINE:
3116 case XML_REGEXP_SEPAR_PARA:
3117 case XML_REGEXP_SYMBOL:
3118 case XML_REGEXP_SYMBOL_MATH:
3119 case XML_REGEXP_SYMBOL_CURRENCY:
3120 case XML_REGEXP_SYMBOL_MODIFIER:
3121 case XML_REGEXP_SYMBOL_OTHERS:
3122 case XML_REGEXP_OTHER:
3123 case XML_REGEXP_OTHER_CONTROL:
3124 case XML_REGEXP_OTHER_FORMAT:
3125 case XML_REGEXP_OTHER_PRIVATE:
3126 case XML_REGEXP_OTHER_NA:
3127 case XML_REGEXP_BLOCK_NAME:
3128 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
3129 (const xmlChar *)atom->valuep);
3130 if (atom->neg)
3131 ret = !ret;
3132 break;
3133 }
3134 return(ret);
3135 }
3136
3137 /************************************************************************
3138 * *
3139 * Saving and restoring state of an execution context *
3140 * *
3141 ************************************************************************/
3142
3143 #ifdef DEBUG_REGEXP_EXEC
3144 static void
xmlFARegDebugExec(xmlRegExecCtxtPtr exec)3145 xmlFARegDebugExec(xmlRegExecCtxtPtr exec) {
3146 printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index);
3147 if (exec->inputStack != NULL) {
3148 int i;
3149 printf(": ");
3150 for (i = 0;(i < 3) && (i < exec->inputStackNr);i++)
3151 printf("%s ", (const char *)
3152 exec->inputStack[exec->inputStackNr - (i + 1)].value);
3153 } else {
3154 printf(": %s", &(exec->inputString[exec->index]));
3155 }
3156 printf("\n");
3157 }
3158 #endif
3159
3160 static void
xmlFARegExecSave(xmlRegExecCtxtPtr exec)3161 xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
3162 #ifdef DEBUG_REGEXP_EXEC
3163 printf("saving ");
3164 exec->transno++;
3165 xmlFARegDebugExec(exec);
3166 exec->transno--;
3167 #endif
3168 #ifdef MAX_PUSH
3169 if (exec->nbPush > MAX_PUSH) {
3170 return;
3171 }
3172 exec->nbPush++;
3173 #endif
3174
3175 if (exec->maxRollbacks == 0) {
3176 exec->maxRollbacks = 4;
3177 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
3178 sizeof(xmlRegExecRollback));
3179 if (exec->rollbacks == NULL) {
3180 xmlRegexpErrMemory(NULL, "saving regexp");
3181 exec->maxRollbacks = 0;
3182 return;
3183 }
3184 memset(exec->rollbacks, 0,
3185 exec->maxRollbacks * sizeof(xmlRegExecRollback));
3186 } else if (exec->nbRollbacks >= exec->maxRollbacks) {
3187 xmlRegExecRollback *tmp;
3188 int len = exec->maxRollbacks;
3189
3190 exec->maxRollbacks *= 2;
3191 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
3192 exec->maxRollbacks * sizeof(xmlRegExecRollback));
3193 if (tmp == NULL) {
3194 xmlRegexpErrMemory(NULL, "saving regexp");
3195 exec->maxRollbacks /= 2;
3196 return;
3197 }
3198 exec->rollbacks = tmp;
3199 tmp = &exec->rollbacks[len];
3200 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
3201 }
3202 exec->rollbacks[exec->nbRollbacks].state = exec->state;
3203 exec->rollbacks[exec->nbRollbacks].index = exec->index;
3204 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
3205 if (exec->comp->nbCounters > 0) {
3206 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3207 exec->rollbacks[exec->nbRollbacks].counts = (int *)
3208 xmlMalloc(exec->comp->nbCounters * sizeof(int));
3209 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3210 xmlRegexpErrMemory(NULL, "saving regexp");
3211 exec->status = -5;
3212 return;
3213 }
3214 }
3215 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
3216 exec->comp->nbCounters * sizeof(int));
3217 }
3218 exec->nbRollbacks++;
3219 }
3220
3221 static void
xmlFARegExecRollBack(xmlRegExecCtxtPtr exec)3222 xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
3223 if (exec->nbRollbacks <= 0) {
3224 exec->status = -1;
3225 #ifdef DEBUG_REGEXP_EXEC
3226 printf("rollback failed on empty stack\n");
3227 #endif
3228 return;
3229 }
3230 exec->nbRollbacks--;
3231 exec->state = exec->rollbacks[exec->nbRollbacks].state;
3232 exec->index = exec->rollbacks[exec->nbRollbacks].index;
3233 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
3234 if (exec->comp->nbCounters > 0) {
3235 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3236 fprintf(stderr, "exec save: allocation failed");
3237 exec->status = -6;
3238 return;
3239 }
3240 if (exec->counts) {
3241 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
3242 exec->comp->nbCounters * sizeof(int));
3243 }
3244 }
3245
3246 #ifdef DEBUG_REGEXP_EXEC
3247 printf("restored ");
3248 xmlFARegDebugExec(exec);
3249 #endif
3250 }
3251
3252 /************************************************************************
3253 * *
3254 * Verifier, running an input against a compiled regexp *
3255 * *
3256 ************************************************************************/
3257
3258 static int
xmlFARegExec(xmlRegexpPtr comp,const xmlChar * content)3259 xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
3260 xmlRegExecCtxt execval;
3261 xmlRegExecCtxtPtr exec = &execval;
3262 int ret, codepoint = 0, len, deter;
3263
3264 exec->inputString = content;
3265 exec->index = 0;
3266 exec->nbPush = 0;
3267 exec->determinist = 1;
3268 exec->maxRollbacks = 0;
3269 exec->nbRollbacks = 0;
3270 exec->rollbacks = NULL;
3271 exec->status = 0;
3272 exec->comp = comp;
3273 exec->state = comp->states[0];
3274 exec->transno = 0;
3275 exec->transcount = 0;
3276 exec->inputStack = NULL;
3277 exec->inputStackMax = 0;
3278 if (comp->nbCounters > 0) {
3279 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
3280 if (exec->counts == NULL) {
3281 xmlRegexpErrMemory(NULL, "running regexp");
3282 return(-1);
3283 }
3284 memset(exec->counts, 0, comp->nbCounters * sizeof(int));
3285 } else
3286 exec->counts = NULL;
3287 while ((exec->status == 0) && (exec->state != NULL) &&
3288 ((exec->inputString[exec->index] != 0) ||
3289 ((exec->state != NULL) &&
3290 (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3291 xmlRegTransPtr trans;
3292 xmlRegAtomPtr atom;
3293
3294 /*
3295 * If end of input on non-terminal state, rollback, however we may
3296 * still have epsilon like transition for counted transitions
3297 * on counters, in that case don't break too early. Additionally,
3298 * if we are working on a range like "AB{0,2}", where B is not present,
3299 * we don't want to break.
3300 */
3301 len = 1;
3302 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) {
3303 /*
3304 * if there is a transition, we must check if
3305 * atom allows minOccurs of 0
3306 */
3307 if (exec->transno < exec->state->nbTrans) {
3308 trans = &exec->state->trans[exec->transno];
3309 if (trans->to >=0) {
3310 atom = trans->atom;
3311 if (!((atom->min == 0) && (atom->max > 0)))
3312 goto rollback;
3313 }
3314 } else
3315 goto rollback;
3316 }
3317
3318 exec->transcount = 0;
3319 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3320 trans = &exec->state->trans[exec->transno];
3321 if (trans->to < 0)
3322 continue;
3323 atom = trans->atom;
3324 ret = 0;
3325 deter = 1;
3326 if (trans->count >= 0) {
3327 int count;
3328 xmlRegCounterPtr counter;
3329
3330 if (exec->counts == NULL) {
3331 exec->status = -1;
3332 goto error;
3333 }
3334 /*
3335 * A counted transition.
3336 */
3337
3338 count = exec->counts[trans->count];
3339 counter = &exec->comp->counters[trans->count];
3340 #ifdef DEBUG_REGEXP_EXEC
3341 printf("testing count %d: val %d, min %d, max %d\n",
3342 trans->count, count, counter->min, counter->max);
3343 #endif
3344 ret = ((count >= counter->min) && (count <= counter->max));
3345 if ((ret) && (counter->min != counter->max))
3346 deter = 0;
3347 } else if (atom == NULL) {
3348 fprintf(stderr, "epsilon transition left at runtime\n");
3349 exec->status = -2;
3350 break;
3351 } else if (exec->inputString[exec->index] != 0) {
3352 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
3353 ret = xmlRegCheckCharacter(atom, codepoint);
3354 if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) {
3355 xmlRegStatePtr to = comp->states[trans->to];
3356
3357 /*
3358 * this is a multiple input sequence
3359 * If there is a counter associated increment it now.
3360 * before potentially saving and rollback
3361 * do not increment if the counter is already over the
3362 * maximum limit in which case get to next transition
3363 */
3364 if (trans->counter >= 0) {
3365 xmlRegCounterPtr counter;
3366
3367 if ((exec->counts == NULL) ||
3368 (exec->comp == NULL) ||
3369 (exec->comp->counters == NULL)) {
3370 exec->status = -1;
3371 goto error;
3372 }
3373 counter = &exec->comp->counters[trans->counter];
3374 if (exec->counts[trans->counter] >= counter->max)
3375 continue; /* for loop on transitions */
3376
3377 #ifdef DEBUG_REGEXP_EXEC
3378 printf("Increasing count %d\n", trans->counter);
3379 #endif
3380 exec->counts[trans->counter]++;
3381 }
3382 if (exec->state->nbTrans > exec->transno + 1) {
3383 xmlFARegExecSave(exec);
3384 }
3385 exec->transcount = 1;
3386 do {
3387 /*
3388 * Try to progress as much as possible on the input
3389 */
3390 if (exec->transcount == atom->max) {
3391 break;
3392 }
3393 exec->index += len;
3394 /*
3395 * End of input: stop here
3396 */
3397 if (exec->inputString[exec->index] == 0) {
3398 exec->index -= len;
3399 break;
3400 }
3401 if (exec->transcount >= atom->min) {
3402 int transno = exec->transno;
3403 xmlRegStatePtr state = exec->state;
3404
3405 /*
3406 * The transition is acceptable save it
3407 */
3408 exec->transno = -1; /* trick */
3409 exec->state = to;
3410 xmlFARegExecSave(exec);
3411 exec->transno = transno;
3412 exec->state = state;
3413 }
3414 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
3415 len);
3416 ret = xmlRegCheckCharacter(atom, codepoint);
3417 exec->transcount++;
3418 } while (ret == 1);
3419 if (exec->transcount < atom->min)
3420 ret = 0;
3421
3422 /*
3423 * If the last check failed but one transition was found
3424 * possible, rollback
3425 */
3426 if (ret < 0)
3427 ret = 0;
3428 if (ret == 0) {
3429 goto rollback;
3430 }
3431 if (trans->counter >= 0) {
3432 if (exec->counts == NULL) {
3433 exec->status = -1;
3434 goto error;
3435 }
3436 #ifdef DEBUG_REGEXP_EXEC
3437 printf("Decreasing count %d\n", trans->counter);
3438 #endif
3439 exec->counts[trans->counter]--;
3440 }
3441 } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) {
3442 /*
3443 * we don't match on the codepoint, but minOccurs of 0
3444 * says that's ok. Setting len to 0 inhibits stepping
3445 * over the codepoint.
3446 */
3447 exec->transcount = 1;
3448 len = 0;
3449 ret = 1;
3450 }
3451 } else if ((atom->min == 0) && (atom->max > 0)) {
3452 /* another spot to match when minOccurs is 0 */
3453 exec->transcount = 1;
3454 len = 0;
3455 ret = 1;
3456 }
3457 if (ret == 1) {
3458 if ((trans->nd == 1) ||
3459 ((trans->count >= 0) && (deter == 0) &&
3460 (exec->state->nbTrans > exec->transno + 1))) {
3461 #ifdef DEBUG_REGEXP_EXEC
3462 if (trans->nd == 1)
3463 printf("Saving on nd transition atom %d for %c at %d\n",
3464 trans->atom->no, codepoint, exec->index);
3465 else
3466 printf("Saving on counted transition count %d for %c at %d\n",
3467 trans->count, codepoint, exec->index);
3468 #endif
3469 xmlFARegExecSave(exec);
3470 }
3471 if (trans->counter >= 0) {
3472 xmlRegCounterPtr counter;
3473
3474 /* make sure we don't go over the counter maximum value */
3475 if ((exec->counts == NULL) ||
3476 (exec->comp == NULL) ||
3477 (exec->comp->counters == NULL)) {
3478 exec->status = -1;
3479 goto error;
3480 }
3481 counter = &exec->comp->counters[trans->counter];
3482 if (exec->counts[trans->counter] >= counter->max)
3483 continue; /* for loop on transitions */
3484 #ifdef DEBUG_REGEXP_EXEC
3485 printf("Increasing count %d\n", trans->counter);
3486 #endif
3487 exec->counts[trans->counter]++;
3488 }
3489 if ((trans->count >= 0) &&
3490 (trans->count < REGEXP_ALL_COUNTER)) {
3491 if (exec->counts == NULL) {
3492 exec->status = -1;
3493 goto error;
3494 }
3495 #ifdef DEBUG_REGEXP_EXEC
3496 printf("resetting count %d on transition\n",
3497 trans->count);
3498 #endif
3499 exec->counts[trans->count] = 0;
3500 }
3501 #ifdef DEBUG_REGEXP_EXEC
3502 printf("entering state %d\n", trans->to);
3503 #endif
3504 exec->state = comp->states[trans->to];
3505 exec->transno = 0;
3506 if (trans->atom != NULL) {
3507 exec->index += len;
3508 }
3509 goto progress;
3510 } else if (ret < 0) {
3511 exec->status = -4;
3512 break;
3513 }
3514 }
3515 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3516 rollback:
3517 /*
3518 * Failed to find a way out
3519 */
3520 exec->determinist = 0;
3521 #ifdef DEBUG_REGEXP_EXEC
3522 printf("rollback from state %d on %d:%c\n", exec->state->no,
3523 codepoint,codepoint);
3524 #endif
3525 xmlFARegExecRollBack(exec);
3526 }
3527 progress:
3528 continue;
3529 }
3530 error:
3531 if (exec->rollbacks != NULL) {
3532 if (exec->counts != NULL) {
3533 int i;
3534
3535 for (i = 0;i < exec->maxRollbacks;i++)
3536 if (exec->rollbacks[i].counts != NULL)
3537 xmlFree(exec->rollbacks[i].counts);
3538 }
3539 xmlFree(exec->rollbacks);
3540 }
3541 if (exec->state == NULL)
3542 return(-1);
3543 if (exec->counts != NULL)
3544 xmlFree(exec->counts);
3545 if (exec->status == 0)
3546 return(1);
3547 if (exec->status == -1) {
3548 if (exec->nbPush > MAX_PUSH)
3549 return(-1);
3550 return(0);
3551 }
3552 return(exec->status);
3553 }
3554
3555 /************************************************************************
3556 * *
3557 * Progressive interface to the verifier one atom at a time *
3558 * *
3559 ************************************************************************/
3560 #ifdef DEBUG_ERR
3561 static void testerr(xmlRegExecCtxtPtr exec);
3562 #endif
3563
3564 /**
3565 * xmlRegNewExecCtxt:
3566 * @comp: a precompiled regular expression
3567 * @callback: a callback function used for handling progresses in the
3568 * automata matching phase
3569 * @data: the context data associated to the callback in this context
3570 *
3571 * Build a context used for progressive evaluation of a regexp.
3572 *
3573 * Returns the new context
3574 */
3575 xmlRegExecCtxtPtr
xmlRegNewExecCtxt(xmlRegexpPtr comp,xmlRegExecCallbacks callback,void * data)3576 xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
3577 xmlRegExecCtxtPtr exec;
3578
3579 if (comp == NULL)
3580 return(NULL);
3581 if ((comp->compact == NULL) && (comp->states == NULL))
3582 return(NULL);
3583 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
3584 if (exec == NULL) {
3585 xmlRegexpErrMemory(NULL, "creating execution context");
3586 return(NULL);
3587 }
3588 memset(exec, 0, sizeof(xmlRegExecCtxt));
3589 exec->inputString = NULL;
3590 exec->index = 0;
3591 exec->determinist = 1;
3592 exec->maxRollbacks = 0;
3593 exec->nbRollbacks = 0;
3594 exec->rollbacks = NULL;
3595 exec->status = 0;
3596 exec->comp = comp;
3597 if (comp->compact == NULL)
3598 exec->state = comp->states[0];
3599 exec->transno = 0;
3600 exec->transcount = 0;
3601 exec->callback = callback;
3602 exec->data = data;
3603 if (comp->nbCounters > 0) {
3604 /*
3605 * For error handling, exec->counts is allocated twice the size
3606 * the second half is used to store the data in case of rollback
3607 */
3608 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)
3609 * 2);
3610 if (exec->counts == NULL) {
3611 xmlRegexpErrMemory(NULL, "creating execution context");
3612 xmlFree(exec);
3613 return(NULL);
3614 }
3615 memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2);
3616 exec->errCounts = &exec->counts[comp->nbCounters];
3617 } else {
3618 exec->counts = NULL;
3619 exec->errCounts = NULL;
3620 }
3621 exec->inputStackMax = 0;
3622 exec->inputStackNr = 0;
3623 exec->inputStack = NULL;
3624 exec->errStateNo = -1;
3625 exec->errString = NULL;
3626 exec->nbPush = 0;
3627 return(exec);
3628 }
3629
3630 /**
3631 * xmlRegFreeExecCtxt:
3632 * @exec: a regular expression evaluation context
3633 *
3634 * Free the structures associated to a regular expression evaluation context.
3635 */
3636 void
xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec)3637 xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
3638 if (exec == NULL)
3639 return;
3640
3641 if (exec->rollbacks != NULL) {
3642 if (exec->counts != NULL) {
3643 int i;
3644
3645 for (i = 0;i < exec->maxRollbacks;i++)
3646 if (exec->rollbacks[i].counts != NULL)
3647 xmlFree(exec->rollbacks[i].counts);
3648 }
3649 xmlFree(exec->rollbacks);
3650 }
3651 if (exec->counts != NULL)
3652 xmlFree(exec->counts);
3653 if (exec->inputStack != NULL) {
3654 int i;
3655
3656 for (i = 0;i < exec->inputStackNr;i++) {
3657 if (exec->inputStack[i].value != NULL)
3658 xmlFree(exec->inputStack[i].value);
3659 }
3660 xmlFree(exec->inputStack);
3661 }
3662 if (exec->errString != NULL)
3663 xmlFree(exec->errString);
3664 xmlFree(exec);
3665 }
3666
3667 static void
xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec,const xmlChar * value,void * data)3668 xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
3669 void *data) {
3670 #ifdef DEBUG_PUSH
3671 printf("saving value: %d:%s\n", exec->inputStackNr, value);
3672 #endif
3673 if (exec->inputStackMax == 0) {
3674 exec->inputStackMax = 4;
3675 exec->inputStack = (xmlRegInputTokenPtr)
3676 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken));
3677 if (exec->inputStack == NULL) {
3678 xmlRegexpErrMemory(NULL, "pushing input string");
3679 exec->inputStackMax = 0;
3680 return;
3681 }
3682 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) {
3683 xmlRegInputTokenPtr tmp;
3684
3685 exec->inputStackMax *= 2;
3686 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack,
3687 exec->inputStackMax * sizeof(xmlRegInputToken));
3688 if (tmp == NULL) {
3689 xmlRegexpErrMemory(NULL, "pushing input string");
3690 exec->inputStackMax /= 2;
3691 return;
3692 }
3693 exec->inputStack = tmp;
3694 }
3695 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
3696 exec->inputStack[exec->inputStackNr].data = data;
3697 exec->inputStackNr++;
3698 exec->inputStack[exec->inputStackNr].value = NULL;
3699 exec->inputStack[exec->inputStackNr].data = NULL;
3700 }
3701
3702 /**
3703 * xmlRegStrEqualWildcard:
3704 * @expStr: the string to be evaluated
3705 * @valStr: the validation string
3706 *
3707 * Checks if both strings are equal or have the same content. "*"
3708 * can be used as a wildcard in @valStr; "|" is used as a separator of
3709 * substrings in both @expStr and @valStr.
3710 *
3711 * Returns 1 if the comparison is satisfied and the number of substrings
3712 * is equal, 0 otherwise.
3713 */
3714
3715 static int
xmlRegStrEqualWildcard(const xmlChar * expStr,const xmlChar * valStr)3716 xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) {
3717 if (expStr == valStr) return(1);
3718 if (expStr == NULL) return(0);
3719 if (valStr == NULL) return(0);
3720 do {
3721 /*
3722 * Eval if we have a wildcard for the current item.
3723 */
3724 if (*expStr != *valStr) {
3725 /* if one of them starts with a wildcard make valStr be it */
3726 if (*valStr == '*') {
3727 const xmlChar *tmp;
3728
3729 tmp = valStr;
3730 valStr = expStr;
3731 expStr = tmp;
3732 }
3733 if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) {
3734 do {
3735 if (*valStr == XML_REG_STRING_SEPARATOR)
3736 break;
3737 valStr++;
3738 } while (*valStr != 0);
3739 continue;
3740 } else
3741 return(0);
3742 }
3743 expStr++;
3744 valStr++;
3745 } while (*valStr != 0);
3746 if (*expStr != 0)
3747 return (0);
3748 else
3749 return (1);
3750 }
3751
3752 /**
3753 * xmlRegCompactPushString:
3754 * @exec: a regexp execution context
3755 * @comp: the precompiled exec with a compact table
3756 * @value: a string token input
3757 * @data: data associated to the token to reuse in callbacks
3758 *
3759 * Push one input token in the execution context
3760 *
3761 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3762 * a negative value in case of error.
3763 */
3764 static int
xmlRegCompactPushString(xmlRegExecCtxtPtr exec,xmlRegexpPtr comp,const xmlChar * value,void * data)3765 xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
3766 xmlRegexpPtr comp,
3767 const xmlChar *value,
3768 void *data) {
3769 int state = exec->index;
3770 int i, target;
3771
3772 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
3773 return(-1);
3774
3775 if (value == NULL) {
3776 /*
3777 * are we at a final state ?
3778 */
3779 if (comp->compact[state * (comp->nbstrings + 1)] ==
3780 XML_REGEXP_FINAL_STATE)
3781 return(1);
3782 return(0);
3783 }
3784
3785 #ifdef DEBUG_PUSH
3786 printf("value pushed: %s\n", value);
3787 #endif
3788
3789 /*
3790 * Examine all outside transitions from current state
3791 */
3792 for (i = 0;i < comp->nbstrings;i++) {
3793 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
3794 if ((target > 0) && (target <= comp->nbstates)) {
3795 target--; /* to avoid 0 */
3796 if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) {
3797 exec->index = target;
3798 if ((exec->callback != NULL) && (comp->transdata != NULL)) {
3799 exec->callback(exec->data, value,
3800 comp->transdata[state * comp->nbstrings + i], data);
3801 }
3802 #ifdef DEBUG_PUSH
3803 printf("entering state %d\n", target);
3804 #endif
3805 if (comp->compact[target * (comp->nbstrings + 1)] ==
3806 XML_REGEXP_SINK_STATE)
3807 goto error;
3808
3809 if (comp->compact[target * (comp->nbstrings + 1)] ==
3810 XML_REGEXP_FINAL_STATE)
3811 return(1);
3812 return(0);
3813 }
3814 }
3815 }
3816 /*
3817 * Failed to find an exit transition out from current state for the
3818 * current token
3819 */
3820 #ifdef DEBUG_PUSH
3821 printf("failed to find a transition for %s on state %d\n", value, state);
3822 #endif
3823 error:
3824 if (exec->errString != NULL)
3825 xmlFree(exec->errString);
3826 exec->errString = xmlStrdup(value);
3827 exec->errStateNo = state;
3828 exec->status = -1;
3829 #ifdef DEBUG_ERR
3830 testerr(exec);
3831 #endif
3832 return(-1);
3833 }
3834
3835 /**
3836 * xmlRegExecPushStringInternal:
3837 * @exec: a regexp execution context or NULL to indicate the end
3838 * @value: a string token input
3839 * @data: data associated to the token to reuse in callbacks
3840 * @compound: value was assembled from 2 strings
3841 *
3842 * Push one input token in the execution context
3843 *
3844 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3845 * a negative value in case of error.
3846 */
3847 static int
xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec,const xmlChar * value,void * data,int compound)3848 xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value,
3849 void *data, int compound) {
3850 xmlRegTransPtr trans;
3851 xmlRegAtomPtr atom;
3852 int ret;
3853 int final = 0;
3854 int progress = 1;
3855
3856 if (exec == NULL)
3857 return(-1);
3858 if (exec->comp == NULL)
3859 return(-1);
3860 if (exec->status != 0)
3861 return(exec->status);
3862
3863 if (exec->comp->compact != NULL)
3864 return(xmlRegCompactPushString(exec, exec->comp, value, data));
3865
3866 if (value == NULL) {
3867 if (exec->state->type == XML_REGEXP_FINAL_STATE)
3868 return(1);
3869 final = 1;
3870 }
3871
3872 #ifdef DEBUG_PUSH
3873 printf("value pushed: %s\n", value);
3874 #endif
3875 /*
3876 * If we have an active rollback stack push the new value there
3877 * and get back to where we were left
3878 */
3879 if ((value != NULL) && (exec->inputStackNr > 0)) {
3880 xmlFARegExecSaveInputString(exec, value, data);
3881 value = exec->inputStack[exec->index].value;
3882 data = exec->inputStack[exec->index].data;
3883 #ifdef DEBUG_PUSH
3884 printf("value loaded: %s\n", value);
3885 #endif
3886 }
3887
3888 while ((exec->status == 0) &&
3889 ((value != NULL) ||
3890 ((final == 1) &&
3891 (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3892
3893 /*
3894 * End of input on non-terminal state, rollback, however we may
3895 * still have epsilon like transition for counted transitions
3896 * on counters, in that case don't break too early.
3897 */
3898 if ((value == NULL) && (exec->counts == NULL))
3899 goto rollback;
3900
3901 exec->transcount = 0;
3902 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3903 trans = &exec->state->trans[exec->transno];
3904 if (trans->to < 0)
3905 continue;
3906 atom = trans->atom;
3907 ret = 0;
3908 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
3909 int i;
3910 int count;
3911 xmlRegTransPtr t;
3912 xmlRegCounterPtr counter;
3913
3914 ret = 0;
3915
3916 #ifdef DEBUG_PUSH
3917 printf("testing all lax %d\n", trans->count);
3918 #endif
3919 /*
3920 * Check all counted transitions from the current state
3921 */
3922 if ((value == NULL) && (final)) {
3923 ret = 1;
3924 } else if (value != NULL) {
3925 for (i = 0;i < exec->state->nbTrans;i++) {
3926 t = &exec->state->trans[i];
3927 if ((t->counter < 0) || (t == trans))
3928 continue;
3929 counter = &exec->comp->counters[t->counter];
3930 count = exec->counts[t->counter];
3931 if ((count < counter->max) &&
3932 (t->atom != NULL) &&
3933 (xmlStrEqual(value, t->atom->valuep))) {
3934 ret = 0;
3935 break;
3936 }
3937 if ((count >= counter->min) &&
3938 (count < counter->max) &&
3939 (t->atom != NULL) &&
3940 (xmlStrEqual(value, t->atom->valuep))) {
3941 ret = 1;
3942 break;
3943 }
3944 }
3945 }
3946 } else if (trans->count == REGEXP_ALL_COUNTER) {
3947 int i;
3948 int count;
3949 xmlRegTransPtr t;
3950 xmlRegCounterPtr counter;
3951
3952 ret = 1;
3953
3954 #ifdef DEBUG_PUSH
3955 printf("testing all %d\n", trans->count);
3956 #endif
3957 /*
3958 * Check all counted transitions from the current state
3959 */
3960 for (i = 0;i < exec->state->nbTrans;i++) {
3961 t = &exec->state->trans[i];
3962 if ((t->counter < 0) || (t == trans))
3963 continue;
3964 counter = &exec->comp->counters[t->counter];
3965 count = exec->counts[t->counter];
3966 if ((count < counter->min) || (count > counter->max)) {
3967 ret = 0;
3968 break;
3969 }
3970 }
3971 } else if (trans->count >= 0) {
3972 int count;
3973 xmlRegCounterPtr counter;
3974
3975 /*
3976 * A counted transition.
3977 */
3978
3979 count = exec->counts[trans->count];
3980 counter = &exec->comp->counters[trans->count];
3981 #ifdef DEBUG_PUSH
3982 printf("testing count %d: val %d, min %d, max %d\n",
3983 trans->count, count, counter->min, counter->max);
3984 #endif
3985 ret = ((count >= counter->min) && (count <= counter->max));
3986 } else if (atom == NULL) {
3987 fprintf(stderr, "epsilon transition left at runtime\n");
3988 exec->status = -2;
3989 break;
3990 } else if (value != NULL) {
3991 ret = xmlRegStrEqualWildcard(atom->valuep, value);
3992 if (atom->neg) {
3993 ret = !ret;
3994 if (!compound)
3995 ret = 0;
3996 }
3997 if ((ret == 1) && (trans->counter >= 0)) {
3998 xmlRegCounterPtr counter;
3999 int count;
4000
4001 count = exec->counts[trans->counter];
4002 counter = &exec->comp->counters[trans->counter];
4003 if (count >= counter->max)
4004 ret = 0;
4005 }
4006
4007 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
4008 xmlRegStatePtr to = exec->comp->states[trans->to];
4009
4010 /*
4011 * this is a multiple input sequence
4012 */
4013 if (exec->state->nbTrans > exec->transno + 1) {
4014 if (exec->inputStackNr <= 0) {
4015 xmlFARegExecSaveInputString(exec, value, data);
4016 }
4017 xmlFARegExecSave(exec);
4018 }
4019 exec->transcount = 1;
4020 do {
4021 /*
4022 * Try to progress as much as possible on the input
4023 */
4024 if (exec->transcount == atom->max) {
4025 break;
4026 }
4027 exec->index++;
4028 value = exec->inputStack[exec->index].value;
4029 data = exec->inputStack[exec->index].data;
4030 #ifdef DEBUG_PUSH
4031 printf("value loaded: %s\n", value);
4032 #endif
4033
4034 /*
4035 * End of input: stop here
4036 */
4037 if (value == NULL) {
4038 exec->index --;
4039 break;
4040 }
4041 if (exec->transcount >= atom->min) {
4042 int transno = exec->transno;
4043 xmlRegStatePtr state = exec->state;
4044
4045 /*
4046 * The transition is acceptable save it
4047 */
4048 exec->transno = -1; /* trick */
4049 exec->state = to;
4050 if (exec->inputStackNr <= 0) {
4051 xmlFARegExecSaveInputString(exec, value, data);
4052 }
4053 xmlFARegExecSave(exec);
4054 exec->transno = transno;
4055 exec->state = state;
4056 }
4057 ret = xmlStrEqual(value, atom->valuep);
4058 exec->transcount++;
4059 } while (ret == 1);
4060 if (exec->transcount < atom->min)
4061 ret = 0;
4062
4063 /*
4064 * If the last check failed but one transition was found
4065 * possible, rollback
4066 */
4067 if (ret < 0)
4068 ret = 0;
4069 if (ret == 0) {
4070 goto rollback;
4071 }
4072 }
4073 }
4074 if (ret == 1) {
4075 if ((exec->callback != NULL) && (atom != NULL) &&
4076 (data != NULL)) {
4077 exec->callback(exec->data, atom->valuep,
4078 atom->data, data);
4079 }
4080 if (exec->state->nbTrans > exec->transno + 1) {
4081 if (exec->inputStackNr <= 0) {
4082 xmlFARegExecSaveInputString(exec, value, data);
4083 }
4084 xmlFARegExecSave(exec);
4085 }
4086 if (trans->counter >= 0) {
4087 #ifdef DEBUG_PUSH
4088 printf("Increasing count %d\n", trans->counter);
4089 #endif
4090 exec->counts[trans->counter]++;
4091 }
4092 if ((trans->count >= 0) &&
4093 (trans->count < REGEXP_ALL_COUNTER)) {
4094 #ifdef DEBUG_REGEXP_EXEC
4095 printf("resetting count %d on transition\n",
4096 trans->count);
4097 #endif
4098 exec->counts[trans->count] = 0;
4099 }
4100 #ifdef DEBUG_PUSH
4101 printf("entering state %d\n", trans->to);
4102 #endif
4103 if ((exec->comp->states[trans->to] != NULL) &&
4104 (exec->comp->states[trans->to]->type ==
4105 XML_REGEXP_SINK_STATE)) {
4106 /*
4107 * entering a sink state, save the current state as error
4108 * state.
4109 */
4110 if (exec->errString != NULL)
4111 xmlFree(exec->errString);
4112 exec->errString = xmlStrdup(value);
4113 exec->errState = exec->state;
4114 memcpy(exec->errCounts, exec->counts,
4115 exec->comp->nbCounters * sizeof(int));
4116 }
4117 exec->state = exec->comp->states[trans->to];
4118 exec->transno = 0;
4119 if (trans->atom != NULL) {
4120 if (exec->inputStack != NULL) {
4121 exec->index++;
4122 if (exec->index < exec->inputStackNr) {
4123 value = exec->inputStack[exec->index].value;
4124 data = exec->inputStack[exec->index].data;
4125 #ifdef DEBUG_PUSH
4126 printf("value loaded: %s\n", value);
4127 #endif
4128 } else {
4129 value = NULL;
4130 data = NULL;
4131 #ifdef DEBUG_PUSH
4132 printf("end of input\n");
4133 #endif
4134 }
4135 } else {
4136 value = NULL;
4137 data = NULL;
4138 #ifdef DEBUG_PUSH
4139 printf("end of input\n");
4140 #endif
4141 }
4142 }
4143 goto progress;
4144 } else if (ret < 0) {
4145 exec->status = -4;
4146 break;
4147 }
4148 }
4149 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
4150 rollback:
4151 /*
4152 * if we didn't yet rollback on the current input
4153 * store the current state as the error state.
4154 */
4155 if ((progress) && (exec->state != NULL) &&
4156 (exec->state->type != XML_REGEXP_SINK_STATE)) {
4157 progress = 0;
4158 if (exec->errString != NULL)
4159 xmlFree(exec->errString);
4160 exec->errString = xmlStrdup(value);
4161 exec->errState = exec->state;
4162 if (exec->comp->nbCounters)
4163 memcpy(exec->errCounts, exec->counts,
4164 exec->comp->nbCounters * sizeof(int));
4165 }
4166
4167 /*
4168 * Failed to find a way out
4169 */
4170 exec->determinist = 0;
4171 xmlFARegExecRollBack(exec);
4172 if ((exec->inputStack != NULL ) && (exec->status == 0)) {
4173 value = exec->inputStack[exec->index].value;
4174 data = exec->inputStack[exec->index].data;
4175 #ifdef DEBUG_PUSH
4176 printf("value loaded: %s\n", value);
4177 #endif
4178 }
4179 }
4180 continue;
4181 progress:
4182 progress = 1;
4183 continue;
4184 }
4185 if (exec->status == 0) {
4186 return(exec->state->type == XML_REGEXP_FINAL_STATE);
4187 }
4188 #ifdef DEBUG_ERR
4189 if (exec->status < 0) {
4190 testerr(exec);
4191 }
4192 #endif
4193 return(exec->status);
4194 }
4195
4196 /**
4197 * xmlRegExecPushString:
4198 * @exec: a regexp execution context or NULL to indicate the end
4199 * @value: a string token input
4200 * @data: data associated to the token to reuse in callbacks
4201 *
4202 * Push one input token in the execution context
4203 *
4204 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4205 * a negative value in case of error.
4206 */
4207 int
xmlRegExecPushString(xmlRegExecCtxtPtr exec,const xmlChar * value,void * data)4208 xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
4209 void *data) {
4210 return(xmlRegExecPushStringInternal(exec, value, data, 0));
4211 }
4212
4213 /**
4214 * xmlRegExecPushString2:
4215 * @exec: a regexp execution context or NULL to indicate the end
4216 * @value: the first string token input
4217 * @value2: the second string token input
4218 * @data: data associated to the token to reuse in callbacks
4219 *
4220 * Push one input token in the execution context
4221 *
4222 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4223 * a negative value in case of error.
4224 */
4225 int
xmlRegExecPushString2(xmlRegExecCtxtPtr exec,const xmlChar * value,const xmlChar * value2,void * data)4226 xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
4227 const xmlChar *value2, void *data) {
4228 xmlChar buf[150];
4229 int lenn, lenp, ret;
4230 xmlChar *str;
4231
4232 if (exec == NULL)
4233 return(-1);
4234 if (exec->comp == NULL)
4235 return(-1);
4236 if (exec->status != 0)
4237 return(exec->status);
4238
4239 if (value2 == NULL)
4240 return(xmlRegExecPushString(exec, value, data));
4241
4242 lenn = strlen((char *) value2);
4243 lenp = strlen((char *) value);
4244
4245 if (150 < lenn + lenp + 2) {
4246 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
4247 if (str == NULL) {
4248 exec->status = -1;
4249 return(-1);
4250 }
4251 } else {
4252 str = buf;
4253 }
4254 memcpy(&str[0], value, lenp);
4255 str[lenp] = XML_REG_STRING_SEPARATOR;
4256 memcpy(&str[lenp + 1], value2, lenn);
4257 str[lenn + lenp + 1] = 0;
4258
4259 if (exec->comp->compact != NULL)
4260 ret = xmlRegCompactPushString(exec, exec->comp, str, data);
4261 else
4262 ret = xmlRegExecPushStringInternal(exec, str, data, 1);
4263
4264 if (str != buf)
4265 xmlFree(str);
4266 return(ret);
4267 }
4268
4269 /**
4270 * xmlRegExecGetValues:
4271 * @exec: a regexp execution context
4272 * @err: error extraction or normal one
4273 * @nbval: pointer to the number of accepted values IN/OUT
4274 * @nbneg: return number of negative transitions
4275 * @values: pointer to the array of acceptable values
4276 * @terminal: return value if this was a terminal state
4277 *
4278 * Extract information from the regexp execution, internal routine to
4279 * implement xmlRegExecNextValues() and xmlRegExecErrInfo()
4280 *
4281 * Returns: 0 in case of success or -1 in case of error.
4282 */
4283 static int
xmlRegExecGetValues(xmlRegExecCtxtPtr exec,int err,int * nbval,int * nbneg,xmlChar ** values,int * terminal)4284 xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err,
4285 int *nbval, int *nbneg,
4286 xmlChar **values, int *terminal) {
4287 int maxval;
4288 int nb = 0;
4289
4290 if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) ||
4291 (values == NULL) || (*nbval <= 0))
4292 return(-1);
4293
4294 maxval = *nbval;
4295 *nbval = 0;
4296 *nbneg = 0;
4297 if ((exec->comp != NULL) && (exec->comp->compact != NULL)) {
4298 xmlRegexpPtr comp;
4299 int target, i, state;
4300
4301 comp = exec->comp;
4302
4303 if (err) {
4304 if (exec->errStateNo == -1) return(-1);
4305 state = exec->errStateNo;
4306 } else {
4307 state = exec->index;
4308 }
4309 if (terminal != NULL) {
4310 if (comp->compact[state * (comp->nbstrings + 1)] ==
4311 XML_REGEXP_FINAL_STATE)
4312 *terminal = 1;
4313 else
4314 *terminal = 0;
4315 }
4316 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4317 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4318 if ((target > 0) && (target <= comp->nbstates) &&
4319 (comp->compact[(target - 1) * (comp->nbstrings + 1)] !=
4320 XML_REGEXP_SINK_STATE)) {
4321 values[nb++] = comp->stringMap[i];
4322 (*nbval)++;
4323 }
4324 }
4325 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4326 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4327 if ((target > 0) && (target <= comp->nbstates) &&
4328 (comp->compact[(target - 1) * (comp->nbstrings + 1)] ==
4329 XML_REGEXP_SINK_STATE)) {
4330 values[nb++] = comp->stringMap[i];
4331 (*nbneg)++;
4332 }
4333 }
4334 } else {
4335 int transno;
4336 xmlRegTransPtr trans;
4337 xmlRegAtomPtr atom;
4338 xmlRegStatePtr state;
4339
4340 if (terminal != NULL) {
4341 if (exec->state->type == XML_REGEXP_FINAL_STATE)
4342 *terminal = 1;
4343 else
4344 *terminal = 0;
4345 }
4346
4347 if (err) {
4348 if (exec->errState == NULL) return(-1);
4349 state = exec->errState;
4350 } else {
4351 if (exec->state == NULL) return(-1);
4352 state = exec->state;
4353 }
4354 for (transno = 0;
4355 (transno < state->nbTrans) && (nb < maxval);
4356 transno++) {
4357 trans = &state->trans[transno];
4358 if (trans->to < 0)
4359 continue;
4360 atom = trans->atom;
4361 if ((atom == NULL) || (atom->valuep == NULL))
4362 continue;
4363 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4364 /* this should not be reached but ... */
4365 TODO;
4366 } else if (trans->count == REGEXP_ALL_COUNTER) {
4367 /* this should not be reached but ... */
4368 TODO;
4369 } else if (trans->counter >= 0) {
4370 xmlRegCounterPtr counter = NULL;
4371 int count;
4372
4373 if (err)
4374 count = exec->errCounts[trans->counter];
4375 else
4376 count = exec->counts[trans->counter];
4377 if (exec->comp != NULL)
4378 counter = &exec->comp->counters[trans->counter];
4379 if ((counter == NULL) || (count < counter->max)) {
4380 if (atom->neg)
4381 values[nb++] = (xmlChar *) atom->valuep2;
4382 else
4383 values[nb++] = (xmlChar *) atom->valuep;
4384 (*nbval)++;
4385 }
4386 } else {
4387 if ((exec->comp != NULL) && (exec->comp->states[trans->to] != NULL) &&
4388 (exec->comp->states[trans->to]->type !=
4389 XML_REGEXP_SINK_STATE)) {
4390 if (atom->neg)
4391 values[nb++] = (xmlChar *) atom->valuep2;
4392 else
4393 values[nb++] = (xmlChar *) atom->valuep;
4394 (*nbval)++;
4395 }
4396 }
4397 }
4398 for (transno = 0;
4399 (transno < state->nbTrans) && (nb < maxval);
4400 transno++) {
4401 trans = &state->trans[transno];
4402 if (trans->to < 0)
4403 continue;
4404 atom = trans->atom;
4405 if ((atom == NULL) || (atom->valuep == NULL))
4406 continue;
4407 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4408 continue;
4409 } else if (trans->count == REGEXP_ALL_COUNTER) {
4410 continue;
4411 } else if (trans->counter >= 0) {
4412 continue;
4413 } else {
4414 if ((exec->comp->states[trans->to] != NULL) &&
4415 (exec->comp->states[trans->to]->type ==
4416 XML_REGEXP_SINK_STATE)) {
4417 if (atom->neg)
4418 values[nb++] = (xmlChar *) atom->valuep2;
4419 else
4420 values[nb++] = (xmlChar *) atom->valuep;
4421 (*nbneg)++;
4422 }
4423 }
4424 }
4425 }
4426 return(0);
4427 }
4428
4429 /**
4430 * xmlRegExecNextValues:
4431 * @exec: a regexp execution context
4432 * @nbval: pointer to the number of accepted values IN/OUT
4433 * @nbneg: return number of negative transitions
4434 * @values: pointer to the array of acceptable values
4435 * @terminal: return value if this was a terminal state
4436 *
4437 * Extract information from the regexp execution,
4438 * the parameter @values must point to an array of @nbval string pointers
4439 * on return nbval will contain the number of possible strings in that
4440 * state and the @values array will be updated with them. The string values
4441 * returned will be freed with the @exec context and don't need to be
4442 * deallocated.
4443 *
4444 * Returns: 0 in case of success or -1 in case of error.
4445 */
4446 int
xmlRegExecNextValues(xmlRegExecCtxtPtr exec,int * nbval,int * nbneg,xmlChar ** values,int * terminal)4447 xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg,
4448 xmlChar **values, int *terminal) {
4449 return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal));
4450 }
4451
4452 /**
4453 * xmlRegExecErrInfo:
4454 * @exec: a regexp execution context generating an error
4455 * @string: return value for the error string
4456 * @nbval: pointer to the number of accepted values IN/OUT
4457 * @nbneg: return number of negative transitions
4458 * @values: pointer to the array of acceptable values
4459 * @terminal: return value if this was a terminal state
4460 *
4461 * Extract error information from the regexp execution, the parameter
4462 * @string will be updated with the value pushed and not accepted,
4463 * the parameter @values must point to an array of @nbval string pointers
4464 * on return nbval will contain the number of possible strings in that
4465 * state and the @values array will be updated with them. The string values
4466 * returned will be freed with the @exec context and don't need to be
4467 * deallocated.
4468 *
4469 * Returns: 0 in case of success or -1 in case of error.
4470 */
4471 int
xmlRegExecErrInfo(xmlRegExecCtxtPtr exec,const xmlChar ** string,int * nbval,int * nbneg,xmlChar ** values,int * terminal)4472 xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string,
4473 int *nbval, int *nbneg, xmlChar **values, int *terminal) {
4474 if (exec == NULL)
4475 return(-1);
4476 if (string != NULL) {
4477 if (exec->status != 0)
4478 *string = exec->errString;
4479 else
4480 *string = NULL;
4481 }
4482 return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal));
4483 }
4484
4485 #ifdef DEBUG_ERR
testerr(xmlRegExecCtxtPtr exec)4486 static void testerr(xmlRegExecCtxtPtr exec) {
4487 const xmlChar *string;
4488 xmlChar *values[5];
4489 int nb = 5;
4490 int nbneg;
4491 int terminal;
4492 xmlRegExecErrInfo(exec, &string, &nb, &nbneg, &values[0], &terminal);
4493 }
4494 #endif
4495
4496 #if 0
4497 static int
4498 xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
4499 xmlRegTransPtr trans;
4500 xmlRegAtomPtr atom;
4501 int ret;
4502 int codepoint, len;
4503
4504 if (exec == NULL)
4505 return(-1);
4506 if (exec->status != 0)
4507 return(exec->status);
4508
4509 while ((exec->status == 0) &&
4510 ((exec->inputString[exec->index] != 0) ||
4511 (exec->state->type != XML_REGEXP_FINAL_STATE))) {
4512
4513 /*
4514 * End of input on non-terminal state, rollback, however we may
4515 * still have epsilon like transition for counted transitions
4516 * on counters, in that case don't break too early.
4517 */
4518 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
4519 goto rollback;
4520
4521 exec->transcount = 0;
4522 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
4523 trans = &exec->state->trans[exec->transno];
4524 if (trans->to < 0)
4525 continue;
4526 atom = trans->atom;
4527 ret = 0;
4528 if (trans->count >= 0) {
4529 int count;
4530 xmlRegCounterPtr counter;
4531
4532 /*
4533 * A counted transition.
4534 */
4535
4536 count = exec->counts[trans->count];
4537 counter = &exec->comp->counters[trans->count];
4538 #ifdef DEBUG_REGEXP_EXEC
4539 printf("testing count %d: val %d, min %d, max %d\n",
4540 trans->count, count, counter->min, counter->max);
4541 #endif
4542 ret = ((count >= counter->min) && (count <= counter->max));
4543 } else if (atom == NULL) {
4544 fprintf(stderr, "epsilon transition left at runtime\n");
4545 exec->status = -2;
4546 break;
4547 } else if (exec->inputString[exec->index] != 0) {
4548 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
4549 ret = xmlRegCheckCharacter(atom, codepoint);
4550 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
4551 xmlRegStatePtr to = exec->comp->states[trans->to];
4552
4553 /*
4554 * this is a multiple input sequence
4555 */
4556 if (exec->state->nbTrans > exec->transno + 1) {
4557 xmlFARegExecSave(exec);
4558 }
4559 exec->transcount = 1;
4560 do {
4561 /*
4562 * Try to progress as much as possible on the input
4563 */
4564 if (exec->transcount == atom->max) {
4565 break;
4566 }
4567 exec->index += len;
4568 /*
4569 * End of input: stop here
4570 */
4571 if (exec->inputString[exec->index] == 0) {
4572 exec->index -= len;
4573 break;
4574 }
4575 if (exec->transcount >= atom->min) {
4576 int transno = exec->transno;
4577 xmlRegStatePtr state = exec->state;
4578
4579 /*
4580 * The transition is acceptable save it
4581 */
4582 exec->transno = -1; /* trick */
4583 exec->state = to;
4584 xmlFARegExecSave(exec);
4585 exec->transno = transno;
4586 exec->state = state;
4587 }
4588 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
4589 len);
4590 ret = xmlRegCheckCharacter(atom, codepoint);
4591 exec->transcount++;
4592 } while (ret == 1);
4593 if (exec->transcount < atom->min)
4594 ret = 0;
4595
4596 /*
4597 * If the last check failed but one transition was found
4598 * possible, rollback
4599 */
4600 if (ret < 0)
4601 ret = 0;
4602 if (ret == 0) {
4603 goto rollback;
4604 }
4605 }
4606 }
4607 if (ret == 1) {
4608 if (exec->state->nbTrans > exec->transno + 1) {
4609 xmlFARegExecSave(exec);
4610 }
4611 /*
4612 * restart count for expressions like this ((abc){2})*
4613 */
4614 if (trans->count >= 0) {
4615 #ifdef DEBUG_REGEXP_EXEC
4616 printf("Reset count %d\n", trans->count);
4617 #endif
4618 exec->counts[trans->count] = 0;
4619 }
4620 if (trans->counter >= 0) {
4621 #ifdef DEBUG_REGEXP_EXEC
4622 printf("Increasing count %d\n", trans->counter);
4623 #endif
4624 exec->counts[trans->counter]++;
4625 }
4626 #ifdef DEBUG_REGEXP_EXEC
4627 printf("entering state %d\n", trans->to);
4628 #endif
4629 exec->state = exec->comp->states[trans->to];
4630 exec->transno = 0;
4631 if (trans->atom != NULL) {
4632 exec->index += len;
4633 }
4634 goto progress;
4635 } else if (ret < 0) {
4636 exec->status = -4;
4637 break;
4638 }
4639 }
4640 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
4641 rollback:
4642 /*
4643 * Failed to find a way out
4644 */
4645 exec->determinist = 0;
4646 xmlFARegExecRollBack(exec);
4647 }
4648 progress:
4649 continue;
4650 }
4651 }
4652 #endif
4653 /************************************************************************
4654 * *
4655 * Parser for the Schemas Datatype Regular Expressions *
4656 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs *
4657 * *
4658 ************************************************************************/
4659
4660 /**
4661 * xmlFAIsChar:
4662 * @ctxt: a regexp parser context
4663 *
4664 * [10] Char ::= [^.\?*+()|#x5B#x5D]
4665 */
4666 static int
xmlFAIsChar(xmlRegParserCtxtPtr ctxt)4667 xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
4668 int cur;
4669 int len;
4670
4671 cur = CUR_SCHAR(ctxt->cur, len);
4672 if ((cur == '.') || (cur == '\\') || (cur == '?') ||
4673 (cur == '*') || (cur == '+') || (cur == '(') ||
4674 (cur == ')') || (cur == '|') || (cur == 0x5B) ||
4675 (cur == 0x5D) || (cur == 0))
4676 return(-1);
4677 return(cur);
4678 }
4679
4680 /**
4681 * xmlFAParseCharProp:
4682 * @ctxt: a regexp parser context
4683 *
4684 * [27] charProp ::= IsCategory | IsBlock
4685 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation |
4686 * Separators | Symbols | Others
4687 * [29] Letters ::= 'L' [ultmo]?
4688 * [30] Marks ::= 'M' [nce]?
4689 * [31] Numbers ::= 'N' [dlo]?
4690 * [32] Punctuation ::= 'P' [cdseifo]?
4691 * [33] Separators ::= 'Z' [slp]?
4692 * [34] Symbols ::= 'S' [mcko]?
4693 * [35] Others ::= 'C' [cfon]?
4694 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+
4695 */
4696 static void
xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt)4697 xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
4698 int cur;
4699 xmlRegAtomType type = (xmlRegAtomType) 0;
4700 xmlChar *blockName = NULL;
4701
4702 cur = CUR;
4703 if (cur == 'L') {
4704 NEXT;
4705 cur = CUR;
4706 if (cur == 'u') {
4707 NEXT;
4708 type = XML_REGEXP_LETTER_UPPERCASE;
4709 } else if (cur == 'l') {
4710 NEXT;
4711 type = XML_REGEXP_LETTER_LOWERCASE;
4712 } else if (cur == 't') {
4713 NEXT;
4714 type = XML_REGEXP_LETTER_TITLECASE;
4715 } else if (cur == 'm') {
4716 NEXT;
4717 type = XML_REGEXP_LETTER_MODIFIER;
4718 } else if (cur == 'o') {
4719 NEXT;
4720 type = XML_REGEXP_LETTER_OTHERS;
4721 } else {
4722 type = XML_REGEXP_LETTER;
4723 }
4724 } else if (cur == 'M') {
4725 NEXT;
4726 cur = CUR;
4727 if (cur == 'n') {
4728 NEXT;
4729 /* nonspacing */
4730 type = XML_REGEXP_MARK_NONSPACING;
4731 } else if (cur == 'c') {
4732 NEXT;
4733 /* spacing combining */
4734 type = XML_REGEXP_MARK_SPACECOMBINING;
4735 } else if (cur == 'e') {
4736 NEXT;
4737 /* enclosing */
4738 type = XML_REGEXP_MARK_ENCLOSING;
4739 } else {
4740 /* all marks */
4741 type = XML_REGEXP_MARK;
4742 }
4743 } else if (cur == 'N') {
4744 NEXT;
4745 cur = CUR;
4746 if (cur == 'd') {
4747 NEXT;
4748 /* digital */
4749 type = XML_REGEXP_NUMBER_DECIMAL;
4750 } else if (cur == 'l') {
4751 NEXT;
4752 /* letter */
4753 type = XML_REGEXP_NUMBER_LETTER;
4754 } else if (cur == 'o') {
4755 NEXT;
4756 /* other */
4757 type = XML_REGEXP_NUMBER_OTHERS;
4758 } else {
4759 /* all numbers */
4760 type = XML_REGEXP_NUMBER;
4761 }
4762 } else if (cur == 'P') {
4763 NEXT;
4764 cur = CUR;
4765 if (cur == 'c') {
4766 NEXT;
4767 /* connector */
4768 type = XML_REGEXP_PUNCT_CONNECTOR;
4769 } else if (cur == 'd') {
4770 NEXT;
4771 /* dash */
4772 type = XML_REGEXP_PUNCT_DASH;
4773 } else if (cur == 's') {
4774 NEXT;
4775 /* open */
4776 type = XML_REGEXP_PUNCT_OPEN;
4777 } else if (cur == 'e') {
4778 NEXT;
4779 /* close */
4780 type = XML_REGEXP_PUNCT_CLOSE;
4781 } else if (cur == 'i') {
4782 NEXT;
4783 /* initial quote */
4784 type = XML_REGEXP_PUNCT_INITQUOTE;
4785 } else if (cur == 'f') {
4786 NEXT;
4787 /* final quote */
4788 type = XML_REGEXP_PUNCT_FINQUOTE;
4789 } else if (cur == 'o') {
4790 NEXT;
4791 /* other */
4792 type = XML_REGEXP_PUNCT_OTHERS;
4793 } else {
4794 /* all punctuation */
4795 type = XML_REGEXP_PUNCT;
4796 }
4797 } else if (cur == 'Z') {
4798 NEXT;
4799 cur = CUR;
4800 if (cur == 's') {
4801 NEXT;
4802 /* space */
4803 type = XML_REGEXP_SEPAR_SPACE;
4804 } else if (cur == 'l') {
4805 NEXT;
4806 /* line */
4807 type = XML_REGEXP_SEPAR_LINE;
4808 } else if (cur == 'p') {
4809 NEXT;
4810 /* paragraph */
4811 type = XML_REGEXP_SEPAR_PARA;
4812 } else {
4813 /* all separators */
4814 type = XML_REGEXP_SEPAR;
4815 }
4816 } else if (cur == 'S') {
4817 NEXT;
4818 cur = CUR;
4819 if (cur == 'm') {
4820 NEXT;
4821 type = XML_REGEXP_SYMBOL_MATH;
4822 /* math */
4823 } else if (cur == 'c') {
4824 NEXT;
4825 type = XML_REGEXP_SYMBOL_CURRENCY;
4826 /* currency */
4827 } else if (cur == 'k') {
4828 NEXT;
4829 type = XML_REGEXP_SYMBOL_MODIFIER;
4830 /* modifiers */
4831 } else if (cur == 'o') {
4832 NEXT;
4833 type = XML_REGEXP_SYMBOL_OTHERS;
4834 /* other */
4835 } else {
4836 /* all symbols */
4837 type = XML_REGEXP_SYMBOL;
4838 }
4839 } else if (cur == 'C') {
4840 NEXT;
4841 cur = CUR;
4842 if (cur == 'c') {
4843 NEXT;
4844 /* control */
4845 type = XML_REGEXP_OTHER_CONTROL;
4846 } else if (cur == 'f') {
4847 NEXT;
4848 /* format */
4849 type = XML_REGEXP_OTHER_FORMAT;
4850 } else if (cur == 'o') {
4851 NEXT;
4852 /* private use */
4853 type = XML_REGEXP_OTHER_PRIVATE;
4854 } else if (cur == 'n') {
4855 NEXT;
4856 /* not assigned */
4857 type = XML_REGEXP_OTHER_NA;
4858 } else {
4859 /* all others */
4860 type = XML_REGEXP_OTHER;
4861 }
4862 } else if (cur == 'I') {
4863 const xmlChar *start;
4864 NEXT;
4865 cur = CUR;
4866 if (cur != 's') {
4867 ERROR("IsXXXX expected");
4868 return;
4869 }
4870 NEXT;
4871 start = ctxt->cur;
4872 cur = CUR;
4873 if (((cur >= 'a') && (cur <= 'z')) ||
4874 ((cur >= 'A') && (cur <= 'Z')) ||
4875 ((cur >= '0') && (cur <= '9')) ||
4876 (cur == 0x2D)) {
4877 NEXT;
4878 cur = CUR;
4879 while (((cur >= 'a') && (cur <= 'z')) ||
4880 ((cur >= 'A') && (cur <= 'Z')) ||
4881 ((cur >= '0') && (cur <= '9')) ||
4882 (cur == 0x2D)) {
4883 NEXT;
4884 cur = CUR;
4885 }
4886 }
4887 type = XML_REGEXP_BLOCK_NAME;
4888 blockName = xmlStrndup(start, ctxt->cur - start);
4889 } else {
4890 ERROR("Unknown char property");
4891 return;
4892 }
4893 if (ctxt->atom == NULL) {
4894 ctxt->atom = xmlRegNewAtom(ctxt, type);
4895 if (ctxt->atom != NULL)
4896 ctxt->atom->valuep = blockName;
4897 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4898 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4899 type, 0, 0, blockName);
4900 }
4901 }
4902
4903 /**
4904 * xmlFAParseCharClassEsc:
4905 * @ctxt: a regexp parser context
4906 *
4907 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
4908 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
4909 * [25] catEsc ::= '\p{' charProp '}'
4910 * [26] complEsc ::= '\P{' charProp '}'
4911 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
4912 */
4913 static void
xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt)4914 xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
4915 int cur;
4916
4917 if (CUR == '.') {
4918 if (ctxt->atom == NULL) {
4919 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
4920 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4921 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4922 XML_REGEXP_ANYCHAR, 0, 0, NULL);
4923 }
4924 NEXT;
4925 return;
4926 }
4927 if (CUR != '\\') {
4928 ERROR("Escaped sequence: expecting \\");
4929 return;
4930 }
4931 NEXT;
4932 cur = CUR;
4933 if (cur == 'p') {
4934 NEXT;
4935 if (CUR != '{') {
4936 ERROR("Expecting '{'");
4937 return;
4938 }
4939 NEXT;
4940 xmlFAParseCharProp(ctxt);
4941 if (CUR != '}') {
4942 ERROR("Expecting '}'");
4943 return;
4944 }
4945 NEXT;
4946 } else if (cur == 'P') {
4947 NEXT;
4948 if (CUR != '{') {
4949 ERROR("Expecting '{'");
4950 return;
4951 }
4952 NEXT;
4953 xmlFAParseCharProp(ctxt);
4954 if (ctxt->atom != NULL)
4955 ctxt->atom->neg = 1;
4956 if (CUR != '}') {
4957 ERROR("Expecting '}'");
4958 return;
4959 }
4960 NEXT;
4961 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
4962 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
4963 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
4964 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
4965 (cur == 0x5E)) {
4966 if (ctxt->atom == NULL) {
4967 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
4968 if (ctxt->atom != NULL) {
4969 switch (cur) {
4970 case 'n':
4971 ctxt->atom->codepoint = '\n';
4972 break;
4973 case 'r':
4974 ctxt->atom->codepoint = '\r';
4975 break;
4976 case 't':
4977 ctxt->atom->codepoint = '\t';
4978 break;
4979 default:
4980 ctxt->atom->codepoint = cur;
4981 }
4982 }
4983 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4984 switch (cur) {
4985 case 'n':
4986 cur = '\n';
4987 break;
4988 case 'r':
4989 cur = '\r';
4990 break;
4991 case 't':
4992 cur = '\t';
4993 break;
4994 }
4995 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4996 XML_REGEXP_CHARVAL, cur, cur, NULL);
4997 }
4998 NEXT;
4999 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
5000 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
5001 (cur == 'w') || (cur == 'W')) {
5002 xmlRegAtomType type = XML_REGEXP_ANYSPACE;
5003
5004 switch (cur) {
5005 case 's':
5006 type = XML_REGEXP_ANYSPACE;
5007 break;
5008 case 'S':
5009 type = XML_REGEXP_NOTSPACE;
5010 break;
5011 case 'i':
5012 type = XML_REGEXP_INITNAME;
5013 break;
5014 case 'I':
5015 type = XML_REGEXP_NOTINITNAME;
5016 break;
5017 case 'c':
5018 type = XML_REGEXP_NAMECHAR;
5019 break;
5020 case 'C':
5021 type = XML_REGEXP_NOTNAMECHAR;
5022 break;
5023 case 'd':
5024 type = XML_REGEXP_DECIMAL;
5025 break;
5026 case 'D':
5027 type = XML_REGEXP_NOTDECIMAL;
5028 break;
5029 case 'w':
5030 type = XML_REGEXP_REALCHAR;
5031 break;
5032 case 'W':
5033 type = XML_REGEXP_NOTREALCHAR;
5034 break;
5035 }
5036 NEXT;
5037 if (ctxt->atom == NULL) {
5038 ctxt->atom = xmlRegNewAtom(ctxt, type);
5039 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
5040 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5041 type, 0, 0, NULL);
5042 }
5043 } else {
5044 ERROR("Wrong escape sequence, misuse of character '\\'");
5045 }
5046 }
5047
5048 /**
5049 * xmlFAParseCharRange:
5050 * @ctxt: a regexp parser context
5051 *
5052 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash
5053 * [18] seRange ::= charOrEsc '-' charOrEsc
5054 * [20] charOrEsc ::= XmlChar | SingleCharEsc
5055 * [21] XmlChar ::= [^\#x2D#x5B#x5D]
5056 * [22] XmlCharIncDash ::= [^\#x5B#x5D]
5057 */
5058 static void
xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt)5059 xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
5060 int cur, len;
5061 int start = -1;
5062 int end = -1;
5063
5064 if (CUR == '\0') {
5065 ERROR("Expecting ']'");
5066 return;
5067 }
5068
5069 cur = CUR;
5070 if (cur == '\\') {
5071 NEXT;
5072 cur = CUR;
5073 switch (cur) {
5074 case 'n': start = 0xA; break;
5075 case 'r': start = 0xD; break;
5076 case 't': start = 0x9; break;
5077 case '\\': case '|': case '.': case '-': case '^': case '?':
5078 case '*': case '+': case '{': case '}': case '(': case ')':
5079 case '[': case ']':
5080 start = cur; break;
5081 default:
5082 ERROR("Invalid escape value");
5083 return;
5084 }
5085 end = start;
5086 len = 1;
5087 } else if ((cur != 0x5B) && (cur != 0x5D)) {
5088 end = start = CUR_SCHAR(ctxt->cur, len);
5089 } else {
5090 ERROR("Expecting a char range");
5091 return;
5092 }
5093 /*
5094 * Since we are "inside" a range, we can assume ctxt->cur is past
5095 * the start of ctxt->string, and PREV should be safe
5096 */
5097 if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) {
5098 NEXTL(len);
5099 return;
5100 }
5101 NEXTL(len);
5102 cur = CUR;
5103 if ((cur != '-') || (NXT(1) == ']')) {
5104 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5105 XML_REGEXP_CHARVAL, start, end, NULL);
5106 return;
5107 }
5108 NEXT;
5109 cur = CUR;
5110 if (cur == '\\') {
5111 NEXT;
5112 cur = CUR;
5113 switch (cur) {
5114 case 'n': end = 0xA; break;
5115 case 'r': end = 0xD; break;
5116 case 't': end = 0x9; break;
5117 case '\\': case '|': case '.': case '-': case '^': case '?':
5118 case '*': case '+': case '{': case '}': case '(': case ')':
5119 case '[': case ']':
5120 end = cur; break;
5121 default:
5122 ERROR("Invalid escape value");
5123 return;
5124 }
5125 len = 1;
5126 } else if ((cur != '\0') && (cur != 0x5B) && (cur != 0x5D)) {
5127 end = CUR_SCHAR(ctxt->cur, len);
5128 } else {
5129 ERROR("Expecting the end of a char range");
5130 return;
5131 }
5132
5133 /* TODO check that the values are acceptable character ranges for XML */
5134 if (end < start) {
5135 ERROR("End of range is before start of range");
5136 } else {
5137 NEXTL(len);
5138 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5139 XML_REGEXP_CHARVAL, start, end, NULL);
5140 }
5141 return;
5142 }
5143
5144 /**
5145 * xmlFAParsePosCharGroup:
5146 * @ctxt: a regexp parser context
5147 *
5148 * [14] posCharGroup ::= ( charRange | charClassEsc )+
5149 */
5150 static void
xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt)5151 xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
5152 do {
5153 if (CUR == '\\') {
5154 xmlFAParseCharClassEsc(ctxt);
5155 } else {
5156 xmlFAParseCharRange(ctxt);
5157 }
5158 } while ((CUR != ']') && (CUR != '-') &&
5159 (CUR != 0) && (ctxt->error == 0));
5160 }
5161
5162 /**
5163 * xmlFAParseCharGroup:
5164 * @ctxt: a regexp parser context
5165 *
5166 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub
5167 * [15] negCharGroup ::= '^' posCharGroup
5168 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
5169 * [12] charClassExpr ::= '[' charGroup ']'
5170 */
5171 static void
xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt)5172 xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
5173 int neg = ctxt->neg;
5174
5175 if (CUR == '^') {
5176 NEXT;
5177 ctxt->neg = !ctxt->neg;
5178 xmlFAParsePosCharGroup(ctxt);
5179 ctxt->neg = neg;
5180 }
5181 while ((CUR != ']') && (ctxt->error == 0)) {
5182 if ((CUR == '-') && (NXT(1) == '[')) {
5183 NEXT; /* eat the '-' */
5184 NEXT; /* eat the '[' */
5185 ctxt->neg = 2;
5186 xmlFAParseCharGroup(ctxt);
5187 ctxt->neg = neg;
5188 if (CUR == ']') {
5189 NEXT;
5190 } else {
5191 ERROR("charClassExpr: ']' expected");
5192 }
5193 break;
5194 } else {
5195 xmlFAParsePosCharGroup(ctxt);
5196 }
5197 }
5198 }
5199
5200 /**
5201 * xmlFAParseCharClass:
5202 * @ctxt: a regexp parser context
5203 *
5204 * [11] charClass ::= charClassEsc | charClassExpr
5205 * [12] charClassExpr ::= '[' charGroup ']'
5206 */
5207 static void
xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt)5208 xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
5209 if (CUR == '[') {
5210 NEXT;
5211 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
5212 if (ctxt->atom == NULL)
5213 return;
5214 xmlFAParseCharGroup(ctxt);
5215 if (CUR == ']') {
5216 NEXT;
5217 } else {
5218 ERROR("xmlFAParseCharClass: ']' expected");
5219 }
5220 } else {
5221 xmlFAParseCharClassEsc(ctxt);
5222 }
5223 }
5224
5225 /**
5226 * xmlFAParseQuantExact:
5227 * @ctxt: a regexp parser context
5228 *
5229 * [8] QuantExact ::= [0-9]+
5230 *
5231 * Returns 0 if success or -1 in case of error
5232 */
5233 static int
xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt)5234 xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
5235 int ret = 0;
5236 int ok = 0;
5237 int overflow = 0;
5238
5239 while ((CUR >= '0') && (CUR <= '9')) {
5240 if (ret > INT_MAX / 10) {
5241 overflow = 1;
5242 } else {
5243 int digit = CUR - '0';
5244
5245 ret *= 10;
5246 if (ret > INT_MAX - digit)
5247 overflow = 1;
5248 else
5249 ret += digit;
5250 }
5251 ok = 1;
5252 NEXT;
5253 }
5254 if ((ok != 1) || (overflow == 1)) {
5255 return(-1);
5256 }
5257 return(ret);
5258 }
5259
5260 /**
5261 * xmlFAParseQuantifier:
5262 * @ctxt: a regexp parser context
5263 *
5264 * [4] quantifier ::= [?*+] | ( '{' quantity '}' )
5265 * [5] quantity ::= quantRange | quantMin | QuantExact
5266 * [6] quantRange ::= QuantExact ',' QuantExact
5267 * [7] quantMin ::= QuantExact ','
5268 * [8] QuantExact ::= [0-9]+
5269 */
5270 static int
xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt)5271 xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
5272 int cur;
5273
5274 cur = CUR;
5275 if ((cur == '?') || (cur == '*') || (cur == '+')) {
5276 if (ctxt->atom != NULL) {
5277 if (cur == '?')
5278 ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
5279 else if (cur == '*')
5280 ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
5281 else if (cur == '+')
5282 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
5283 }
5284 NEXT;
5285 return(1);
5286 }
5287 if (cur == '{') {
5288 int min = 0, max = 0;
5289
5290 NEXT;
5291 cur = xmlFAParseQuantExact(ctxt);
5292 if (cur >= 0)
5293 min = cur;
5294 else {
5295 ERROR("Improper quantifier");
5296 }
5297 if (CUR == ',') {
5298 NEXT;
5299 if (CUR == '}')
5300 max = INT_MAX;
5301 else {
5302 cur = xmlFAParseQuantExact(ctxt);
5303 if (cur >= 0)
5304 max = cur;
5305 else {
5306 ERROR("Improper quantifier");
5307 }
5308 }
5309 }
5310 if (CUR == '}') {
5311 NEXT;
5312 } else {
5313 ERROR("Unterminated quantifier");
5314 }
5315 if (max == 0)
5316 max = min;
5317 if (ctxt->atom != NULL) {
5318 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
5319 ctxt->atom->min = min;
5320 ctxt->atom->max = max;
5321 }
5322 return(1);
5323 }
5324 return(0);
5325 }
5326
5327 /**
5328 * xmlFAParseAtom:
5329 * @ctxt: a regexp parser context
5330 *
5331 * [9] atom ::= Char | charClass | ( '(' regExp ')' )
5332 */
5333 static int
xmlFAParseAtom(xmlRegParserCtxtPtr ctxt)5334 xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
5335 int codepoint, len;
5336
5337 codepoint = xmlFAIsChar(ctxt);
5338 if (codepoint > 0) {
5339 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
5340 if (ctxt->atom == NULL)
5341 return(-1);
5342 codepoint = CUR_SCHAR(ctxt->cur, len);
5343 ctxt->atom->codepoint = codepoint;
5344 NEXTL(len);
5345 return(1);
5346 } else if (CUR == '|') {
5347 return(0);
5348 } else if (CUR == 0) {
5349 return(0);
5350 } else if (CUR == ')') {
5351 return(0);
5352 } else if (CUR == '(') {
5353 xmlRegStatePtr start, oldend, start0;
5354
5355 NEXT;
5356 if (ctxt->depth >= 50) {
5357 ERROR("xmlFAParseAtom: maximum nesting depth exceeded");
5358 return(-1);
5359 }
5360 /*
5361 * this extra Epsilon transition is needed if we count with 0 allowed
5362 * unfortunately this can't be known at that point
5363 */
5364 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5365 start0 = ctxt->state;
5366 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5367 start = ctxt->state;
5368 oldend = ctxt->end;
5369 ctxt->end = NULL;
5370 ctxt->atom = NULL;
5371 ctxt->depth++;
5372 xmlFAParseRegExp(ctxt, 0);
5373 ctxt->depth--;
5374 if (CUR == ')') {
5375 NEXT;
5376 } else {
5377 ERROR("xmlFAParseAtom: expecting ')'");
5378 }
5379 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
5380 if (ctxt->atom == NULL)
5381 return(-1);
5382 ctxt->atom->start = start;
5383 ctxt->atom->start0 = start0;
5384 ctxt->atom->stop = ctxt->state;
5385 ctxt->end = oldend;
5386 return(1);
5387 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
5388 xmlFAParseCharClass(ctxt);
5389 return(1);
5390 }
5391 return(0);
5392 }
5393
5394 /**
5395 * xmlFAParsePiece:
5396 * @ctxt: a regexp parser context
5397 *
5398 * [3] piece ::= atom quantifier?
5399 */
5400 static int
xmlFAParsePiece(xmlRegParserCtxtPtr ctxt)5401 xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
5402 int ret;
5403
5404 ctxt->atom = NULL;
5405 ret = xmlFAParseAtom(ctxt);
5406 if (ret == 0)
5407 return(0);
5408 if (ctxt->atom == NULL) {
5409 ERROR("internal: no atom generated");
5410 }
5411 xmlFAParseQuantifier(ctxt);
5412 return(1);
5413 }
5414
5415 /**
5416 * xmlFAParseBranch:
5417 * @ctxt: a regexp parser context
5418 * @to: optional target to the end of the branch
5419 *
5420 * @to is used to optimize by removing duplicate path in automata
5421 * in expressions like (a|b)(c|d)
5422 *
5423 * [2] branch ::= piece*
5424 */
5425 static int
xmlFAParseBranch(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr to)5426 xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) {
5427 xmlRegStatePtr previous;
5428 int ret;
5429
5430 previous = ctxt->state;
5431 ret = xmlFAParsePiece(ctxt);
5432 if (ret == 0) {
5433 /* Empty branch */
5434 xmlFAGenerateEpsilonTransition(ctxt, previous, to);
5435 } else {
5436 if (xmlFAGenerateTransitions(ctxt, previous,
5437 (CUR=='|' || CUR==')' || CUR==0) ? to : NULL, ctxt->atom) < 0)
5438 return(-1);
5439 previous = ctxt->state;
5440 ctxt->atom = NULL;
5441 }
5442 while ((ret != 0) && (ctxt->error == 0)) {
5443 ret = xmlFAParsePiece(ctxt);
5444 if (ret != 0) {
5445 if (xmlFAGenerateTransitions(ctxt, previous,
5446 (CUR=='|' || CUR==')' || CUR==0) ? to : NULL,
5447 ctxt->atom) < 0)
5448 return(-1);
5449 previous = ctxt->state;
5450 ctxt->atom = NULL;
5451 }
5452 }
5453 return(0);
5454 }
5455
5456 /**
5457 * xmlFAParseRegExp:
5458 * @ctxt: a regexp parser context
5459 * @top: is this the top-level expression ?
5460 *
5461 * [1] regExp ::= branch ( '|' branch )*
5462 */
5463 static void
xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt,int top)5464 xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
5465 xmlRegStatePtr start, end;
5466
5467 /* if not top start should have been generated by an epsilon trans */
5468 start = ctxt->state;
5469 ctxt->end = NULL;
5470 xmlFAParseBranch(ctxt, NULL);
5471 if (top) {
5472 #ifdef DEBUG_REGEXP_GRAPH
5473 printf("State %d is final\n", ctxt->state->no);
5474 #endif
5475 ctxt->state->type = XML_REGEXP_FINAL_STATE;
5476 }
5477 if (CUR != '|') {
5478 ctxt->end = ctxt->state;
5479 return;
5480 }
5481 end = ctxt->state;
5482 while ((CUR == '|') && (ctxt->error == 0)) {
5483 NEXT;
5484 ctxt->state = start;
5485 ctxt->end = NULL;
5486 xmlFAParseBranch(ctxt, end);
5487 }
5488 if (!top) {
5489 ctxt->state = end;
5490 ctxt->end = end;
5491 }
5492 }
5493
5494 /************************************************************************
5495 * *
5496 * The basic API *
5497 * *
5498 ************************************************************************/
5499
5500 /**
5501 * xmlRegexpPrint:
5502 * @output: the file for the output debug
5503 * @regexp: the compiled regexp
5504 *
5505 * Print the content of the compiled regular expression
5506 */
5507 void
xmlRegexpPrint(FILE * output,xmlRegexpPtr regexp)5508 xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
5509 int i;
5510
5511 if (output == NULL)
5512 return;
5513 fprintf(output, " regexp: ");
5514 if (regexp == NULL) {
5515 fprintf(output, "NULL\n");
5516 return;
5517 }
5518 fprintf(output, "'%s' ", regexp->string);
5519 fprintf(output, "\n");
5520 fprintf(output, "%d atoms:\n", regexp->nbAtoms);
5521 for (i = 0;i < regexp->nbAtoms; i++) {
5522 fprintf(output, " %02d ", i);
5523 xmlRegPrintAtom(output, regexp->atoms[i]);
5524 }
5525 fprintf(output, "%d states:", regexp->nbStates);
5526 fprintf(output, "\n");
5527 for (i = 0;i < regexp->nbStates; i++) {
5528 xmlRegPrintState(output, regexp->states[i]);
5529 }
5530 fprintf(output, "%d counters:\n", regexp->nbCounters);
5531 for (i = 0;i < regexp->nbCounters; i++) {
5532 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
5533 regexp->counters[i].max);
5534 }
5535 }
5536
5537 /**
5538 * xmlRegexpCompile:
5539 * @regexp: a regular expression string
5540 *
5541 * Parses a regular expression conforming to XML Schemas Part 2 Datatype
5542 * Appendix F and builds an automata suitable for testing strings against
5543 * that regular expression
5544 *
5545 * Returns the compiled expression or NULL in case of error
5546 */
5547 xmlRegexpPtr
xmlRegexpCompile(const xmlChar * regexp)5548 xmlRegexpCompile(const xmlChar *regexp) {
5549 xmlRegexpPtr ret;
5550 xmlRegParserCtxtPtr ctxt;
5551
5552 ctxt = xmlRegNewParserCtxt(regexp);
5553 if (ctxt == NULL)
5554 return(NULL);
5555
5556 /* initialize the parser */
5557 ctxt->end = NULL;
5558 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
5559 xmlRegStatePush(ctxt, ctxt->start);
5560
5561 /* parse the expression building an automata */
5562 xmlFAParseRegExp(ctxt, 1);
5563 if (CUR != 0) {
5564 ERROR("xmlFAParseRegExp: extra characters");
5565 }
5566 if (ctxt->error != 0) {
5567 xmlRegFreeParserCtxt(ctxt);
5568 return(NULL);
5569 }
5570 ctxt->end = ctxt->state;
5571 ctxt->start->type = XML_REGEXP_START_STATE;
5572 ctxt->end->type = XML_REGEXP_FINAL_STATE;
5573
5574 /* remove the Epsilon except for counted transitions */
5575 xmlFAEliminateEpsilonTransitions(ctxt);
5576
5577
5578 if (ctxt->error != 0) {
5579 xmlRegFreeParserCtxt(ctxt);
5580 return(NULL);
5581 }
5582 ret = xmlRegEpxFromParse(ctxt);
5583 xmlRegFreeParserCtxt(ctxt);
5584 return(ret);
5585 }
5586
5587 /**
5588 * xmlRegexpExec:
5589 * @comp: the compiled regular expression
5590 * @content: the value to check against the regular expression
5591 *
5592 * Check if the regular expression generates the value
5593 *
5594 * Returns 1 if it matches, 0 if not and a negative value in case of error
5595 */
5596 int
xmlRegexpExec(xmlRegexpPtr comp,const xmlChar * content)5597 xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
5598 if ((comp == NULL) || (content == NULL))
5599 return(-1);
5600 return(xmlFARegExec(comp, content));
5601 }
5602
5603 /**
5604 * xmlRegexpIsDeterminist:
5605 * @comp: the compiled regular expression
5606 *
5607 * Check if the regular expression is determinist
5608 *
5609 * Returns 1 if it yes, 0 if not and a negative value in case of error
5610 */
5611 int
xmlRegexpIsDeterminist(xmlRegexpPtr comp)5612 xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
5613 xmlAutomataPtr am;
5614 int ret;
5615
5616 if (comp == NULL)
5617 return(-1);
5618 if (comp->determinist != -1)
5619 return(comp->determinist);
5620
5621 am = xmlNewAutomata();
5622 if (am == NULL)
5623 return(-1);
5624 if (am->states != NULL) {
5625 int i;
5626
5627 for (i = 0;i < am->nbStates;i++)
5628 xmlRegFreeState(am->states[i]);
5629 xmlFree(am->states);
5630 }
5631 am->nbAtoms = comp->nbAtoms;
5632 am->atoms = comp->atoms;
5633 am->nbStates = comp->nbStates;
5634 am->states = comp->states;
5635 am->determinist = -1;
5636 am->flags = comp->flags;
5637 ret = xmlFAComputesDeterminism(am);
5638 am->atoms = NULL;
5639 am->states = NULL;
5640 xmlFreeAutomata(am);
5641 comp->determinist = ret;
5642 return(ret);
5643 }
5644
5645 /**
5646 * xmlRegFreeRegexp:
5647 * @regexp: the regexp
5648 *
5649 * Free a regexp
5650 */
5651 void
xmlRegFreeRegexp(xmlRegexpPtr regexp)5652 xmlRegFreeRegexp(xmlRegexpPtr regexp) {
5653 int i;
5654 if (regexp == NULL)
5655 return;
5656
5657 if (regexp->string != NULL)
5658 xmlFree(regexp->string);
5659 if (regexp->states != NULL) {
5660 for (i = 0;i < regexp->nbStates;i++)
5661 xmlRegFreeState(regexp->states[i]);
5662 xmlFree(regexp->states);
5663 }
5664 if (regexp->atoms != NULL) {
5665 for (i = 0;i < regexp->nbAtoms;i++)
5666 xmlRegFreeAtom(regexp->atoms[i]);
5667 xmlFree(regexp->atoms);
5668 }
5669 if (regexp->counters != NULL)
5670 xmlFree(regexp->counters);
5671 if (regexp->compact != NULL)
5672 xmlFree(regexp->compact);
5673 if (regexp->transdata != NULL)
5674 xmlFree(regexp->transdata);
5675 if (regexp->stringMap != NULL) {
5676 for (i = 0; i < regexp->nbstrings;i++)
5677 xmlFree(regexp->stringMap[i]);
5678 xmlFree(regexp->stringMap);
5679 }
5680
5681 xmlFree(regexp);
5682 }
5683
5684 #ifdef LIBXML_AUTOMATA_ENABLED
5685 /************************************************************************
5686 * *
5687 * The Automata interface *
5688 * *
5689 ************************************************************************/
5690
5691 /**
5692 * xmlNewAutomata:
5693 *
5694 * Create a new automata
5695 *
5696 * Returns the new object or NULL in case of failure
5697 */
5698 xmlAutomataPtr
xmlNewAutomata(void)5699 xmlNewAutomata(void) {
5700 xmlAutomataPtr ctxt;
5701
5702 ctxt = xmlRegNewParserCtxt(NULL);
5703 if (ctxt == NULL)
5704 return(NULL);
5705
5706 /* initialize the parser */
5707 ctxt->end = NULL;
5708 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
5709 if (ctxt->start == NULL) {
5710 xmlFreeAutomata(ctxt);
5711 return(NULL);
5712 }
5713 ctxt->start->type = XML_REGEXP_START_STATE;
5714 if (xmlRegStatePush(ctxt, ctxt->start) < 0) {
5715 xmlRegFreeState(ctxt->start);
5716 xmlFreeAutomata(ctxt);
5717 return(NULL);
5718 }
5719 ctxt->flags = 0;
5720
5721 return(ctxt);
5722 }
5723
5724 /**
5725 * xmlFreeAutomata:
5726 * @am: an automata
5727 *
5728 * Free an automata
5729 */
5730 void
xmlFreeAutomata(xmlAutomataPtr am)5731 xmlFreeAutomata(xmlAutomataPtr am) {
5732 if (am == NULL)
5733 return;
5734 xmlRegFreeParserCtxt(am);
5735 }
5736
5737 /**
5738 * xmlAutomataSetFlags:
5739 * @am: an automata
5740 * @flags: a set of internal flags
5741 *
5742 * Set some flags on the automata
5743 */
5744 void
xmlAutomataSetFlags(xmlAutomataPtr am,int flags)5745 xmlAutomataSetFlags(xmlAutomataPtr am, int flags) {
5746 if (am == NULL)
5747 return;
5748 am->flags |= flags;
5749 }
5750
5751 /**
5752 * xmlAutomataGetInitState:
5753 * @am: an automata
5754 *
5755 * Initial state lookup
5756 *
5757 * Returns the initial state of the automata
5758 */
5759 xmlAutomataStatePtr
xmlAutomataGetInitState(xmlAutomataPtr am)5760 xmlAutomataGetInitState(xmlAutomataPtr am) {
5761 if (am == NULL)
5762 return(NULL);
5763 return(am->start);
5764 }
5765
5766 /**
5767 * xmlAutomataSetFinalState:
5768 * @am: an automata
5769 * @state: a state in this automata
5770 *
5771 * Makes that state a final state
5772 *
5773 * Returns 0 or -1 in case of error
5774 */
5775 int
xmlAutomataSetFinalState(xmlAutomataPtr am,xmlAutomataStatePtr state)5776 xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
5777 if ((am == NULL) || (state == NULL))
5778 return(-1);
5779 state->type = XML_REGEXP_FINAL_STATE;
5780 return(0);
5781 }
5782
5783 /**
5784 * xmlAutomataNewTransition:
5785 * @am: an automata
5786 * @from: the starting point of the transition
5787 * @to: the target point of the transition or NULL
5788 * @token: the input string associated to that transition
5789 * @data: data passed to the callback function if the transition is activated
5790 *
5791 * If @to is NULL, this creates first a new target state in the automata
5792 * and then adds a transition from the @from state to the target state
5793 * activated by the value of @token
5794 *
5795 * Returns the target state or NULL in case of error
5796 */
5797 xmlAutomataStatePtr
xmlAutomataNewTransition(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,void * data)5798 xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
5799 xmlAutomataStatePtr to, const xmlChar *token,
5800 void *data) {
5801 xmlRegAtomPtr atom;
5802
5803 if ((am == NULL) || (from == NULL) || (token == NULL))
5804 return(NULL);
5805 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5806 if (atom == NULL)
5807 return(NULL);
5808 atom->data = data;
5809 atom->valuep = xmlStrdup(token);
5810
5811 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5812 xmlRegFreeAtom(atom);
5813 return(NULL);
5814 }
5815 if (to == NULL)
5816 return(am->state);
5817 return(to);
5818 }
5819
5820 /**
5821 * xmlAutomataNewTransition2:
5822 * @am: an automata
5823 * @from: the starting point of the transition
5824 * @to: the target point of the transition or NULL
5825 * @token: the first input string associated to that transition
5826 * @token2: the second input string associated to that transition
5827 * @data: data passed to the callback function if the transition is activated
5828 *
5829 * If @to is NULL, this creates first a new target state in the automata
5830 * and then adds a transition from the @from state to the target state
5831 * activated by the value of @token
5832 *
5833 * Returns the target state or NULL in case of error
5834 */
5835 xmlAutomataStatePtr
xmlAutomataNewTransition2(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,void * data)5836 xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5837 xmlAutomataStatePtr to, const xmlChar *token,
5838 const xmlChar *token2, void *data) {
5839 xmlRegAtomPtr atom;
5840
5841 if ((am == NULL) || (from == NULL) || (token == NULL))
5842 return(NULL);
5843 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5844 if (atom == NULL)
5845 return(NULL);
5846 atom->data = data;
5847 if ((token2 == NULL) || (*token2 == 0)) {
5848 atom->valuep = xmlStrdup(token);
5849 } else {
5850 int lenn, lenp;
5851 xmlChar *str;
5852
5853 lenn = strlen((char *) token2);
5854 lenp = strlen((char *) token);
5855
5856 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5857 if (str == NULL) {
5858 xmlRegFreeAtom(atom);
5859 return(NULL);
5860 }
5861 memcpy(&str[0], token, lenp);
5862 str[lenp] = '|';
5863 memcpy(&str[lenp + 1], token2, lenn);
5864 str[lenn + lenp + 1] = 0;
5865
5866 atom->valuep = str;
5867 }
5868
5869 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5870 xmlRegFreeAtom(atom);
5871 return(NULL);
5872 }
5873 if (to == NULL)
5874 return(am->state);
5875 return(to);
5876 }
5877
5878 /**
5879 * xmlAutomataNewNegTrans:
5880 * @am: an automata
5881 * @from: the starting point of the transition
5882 * @to: the target point of the transition or NULL
5883 * @token: the first input string associated to that transition
5884 * @token2: the second input string associated to that transition
5885 * @data: data passed to the callback function if the transition is activated
5886 *
5887 * If @to is NULL, this creates first a new target state in the automata
5888 * and then adds a transition from the @from state to the target state
5889 * activated by any value except (@token,@token2)
5890 * Note that if @token2 is not NULL, then (X, NULL) won't match to follow
5891 # the semantic of XSD ##other
5892 *
5893 * Returns the target state or NULL in case of error
5894 */
5895 xmlAutomataStatePtr
xmlAutomataNewNegTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,void * data)5896 xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5897 xmlAutomataStatePtr to, const xmlChar *token,
5898 const xmlChar *token2, void *data) {
5899 xmlRegAtomPtr atom;
5900 xmlChar err_msg[200];
5901
5902 if ((am == NULL) || (from == NULL) || (token == NULL))
5903 return(NULL);
5904 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5905 if (atom == NULL)
5906 return(NULL);
5907 atom->data = data;
5908 atom->neg = 1;
5909 if ((token2 == NULL) || (*token2 == 0)) {
5910 atom->valuep = xmlStrdup(token);
5911 } else {
5912 int lenn, lenp;
5913 xmlChar *str;
5914
5915 lenn = strlen((char *) token2);
5916 lenp = strlen((char *) token);
5917
5918 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5919 if (str == NULL) {
5920 xmlRegFreeAtom(atom);
5921 return(NULL);
5922 }
5923 memcpy(&str[0], token, lenp);
5924 str[lenp] = '|';
5925 memcpy(&str[lenp + 1], token2, lenn);
5926 str[lenn + lenp + 1] = 0;
5927
5928 atom->valuep = str;
5929 }
5930 snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep);
5931 err_msg[199] = 0;
5932 atom->valuep2 = xmlStrdup(err_msg);
5933
5934 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5935 xmlRegFreeAtom(atom);
5936 return(NULL);
5937 }
5938 am->negs++;
5939 if (to == NULL)
5940 return(am->state);
5941 return(to);
5942 }
5943
5944 /**
5945 * xmlAutomataNewCountTrans2:
5946 * @am: an automata
5947 * @from: the starting point of the transition
5948 * @to: the target point of the transition or NULL
5949 * @token: the input string associated to that transition
5950 * @token2: the second input string associated to that transition
5951 * @min: the minimum successive occurrences of token
5952 * @max: the maximum successive occurrences of token
5953 * @data: data associated to the transition
5954 *
5955 * If @to is NULL, this creates first a new target state in the automata
5956 * and then adds a transition from the @from state to the target state
5957 * activated by a succession of input of value @token and @token2 and
5958 * whose number is between @min and @max
5959 *
5960 * Returns the target state or NULL in case of error
5961 */
5962 xmlAutomataStatePtr
xmlAutomataNewCountTrans2(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,int min,int max,void * data)5963 xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5964 xmlAutomataStatePtr to, const xmlChar *token,
5965 const xmlChar *token2,
5966 int min, int max, void *data) {
5967 xmlRegAtomPtr atom;
5968 int counter;
5969
5970 if ((am == NULL) || (from == NULL) || (token == NULL))
5971 return(NULL);
5972 if (min < 0)
5973 return(NULL);
5974 if ((max < min) || (max < 1))
5975 return(NULL);
5976 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5977 if (atom == NULL)
5978 return(NULL);
5979 if ((token2 == NULL) || (*token2 == 0)) {
5980 atom->valuep = xmlStrdup(token);
5981 } else {
5982 int lenn, lenp;
5983 xmlChar *str;
5984
5985 lenn = strlen((char *) token2);
5986 lenp = strlen((char *) token);
5987
5988 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5989 if (str == NULL) {
5990 xmlRegFreeAtom(atom);
5991 return(NULL);
5992 }
5993 memcpy(&str[0], token, lenp);
5994 str[lenp] = '|';
5995 memcpy(&str[lenp + 1], token2, lenn);
5996 str[lenn + lenp + 1] = 0;
5997
5998 atom->valuep = str;
5999 }
6000 atom->data = data;
6001 if (min == 0)
6002 atom->min = 1;
6003 else
6004 atom->min = min;
6005 atom->max = max;
6006
6007 /*
6008 * associate a counter to the transition.
6009 */
6010 counter = xmlRegGetCounter(am);
6011 am->counters[counter].min = min;
6012 am->counters[counter].max = max;
6013
6014 /* xmlFAGenerateTransitions(am, from, to, atom); */
6015 if (to == NULL) {
6016 to = xmlRegNewState(am);
6017 xmlRegStatePush(am, to);
6018 }
6019 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6020 xmlRegAtomPush(am, atom);
6021 am->state = to;
6022
6023 if (to == NULL)
6024 to = am->state;
6025 if (to == NULL)
6026 return(NULL);
6027 if (min == 0)
6028 xmlFAGenerateEpsilonTransition(am, from, to);
6029 return(to);
6030 }
6031
6032 /**
6033 * xmlAutomataNewCountTrans:
6034 * @am: an automata
6035 * @from: the starting point of the transition
6036 * @to: the target point of the transition or NULL
6037 * @token: the input string associated to that transition
6038 * @min: the minimum successive occurrences of token
6039 * @max: the maximum successive occurrences of token
6040 * @data: data associated to the transition
6041 *
6042 * If @to is NULL, this creates first a new target state in the automata
6043 * and then adds a transition from the @from state to the target state
6044 * activated by a succession of input of value @token and whose number
6045 * is between @min and @max
6046 *
6047 * Returns the target state or NULL in case of error
6048 */
6049 xmlAutomataStatePtr
xmlAutomataNewCountTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,int min,int max,void * data)6050 xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6051 xmlAutomataStatePtr to, const xmlChar *token,
6052 int min, int max, void *data) {
6053 xmlRegAtomPtr atom;
6054 int counter;
6055
6056 if ((am == NULL) || (from == NULL) || (token == NULL))
6057 return(NULL);
6058 if (min < 0)
6059 return(NULL);
6060 if ((max < min) || (max < 1))
6061 return(NULL);
6062 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6063 if (atom == NULL)
6064 return(NULL);
6065 atom->valuep = xmlStrdup(token);
6066 atom->data = data;
6067 if (min == 0)
6068 atom->min = 1;
6069 else
6070 atom->min = min;
6071 atom->max = max;
6072
6073 /*
6074 * associate a counter to the transition.
6075 */
6076 counter = xmlRegGetCounter(am);
6077 am->counters[counter].min = min;
6078 am->counters[counter].max = max;
6079
6080 /* xmlFAGenerateTransitions(am, from, to, atom); */
6081 if (to == NULL) {
6082 to = xmlRegNewState(am);
6083 xmlRegStatePush(am, to);
6084 }
6085 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6086 xmlRegAtomPush(am, atom);
6087 am->state = to;
6088
6089 if (to == NULL)
6090 to = am->state;
6091 if (to == NULL)
6092 return(NULL);
6093 if (min == 0)
6094 xmlFAGenerateEpsilonTransition(am, from, to);
6095 return(to);
6096 }
6097
6098 /**
6099 * xmlAutomataNewOnceTrans2:
6100 * @am: an automata
6101 * @from: the starting point of the transition
6102 * @to: the target point of the transition or NULL
6103 * @token: the input string associated to that transition
6104 * @token2: the second input string associated to that transition
6105 * @min: the minimum successive occurrences of token
6106 * @max: the maximum successive occurrences of token
6107 * @data: data associated to the transition
6108 *
6109 * If @to is NULL, this creates first a new target state in the automata
6110 * and then adds a transition from the @from state to the target state
6111 * activated by a succession of input of value @token and @token2 and whose
6112 * number is between @min and @max, moreover that transition can only be
6113 * crossed once.
6114 *
6115 * Returns the target state or NULL in case of error
6116 */
6117 xmlAutomataStatePtr
xmlAutomataNewOnceTrans2(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,int min,int max,void * data)6118 xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
6119 xmlAutomataStatePtr to, const xmlChar *token,
6120 const xmlChar *token2,
6121 int min, int max, void *data) {
6122 xmlRegAtomPtr atom;
6123 int counter;
6124
6125 if ((am == NULL) || (from == NULL) || (token == NULL))
6126 return(NULL);
6127 if (min < 1)
6128 return(NULL);
6129 if (max < min)
6130 return(NULL);
6131 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6132 if (atom == NULL)
6133 return(NULL);
6134 if ((token2 == NULL) || (*token2 == 0)) {
6135 atom->valuep = xmlStrdup(token);
6136 } else {
6137 int lenn, lenp;
6138 xmlChar *str;
6139
6140 lenn = strlen((char *) token2);
6141 lenp = strlen((char *) token);
6142
6143 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
6144 if (str == NULL) {
6145 xmlRegFreeAtom(atom);
6146 return(NULL);
6147 }
6148 memcpy(&str[0], token, lenp);
6149 str[lenp] = '|';
6150 memcpy(&str[lenp + 1], token2, lenn);
6151 str[lenn + lenp + 1] = 0;
6152
6153 atom->valuep = str;
6154 }
6155 atom->data = data;
6156 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
6157 atom->min = min;
6158 atom->max = max;
6159 /*
6160 * associate a counter to the transition.
6161 */
6162 counter = xmlRegGetCounter(am);
6163 am->counters[counter].min = 1;
6164 am->counters[counter].max = 1;
6165
6166 /* xmlFAGenerateTransitions(am, from, to, atom); */
6167 if (to == NULL) {
6168 to = xmlRegNewState(am);
6169 xmlRegStatePush(am, to);
6170 }
6171 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6172 xmlRegAtomPush(am, atom);
6173 am->state = to;
6174 return(to);
6175 }
6176
6177
6178
6179 /**
6180 * xmlAutomataNewOnceTrans:
6181 * @am: an automata
6182 * @from: the starting point of the transition
6183 * @to: the target point of the transition or NULL
6184 * @token: the input string associated to that transition
6185 * @min: the minimum successive occurrences of token
6186 * @max: the maximum successive occurrences of token
6187 * @data: data associated to the transition
6188 *
6189 * If @to is NULL, this creates first a new target state in the automata
6190 * and then adds a transition from the @from state to the target state
6191 * activated by a succession of input of value @token and whose number
6192 * is between @min and @max, moreover that transition can only be crossed
6193 * once.
6194 *
6195 * Returns the target state or NULL in case of error
6196 */
6197 xmlAutomataStatePtr
xmlAutomataNewOnceTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,int min,int max,void * data)6198 xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6199 xmlAutomataStatePtr to, const xmlChar *token,
6200 int min, int max, void *data) {
6201 xmlRegAtomPtr atom;
6202 int counter;
6203
6204 if ((am == NULL) || (from == NULL) || (token == NULL))
6205 return(NULL);
6206 if (min < 1)
6207 return(NULL);
6208 if (max < min)
6209 return(NULL);
6210 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6211 if (atom == NULL)
6212 return(NULL);
6213 atom->valuep = xmlStrdup(token);
6214 atom->data = data;
6215 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
6216 atom->min = min;
6217 atom->max = max;
6218 /*
6219 * associate a counter to the transition.
6220 */
6221 counter = xmlRegGetCounter(am);
6222 am->counters[counter].min = 1;
6223 am->counters[counter].max = 1;
6224
6225 /* xmlFAGenerateTransitions(am, from, to, atom); */
6226 if (to == NULL) {
6227 to = xmlRegNewState(am);
6228 xmlRegStatePush(am, to);
6229 }
6230 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6231 xmlRegAtomPush(am, atom);
6232 am->state = to;
6233 return(to);
6234 }
6235
6236 /**
6237 * xmlAutomataNewState:
6238 * @am: an automata
6239 *
6240 * Create a new disconnected state in the automata
6241 *
6242 * Returns the new state or NULL in case of error
6243 */
6244 xmlAutomataStatePtr
xmlAutomataNewState(xmlAutomataPtr am)6245 xmlAutomataNewState(xmlAutomataPtr am) {
6246 xmlAutomataStatePtr to;
6247
6248 if (am == NULL)
6249 return(NULL);
6250 to = xmlRegNewState(am);
6251 xmlRegStatePush(am, to);
6252 return(to);
6253 }
6254
6255 /**
6256 * xmlAutomataNewEpsilon:
6257 * @am: an automata
6258 * @from: the starting point of the transition
6259 * @to: the target point of the transition or NULL
6260 *
6261 * If @to is NULL, this creates first a new target state in the automata
6262 * and then adds an epsilon transition from the @from state to the
6263 * target state
6264 *
6265 * Returns the target state or NULL in case of error
6266 */
6267 xmlAutomataStatePtr
xmlAutomataNewEpsilon(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to)6268 xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from,
6269 xmlAutomataStatePtr to) {
6270 if ((am == NULL) || (from == NULL))
6271 return(NULL);
6272 xmlFAGenerateEpsilonTransition(am, from, to);
6273 if (to == NULL)
6274 return(am->state);
6275 return(to);
6276 }
6277
6278 /**
6279 * xmlAutomataNewAllTrans:
6280 * @am: an automata
6281 * @from: the starting point of the transition
6282 * @to: the target point of the transition or NULL
6283 * @lax: allow to transition if not all all transitions have been activated
6284 *
6285 * If @to is NULL, this creates first a new target state in the automata
6286 * and then adds a an ALL transition from the @from state to the
6287 * target state. That transition is an epsilon transition allowed only when
6288 * all transitions from the @from node have been activated.
6289 *
6290 * Returns the target state or NULL in case of error
6291 */
6292 xmlAutomataStatePtr
xmlAutomataNewAllTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,int lax)6293 xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6294 xmlAutomataStatePtr to, int lax) {
6295 if ((am == NULL) || (from == NULL))
6296 return(NULL);
6297 xmlFAGenerateAllTransition(am, from, to, lax);
6298 if (to == NULL)
6299 return(am->state);
6300 return(to);
6301 }
6302
6303 /**
6304 * xmlAutomataNewCounter:
6305 * @am: an automata
6306 * @min: the minimal value on the counter
6307 * @max: the maximal value on the counter
6308 *
6309 * Create a new counter
6310 *
6311 * Returns the counter number or -1 in case of error
6312 */
6313 int
xmlAutomataNewCounter(xmlAutomataPtr am,int min,int max)6314 xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) {
6315 int ret;
6316
6317 if (am == NULL)
6318 return(-1);
6319
6320 ret = xmlRegGetCounter(am);
6321 if (ret < 0)
6322 return(-1);
6323 am->counters[ret].min = min;
6324 am->counters[ret].max = max;
6325 return(ret);
6326 }
6327
6328 /**
6329 * xmlAutomataNewCountedTrans:
6330 * @am: an automata
6331 * @from: the starting point of the transition
6332 * @to: the target point of the transition or NULL
6333 * @counter: the counter associated to that transition
6334 *
6335 * If @to is NULL, this creates first a new target state in the automata
6336 * and then adds an epsilon transition from the @from state to the target state
6337 * which will increment the counter provided
6338 *
6339 * Returns the target state or NULL in case of error
6340 */
6341 xmlAutomataStatePtr
xmlAutomataNewCountedTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,int counter)6342 xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6343 xmlAutomataStatePtr to, int counter) {
6344 if ((am == NULL) || (from == NULL) || (counter < 0))
6345 return(NULL);
6346 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter);
6347 if (to == NULL)
6348 return(am->state);
6349 return(to);
6350 }
6351
6352 /**
6353 * xmlAutomataNewCounterTrans:
6354 * @am: an automata
6355 * @from: the starting point of the transition
6356 * @to: the target point of the transition or NULL
6357 * @counter: the counter associated to that transition
6358 *
6359 * If @to is NULL, this creates first a new target state in the automata
6360 * and then adds an epsilon transition from the @from state to the target state
6361 * which will be allowed only if the counter is within the right range.
6362 *
6363 * Returns the target state or NULL in case of error
6364 */
6365 xmlAutomataStatePtr
xmlAutomataNewCounterTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,int counter)6366 xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6367 xmlAutomataStatePtr to, int counter) {
6368 if ((am == NULL) || (from == NULL) || (counter < 0))
6369 return(NULL);
6370 xmlFAGenerateCountedTransition(am, from, to, counter);
6371 if (to == NULL)
6372 return(am->state);
6373 return(to);
6374 }
6375
6376 /**
6377 * xmlAutomataCompile:
6378 * @am: an automata
6379 *
6380 * Compile the automata into a Reg Exp ready for being executed.
6381 * The automata should be free after this point.
6382 *
6383 * Returns the compiled regexp or NULL in case of error
6384 */
6385 xmlRegexpPtr
xmlAutomataCompile(xmlAutomataPtr am)6386 xmlAutomataCompile(xmlAutomataPtr am) {
6387 xmlRegexpPtr ret;
6388
6389 if ((am == NULL) || (am->error != 0)) return(NULL);
6390 xmlFAEliminateEpsilonTransitions(am);
6391 /* xmlFAComputesDeterminism(am); */
6392 ret = xmlRegEpxFromParse(am);
6393
6394 return(ret);
6395 }
6396
6397 /**
6398 * xmlAutomataIsDeterminist:
6399 * @am: an automata
6400 *
6401 * Checks if an automata is determinist.
6402 *
6403 * Returns 1 if true, 0 if not, and -1 in case of error
6404 */
6405 int
xmlAutomataIsDeterminist(xmlAutomataPtr am)6406 xmlAutomataIsDeterminist(xmlAutomataPtr am) {
6407 int ret;
6408
6409 if (am == NULL)
6410 return(-1);
6411
6412 ret = xmlFAComputesDeterminism(am);
6413 return(ret);
6414 }
6415 #endif /* LIBXML_AUTOMATA_ENABLED */
6416
6417 #ifdef LIBXML_EXPR_ENABLED
6418 /************************************************************************
6419 * *
6420 * Formal Expression handling code *
6421 * *
6422 ************************************************************************/
6423 /************************************************************************
6424 * *
6425 * Expression handling context *
6426 * *
6427 ************************************************************************/
6428
6429 struct _xmlExpCtxt {
6430 xmlDictPtr dict;
6431 xmlExpNodePtr *table;
6432 int size;
6433 int nbElems;
6434 int nb_nodes;
6435 int maxNodes;
6436 const char *expr;
6437 const char *cur;
6438 int nb_cons;
6439 int tabSize;
6440 };
6441
6442 /**
6443 * xmlExpNewCtxt:
6444 * @maxNodes: the maximum number of nodes
6445 * @dict: optional dictionary to use internally
6446 *
6447 * Creates a new context for manipulating expressions
6448 *
6449 * Returns the context or NULL in case of error
6450 */
6451 xmlExpCtxtPtr
xmlExpNewCtxt(int maxNodes,xmlDictPtr dict)6452 xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) {
6453 xmlExpCtxtPtr ret;
6454 int size = 256;
6455
6456 if (maxNodes <= 4096)
6457 maxNodes = 4096;
6458
6459 ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt));
6460 if (ret == NULL)
6461 return(NULL);
6462 memset(ret, 0, sizeof(xmlExpCtxt));
6463 ret->size = size;
6464 ret->nbElems = 0;
6465 ret->maxNodes = maxNodes;
6466 ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr));
6467 if (ret->table == NULL) {
6468 xmlFree(ret);
6469 return(NULL);
6470 }
6471 memset(ret->table, 0, size * sizeof(xmlExpNodePtr));
6472 if (dict == NULL) {
6473 ret->dict = xmlDictCreate();
6474 if (ret->dict == NULL) {
6475 xmlFree(ret->table);
6476 xmlFree(ret);
6477 return(NULL);
6478 }
6479 } else {
6480 ret->dict = dict;
6481 xmlDictReference(ret->dict);
6482 }
6483 return(ret);
6484 }
6485
6486 /**
6487 * xmlExpFreeCtxt:
6488 * @ctxt: an expression context
6489 *
6490 * Free an expression context
6491 */
6492 void
xmlExpFreeCtxt(xmlExpCtxtPtr ctxt)6493 xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) {
6494 if (ctxt == NULL)
6495 return;
6496 xmlDictFree(ctxt->dict);
6497 if (ctxt->table != NULL)
6498 xmlFree(ctxt->table);
6499 xmlFree(ctxt);
6500 }
6501
6502 /************************************************************************
6503 * *
6504 * Structure associated to an expression node *
6505 * *
6506 ************************************************************************/
6507 #define MAX_NODES 10000
6508
6509 /* #define DEBUG_DERIV */
6510
6511 /*
6512 * TODO:
6513 * - Wildcards
6514 * - public API for creation
6515 *
6516 * Started
6517 * - regression testing
6518 *
6519 * Done
6520 * - split into module and test tool
6521 * - memleaks
6522 */
6523
6524 typedef enum {
6525 XML_EXP_NILABLE = (1 << 0)
6526 } xmlExpNodeInfo;
6527
6528 #define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE)
6529
6530 struct _xmlExpNode {
6531 unsigned char type;/* xmlExpNodeType */
6532 unsigned char info;/* OR of xmlExpNodeInfo */
6533 unsigned short key; /* the hash key */
6534 unsigned int ref; /* The number of references */
6535 int c_max; /* the maximum length it can consume */
6536 xmlExpNodePtr exp_left;
6537 xmlExpNodePtr next;/* the next node in the hash table or free list */
6538 union {
6539 struct {
6540 int f_min;
6541 int f_max;
6542 } count;
6543 struct {
6544 xmlExpNodePtr f_right;
6545 } children;
6546 const xmlChar *f_str;
6547 } field;
6548 };
6549
6550 #define exp_min field.count.f_min
6551 #define exp_max field.count.f_max
6552 /* #define exp_left field.children.f_left */
6553 #define exp_right field.children.f_right
6554 #define exp_str field.f_str
6555
6556 static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type);
6557 static xmlExpNode forbiddenExpNode = {
6558 XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6559 };
6560 xmlExpNodePtr forbiddenExp = &forbiddenExpNode;
6561 static xmlExpNode emptyExpNode = {
6562 XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6563 };
6564 xmlExpNodePtr emptyExp = &emptyExpNode;
6565
6566 /************************************************************************
6567 * *
6568 * The custom hash table for unicity and canonicalization *
6569 * of sub-expressions pointers *
6570 * *
6571 ************************************************************************/
6572 /*
6573 * xmlExpHashNameComputeKey:
6574 * Calculate the hash key for a token
6575 */
6576 static unsigned short
xmlExpHashNameComputeKey(const xmlChar * name)6577 xmlExpHashNameComputeKey(const xmlChar *name) {
6578 unsigned short value = 0L;
6579 char ch;
6580
6581 if (name != NULL) {
6582 value += 30 * (*name);
6583 while ((ch = *name++) != 0) {
6584 value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch);
6585 }
6586 }
6587 return (value);
6588 }
6589
6590 /*
6591 * xmlExpHashComputeKey:
6592 * Calculate the hash key for a compound expression
6593 */
6594 static unsigned short
xmlExpHashComputeKey(xmlExpNodeType type,xmlExpNodePtr left,xmlExpNodePtr right)6595 xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left,
6596 xmlExpNodePtr right) {
6597 unsigned long value;
6598 unsigned short ret;
6599
6600 switch (type) {
6601 case XML_EXP_SEQ:
6602 value = left->key;
6603 value += right->key;
6604 value *= 3;
6605 ret = (unsigned short) value;
6606 break;
6607 case XML_EXP_OR:
6608 value = left->key;
6609 value += right->key;
6610 value *= 7;
6611 ret = (unsigned short) value;
6612 break;
6613 case XML_EXP_COUNT:
6614 value = left->key;
6615 value += right->key;
6616 ret = (unsigned short) value;
6617 break;
6618 default:
6619 ret = 0;
6620 }
6621 return(ret);
6622 }
6623
6624
6625 static xmlExpNodePtr
xmlExpNewNode(xmlExpCtxtPtr ctxt,xmlExpNodeType type)6626 xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) {
6627 xmlExpNodePtr ret;
6628
6629 if (ctxt->nb_nodes >= MAX_NODES)
6630 return(NULL);
6631 ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode));
6632 if (ret == NULL)
6633 return(NULL);
6634 memset(ret, 0, sizeof(xmlExpNode));
6635 ret->type = type;
6636 ret->next = NULL;
6637 ctxt->nb_nodes++;
6638 ctxt->nb_cons++;
6639 return(ret);
6640 }
6641
6642 /**
6643 * xmlExpHashGetEntry:
6644 * @table: the hash table
6645 *
6646 * Get the unique entry from the hash table. The entry is created if
6647 * needed. @left and @right are consumed, i.e. their ref count will
6648 * be decremented by the operation.
6649 *
6650 * Returns the pointer or NULL in case of error
6651 */
6652 static xmlExpNodePtr
xmlExpHashGetEntry(xmlExpCtxtPtr ctxt,xmlExpNodeType type,xmlExpNodePtr left,xmlExpNodePtr right,const xmlChar * name,int min,int max)6653 xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type,
6654 xmlExpNodePtr left, xmlExpNodePtr right,
6655 const xmlChar *name, int min, int max) {
6656 unsigned short kbase, key;
6657 xmlExpNodePtr entry;
6658 xmlExpNodePtr insert;
6659
6660 if (ctxt == NULL)
6661 return(NULL);
6662
6663 /*
6664 * Check for duplicate and insertion location.
6665 */
6666 if (type == XML_EXP_ATOM) {
6667 kbase = xmlExpHashNameComputeKey(name);
6668 } else if (type == XML_EXP_COUNT) {
6669 /* COUNT reduction rule 1 */
6670 /* a{1} -> a */
6671 if (min == max) {
6672 if (min == 1) {
6673 return(left);
6674 }
6675 if (min == 0) {
6676 xmlExpFree(ctxt, left);
6677 return(emptyExp);
6678 }
6679 }
6680 if (min < 0) {
6681 xmlExpFree(ctxt, left);
6682 return(forbiddenExp);
6683 }
6684 if (max == -1)
6685 kbase = min + 79;
6686 else
6687 kbase = max - min;
6688 kbase += left->key;
6689 } else if (type == XML_EXP_OR) {
6690 /* Forbid reduction rules */
6691 if (left->type == XML_EXP_FORBID) {
6692 xmlExpFree(ctxt, left);
6693 return(right);
6694 }
6695 if (right->type == XML_EXP_FORBID) {
6696 xmlExpFree(ctxt, right);
6697 return(left);
6698 }
6699
6700 /* OR reduction rule 1 */
6701 /* a | a reduced to a */
6702 if (left == right) {
6703 xmlExpFree(ctxt, right);
6704 return(left);
6705 }
6706 /* OR canonicalization rule 1 */
6707 /* linearize (a | b) | c into a | (b | c) */
6708 if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) {
6709 xmlExpNodePtr tmp = left;
6710 left = right;
6711 right = tmp;
6712 }
6713 /* OR reduction rule 2 */
6714 /* a | (a | b) and b | (a | b) are reduced to a | b */
6715 if (right->type == XML_EXP_OR) {
6716 if ((left == right->exp_left) ||
6717 (left == right->exp_right)) {
6718 xmlExpFree(ctxt, left);
6719 return(right);
6720 }
6721 }
6722 /* OR canonicalization rule 2 */
6723 /* linearize (a | b) | c into a | (b | c) */
6724 if (left->type == XML_EXP_OR) {
6725 xmlExpNodePtr tmp;
6726
6727 /* OR canonicalization rule 2 */
6728 if ((left->exp_right->type != XML_EXP_OR) &&
6729 (left->exp_right->key < left->exp_left->key)) {
6730 tmp = left->exp_right;
6731 left->exp_right = left->exp_left;
6732 left->exp_left = tmp;
6733 }
6734 left->exp_right->ref++;
6735 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right,
6736 NULL, 0, 0);
6737 left->exp_left->ref++;
6738 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp,
6739 NULL, 0, 0);
6740
6741 xmlExpFree(ctxt, left);
6742 return(tmp);
6743 }
6744 if (right->type == XML_EXP_OR) {
6745 /* Ordering in the tree */
6746 /* C | (A | B) -> A | (B | C) */
6747 if (left->key > right->exp_right->key) {
6748 xmlExpNodePtr tmp;
6749 right->exp_right->ref++;
6750 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right,
6751 left, NULL, 0, 0);
6752 right->exp_left->ref++;
6753 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6754 tmp, NULL, 0, 0);
6755 xmlExpFree(ctxt, right);
6756 return(tmp);
6757 }
6758 /* Ordering in the tree */
6759 /* B | (A | C) -> A | (B | C) */
6760 if (left->key > right->exp_left->key) {
6761 xmlExpNodePtr tmp;
6762 right->exp_right->ref++;
6763 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left,
6764 right->exp_right, NULL, 0, 0);
6765 right->exp_left->ref++;
6766 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6767 tmp, NULL, 0, 0);
6768 xmlExpFree(ctxt, right);
6769 return(tmp);
6770 }
6771 }
6772 /* we know both types are != XML_EXP_OR here */
6773 else if (left->key > right->key) {
6774 xmlExpNodePtr tmp = left;
6775 left = right;
6776 right = tmp;
6777 }
6778 kbase = xmlExpHashComputeKey(type, left, right);
6779 } else if (type == XML_EXP_SEQ) {
6780 /* Forbid reduction rules */
6781 if (left->type == XML_EXP_FORBID) {
6782 xmlExpFree(ctxt, right);
6783 return(left);
6784 }
6785 if (right->type == XML_EXP_FORBID) {
6786 xmlExpFree(ctxt, left);
6787 return(right);
6788 }
6789 /* Empty reduction rules */
6790 if (right->type == XML_EXP_EMPTY) {
6791 return(left);
6792 }
6793 if (left->type == XML_EXP_EMPTY) {
6794 return(right);
6795 }
6796 kbase = xmlExpHashComputeKey(type, left, right);
6797 } else
6798 return(NULL);
6799
6800 key = kbase % ctxt->size;
6801 if (ctxt->table[key] != NULL) {
6802 for (insert = ctxt->table[key]; insert != NULL;
6803 insert = insert->next) {
6804 if ((insert->key == kbase) &&
6805 (insert->type == type)) {
6806 if (type == XML_EXP_ATOM) {
6807 if (name == insert->exp_str) {
6808 insert->ref++;
6809 return(insert);
6810 }
6811 } else if (type == XML_EXP_COUNT) {
6812 if ((insert->exp_min == min) && (insert->exp_max == max) &&
6813 (insert->exp_left == left)) {
6814 insert->ref++;
6815 left->ref--;
6816 return(insert);
6817 }
6818 } else if ((insert->exp_left == left) &&
6819 (insert->exp_right == right)) {
6820 insert->ref++;
6821 left->ref--;
6822 right->ref--;
6823 return(insert);
6824 }
6825 }
6826 }
6827 }
6828
6829 entry = xmlExpNewNode(ctxt, type);
6830 if (entry == NULL)
6831 return(NULL);
6832 entry->key = kbase;
6833 if (type == XML_EXP_ATOM) {
6834 entry->exp_str = name;
6835 entry->c_max = 1;
6836 } else if (type == XML_EXP_COUNT) {
6837 entry->exp_min = min;
6838 entry->exp_max = max;
6839 entry->exp_left = left;
6840 if ((min == 0) || (IS_NILLABLE(left)))
6841 entry->info |= XML_EXP_NILABLE;
6842 if (max < 0)
6843 entry->c_max = -1;
6844 else
6845 entry->c_max = max * entry->exp_left->c_max;
6846 } else {
6847 entry->exp_left = left;
6848 entry->exp_right = right;
6849 if (type == XML_EXP_OR) {
6850 if ((IS_NILLABLE(left)) || (IS_NILLABLE(right)))
6851 entry->info |= XML_EXP_NILABLE;
6852 if ((entry->exp_left->c_max == -1) ||
6853 (entry->exp_right->c_max == -1))
6854 entry->c_max = -1;
6855 else if (entry->exp_left->c_max > entry->exp_right->c_max)
6856 entry->c_max = entry->exp_left->c_max;
6857 else
6858 entry->c_max = entry->exp_right->c_max;
6859 } else {
6860 if ((IS_NILLABLE(left)) && (IS_NILLABLE(right)))
6861 entry->info |= XML_EXP_NILABLE;
6862 if ((entry->exp_left->c_max == -1) ||
6863 (entry->exp_right->c_max == -1))
6864 entry->c_max = -1;
6865 else
6866 entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max;
6867 }
6868 }
6869 entry->ref = 1;
6870 if (ctxt->table[key] != NULL)
6871 entry->next = ctxt->table[key];
6872
6873 ctxt->table[key] = entry;
6874 ctxt->nbElems++;
6875
6876 return(entry);
6877 }
6878
6879 /**
6880 * xmlExpFree:
6881 * @ctxt: the expression context
6882 * @exp: the expression
6883 *
6884 * Dereference the expression
6885 */
6886 void
xmlExpFree(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp)6887 xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) {
6888 if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp))
6889 return;
6890 exp->ref--;
6891 if (exp->ref == 0) {
6892 unsigned short key;
6893
6894 /* Unlink it first from the hash table */
6895 key = exp->key % ctxt->size;
6896 if (ctxt->table[key] == exp) {
6897 ctxt->table[key] = exp->next;
6898 } else {
6899 xmlExpNodePtr tmp;
6900
6901 tmp = ctxt->table[key];
6902 while (tmp != NULL) {
6903 if (tmp->next == exp) {
6904 tmp->next = exp->next;
6905 break;
6906 }
6907 tmp = tmp->next;
6908 }
6909 }
6910
6911 if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) {
6912 xmlExpFree(ctxt, exp->exp_left);
6913 xmlExpFree(ctxt, exp->exp_right);
6914 } else if (exp->type == XML_EXP_COUNT) {
6915 xmlExpFree(ctxt, exp->exp_left);
6916 }
6917 xmlFree(exp);
6918 ctxt->nb_nodes--;
6919 }
6920 }
6921
6922 /**
6923 * xmlExpRef:
6924 * @exp: the expression
6925 *
6926 * Increase the reference count of the expression
6927 */
6928 void
xmlExpRef(xmlExpNodePtr exp)6929 xmlExpRef(xmlExpNodePtr exp) {
6930 if (exp != NULL)
6931 exp->ref++;
6932 }
6933
6934 /**
6935 * xmlExpNewAtom:
6936 * @ctxt: the expression context
6937 * @name: the atom name
6938 * @len: the atom name length in byte (or -1);
6939 *
6940 * Get the atom associated to this name from that context
6941 *
6942 * Returns the node or NULL in case of error
6943 */
6944 xmlExpNodePtr
xmlExpNewAtom(xmlExpCtxtPtr ctxt,const xmlChar * name,int len)6945 xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) {
6946 if ((ctxt == NULL) || (name == NULL))
6947 return(NULL);
6948 name = xmlDictLookup(ctxt->dict, name, len);
6949 if (name == NULL)
6950 return(NULL);
6951 return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0));
6952 }
6953
6954 /**
6955 * xmlExpNewOr:
6956 * @ctxt: the expression context
6957 * @left: left expression
6958 * @right: right expression
6959 *
6960 * Get the atom associated to the choice @left | @right
6961 * Note that @left and @right are consumed in the operation, to keep
6962 * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
6963 * this is true even in case of failure (unless ctxt == NULL).
6964 *
6965 * Returns the node or NULL in case of error
6966 */
6967 xmlExpNodePtr
xmlExpNewOr(xmlExpCtxtPtr ctxt,xmlExpNodePtr left,xmlExpNodePtr right)6968 xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
6969 if (ctxt == NULL)
6970 return(NULL);
6971 if ((left == NULL) || (right == NULL)) {
6972 xmlExpFree(ctxt, left);
6973 xmlExpFree(ctxt, right);
6974 return(NULL);
6975 }
6976 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0));
6977 }
6978
6979 /**
6980 * xmlExpNewSeq:
6981 * @ctxt: the expression context
6982 * @left: left expression
6983 * @right: right expression
6984 *
6985 * Get the atom associated to the sequence @left , @right
6986 * Note that @left and @right are consumed in the operation, to keep
6987 * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
6988 * this is true even in case of failure (unless ctxt == NULL).
6989 *
6990 * Returns the node or NULL in case of error
6991 */
6992 xmlExpNodePtr
xmlExpNewSeq(xmlExpCtxtPtr ctxt,xmlExpNodePtr left,xmlExpNodePtr right)6993 xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
6994 if (ctxt == NULL)
6995 return(NULL);
6996 if ((left == NULL) || (right == NULL)) {
6997 xmlExpFree(ctxt, left);
6998 xmlExpFree(ctxt, right);
6999 return(NULL);
7000 }
7001 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0));
7002 }
7003
7004 /**
7005 * xmlExpNewRange:
7006 * @ctxt: the expression context
7007 * @subset: the expression to be repeated
7008 * @min: the lower bound for the repetition
7009 * @max: the upper bound for the repetition, -1 means infinite
7010 *
7011 * Get the atom associated to the range (@subset){@min, @max}
7012 * Note that @subset is consumed in the operation, to keep
7013 * an handle on it use xmlExpRef() and use xmlExpFree() to release it,
7014 * this is true even in case of failure (unless ctxt == NULL).
7015 *
7016 * Returns the node or NULL in case of error
7017 */
7018 xmlExpNodePtr
xmlExpNewRange(xmlExpCtxtPtr ctxt,xmlExpNodePtr subset,int min,int max)7019 xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) {
7020 if (ctxt == NULL)
7021 return(NULL);
7022 if ((subset == NULL) || (min < 0) || (max < -1) ||
7023 ((max >= 0) && (min > max))) {
7024 xmlExpFree(ctxt, subset);
7025 return(NULL);
7026 }
7027 return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset,
7028 NULL, NULL, min, max));
7029 }
7030
7031 /************************************************************************
7032 * *
7033 * Public API for operations on expressions *
7034 * *
7035 ************************************************************************/
7036
7037 static int
xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** list,int len,int nb)7038 xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7039 const xmlChar**list, int len, int nb) {
7040 int tmp, tmp2;
7041 tail:
7042 switch (exp->type) {
7043 case XML_EXP_EMPTY:
7044 return(0);
7045 case XML_EXP_ATOM:
7046 for (tmp = 0;tmp < nb;tmp++)
7047 if (list[tmp] == exp->exp_str)
7048 return(0);
7049 if (nb >= len)
7050 return(-2);
7051 list[nb] = exp->exp_str;
7052 return(1);
7053 case XML_EXP_COUNT:
7054 exp = exp->exp_left;
7055 goto tail;
7056 case XML_EXP_SEQ:
7057 case XML_EXP_OR:
7058 tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb);
7059 if (tmp < 0)
7060 return(tmp);
7061 tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len,
7062 nb + tmp);
7063 if (tmp2 < 0)
7064 return(tmp2);
7065 return(tmp + tmp2);
7066 }
7067 return(-1);
7068 }
7069
7070 /**
7071 * xmlExpGetLanguage:
7072 * @ctxt: the expression context
7073 * @exp: the expression
7074 * @langList: where to store the tokens
7075 * @len: the allocated length of @list
7076 *
7077 * Find all the strings used in @exp and store them in @list
7078 *
7079 * Returns the number of unique strings found, -1 in case of errors and
7080 * -2 if there is more than @len strings
7081 */
7082 int
xmlExpGetLanguage(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** langList,int len)7083 xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7084 const xmlChar**langList, int len) {
7085 if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0))
7086 return(-1);
7087 return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0));
7088 }
7089
7090 static int
xmlExpGetStartInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** list,int len,int nb)7091 xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7092 const xmlChar**list, int len, int nb) {
7093 int tmp, tmp2;
7094 tail:
7095 switch (exp->type) {
7096 case XML_EXP_FORBID:
7097 return(0);
7098 case XML_EXP_EMPTY:
7099 return(0);
7100 case XML_EXP_ATOM:
7101 for (tmp = 0;tmp < nb;tmp++)
7102 if (list[tmp] == exp->exp_str)
7103 return(0);
7104 if (nb >= len)
7105 return(-2);
7106 list[nb] = exp->exp_str;
7107 return(1);
7108 case XML_EXP_COUNT:
7109 exp = exp->exp_left;
7110 goto tail;
7111 case XML_EXP_SEQ:
7112 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
7113 if (tmp < 0)
7114 return(tmp);
7115 if (IS_NILLABLE(exp->exp_left)) {
7116 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
7117 nb + tmp);
7118 if (tmp2 < 0)
7119 return(tmp2);
7120 tmp += tmp2;
7121 }
7122 return(tmp);
7123 case XML_EXP_OR:
7124 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
7125 if (tmp < 0)
7126 return(tmp);
7127 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
7128 nb + tmp);
7129 if (tmp2 < 0)
7130 return(tmp2);
7131 return(tmp + tmp2);
7132 }
7133 return(-1);
7134 }
7135
7136 /**
7137 * xmlExpGetStart:
7138 * @ctxt: the expression context
7139 * @exp: the expression
7140 * @tokList: where to store the tokens
7141 * @len: the allocated length of @list
7142 *
7143 * Find all the strings that appears at the start of the languages
7144 * accepted by @exp and store them in @list. E.g. for (a, b) | c
7145 * it will return the list [a, c]
7146 *
7147 * Returns the number of unique strings found, -1 in case of errors and
7148 * -2 if there is more than @len strings
7149 */
7150 int
xmlExpGetStart(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** tokList,int len)7151 xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7152 const xmlChar**tokList, int len) {
7153 if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0))
7154 return(-1);
7155 return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0));
7156 }
7157
7158 /**
7159 * xmlExpIsNillable:
7160 * @exp: the expression
7161 *
7162 * Finds if the expression is nillable, i.e. if it accepts the empty sequence
7163 *
7164 * Returns 1 if nillable, 0 if not and -1 in case of error
7165 */
7166 int
xmlExpIsNillable(xmlExpNodePtr exp)7167 xmlExpIsNillable(xmlExpNodePtr exp) {
7168 if (exp == NULL)
7169 return(-1);
7170 return(IS_NILLABLE(exp) != 0);
7171 }
7172
7173 static xmlExpNodePtr
xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar * str)7174 xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str)
7175 {
7176 xmlExpNodePtr ret;
7177
7178 switch (exp->type) {
7179 case XML_EXP_EMPTY:
7180 return(forbiddenExp);
7181 case XML_EXP_FORBID:
7182 return(forbiddenExp);
7183 case XML_EXP_ATOM:
7184 if (exp->exp_str == str) {
7185 #ifdef DEBUG_DERIV
7186 printf("deriv atom: equal => Empty\n");
7187 #endif
7188 ret = emptyExp;
7189 } else {
7190 #ifdef DEBUG_DERIV
7191 printf("deriv atom: mismatch => forbid\n");
7192 #endif
7193 /* TODO wildcards here */
7194 ret = forbiddenExp;
7195 }
7196 return(ret);
7197 case XML_EXP_OR: {
7198 xmlExpNodePtr tmp;
7199
7200 #ifdef DEBUG_DERIV
7201 printf("deriv or: => or(derivs)\n");
7202 #endif
7203 tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7204 if (tmp == NULL) {
7205 return(NULL);
7206 }
7207 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7208 if (ret == NULL) {
7209 xmlExpFree(ctxt, tmp);
7210 return(NULL);
7211 }
7212 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret,
7213 NULL, 0, 0);
7214 return(ret);
7215 }
7216 case XML_EXP_SEQ:
7217 #ifdef DEBUG_DERIV
7218 printf("deriv seq: starting with left\n");
7219 #endif
7220 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7221 if (ret == NULL) {
7222 return(NULL);
7223 } else if (ret == forbiddenExp) {
7224 if (IS_NILLABLE(exp->exp_left)) {
7225 #ifdef DEBUG_DERIV
7226 printf("deriv seq: left failed but nillable\n");
7227 #endif
7228 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7229 }
7230 } else {
7231 #ifdef DEBUG_DERIV
7232 printf("deriv seq: left match => sequence\n");
7233 #endif
7234 exp->exp_right->ref++;
7235 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right,
7236 NULL, 0, 0);
7237 }
7238 return(ret);
7239 case XML_EXP_COUNT: {
7240 int min, max;
7241 xmlExpNodePtr tmp;
7242
7243 if (exp->exp_max == 0)
7244 return(forbiddenExp);
7245 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7246 if (ret == NULL)
7247 return(NULL);
7248 if (ret == forbiddenExp) {
7249 #ifdef DEBUG_DERIV
7250 printf("deriv count: pattern mismatch => forbid\n");
7251 #endif
7252 return(ret);
7253 }
7254 if (exp->exp_max == 1)
7255 return(ret);
7256 if (exp->exp_max < 0) /* unbounded */
7257 max = -1;
7258 else
7259 max = exp->exp_max - 1;
7260 if (exp->exp_min > 0)
7261 min = exp->exp_min - 1;
7262 else
7263 min = 0;
7264 exp->exp_left->ref++;
7265 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL,
7266 NULL, min, max);
7267 if (ret == emptyExp) {
7268 #ifdef DEBUG_DERIV
7269 printf("deriv count: match to empty => new count\n");
7270 #endif
7271 return(tmp);
7272 }
7273 #ifdef DEBUG_DERIV
7274 printf("deriv count: match => sequence with new count\n");
7275 #endif
7276 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp,
7277 NULL, 0, 0));
7278 }
7279 }
7280 return(NULL);
7281 }
7282
7283 /**
7284 * xmlExpStringDerive:
7285 * @ctxt: the expression context
7286 * @exp: the expression
7287 * @str: the string
7288 * @len: the string len in bytes if available
7289 *
7290 * Do one step of Brzozowski derivation of the expression @exp with
7291 * respect to the input string
7292 *
7293 * Returns the resulting expression or NULL in case of internal error
7294 */
7295 xmlExpNodePtr
xmlExpStringDerive(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar * str,int len)7296 xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7297 const xmlChar *str, int len) {
7298 const xmlChar *input;
7299
7300 if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) {
7301 return(NULL);
7302 }
7303 /*
7304 * check the string is in the dictionary, if yes use an interned
7305 * copy, otherwise we know it's not an acceptable input
7306 */
7307 input = xmlDictExists(ctxt->dict, str, len);
7308 if (input == NULL) {
7309 return(forbiddenExp);
7310 }
7311 return(xmlExpStringDeriveInt(ctxt, exp, input));
7312 }
7313
7314 static int
xmlExpCheckCard(xmlExpNodePtr exp,xmlExpNodePtr sub)7315 xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) {
7316 int ret = 1;
7317
7318 if (sub->c_max == -1) {
7319 if (exp->c_max != -1)
7320 ret = 0;
7321 } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) {
7322 ret = 0;
7323 }
7324 #if 0
7325 if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp)))
7326 ret = 0;
7327 #endif
7328 return(ret);
7329 }
7330
7331 static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7332 xmlExpNodePtr sub);
7333 /**
7334 * xmlExpDivide:
7335 * @ctxt: the expressions context
7336 * @exp: the englobing expression
7337 * @sub: the subexpression
7338 * @mult: the multiple expression
7339 * @remain: the remain from the derivation of the multiple
7340 *
7341 * Check if exp is a multiple of sub, i.e. if there is a finite number n
7342 * so that sub{n} subsume exp
7343 *
7344 * Returns the multiple value if successful, 0 if it is not a multiple
7345 * and -1 in case of internal error.
7346 */
7347
7348 static int
xmlExpDivide(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub,xmlExpNodePtr * mult,xmlExpNodePtr * remain)7349 xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub,
7350 xmlExpNodePtr *mult, xmlExpNodePtr *remain) {
7351 int i;
7352 xmlExpNodePtr tmp, tmp2;
7353
7354 if (mult != NULL) *mult = NULL;
7355 if (remain != NULL) *remain = NULL;
7356 if (exp->c_max == -1) return(0);
7357 if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0);
7358
7359 for (i = 1;i <= exp->c_max;i++) {
7360 sub->ref++;
7361 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7362 sub, NULL, NULL, i, i);
7363 if (tmp == NULL) {
7364 return(-1);
7365 }
7366 if (!xmlExpCheckCard(tmp, exp)) {
7367 xmlExpFree(ctxt, tmp);
7368 continue;
7369 }
7370 tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp);
7371 if (tmp2 == NULL) {
7372 xmlExpFree(ctxt, tmp);
7373 return(-1);
7374 }
7375 if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) {
7376 if (remain != NULL)
7377 *remain = tmp2;
7378 else
7379 xmlExpFree(ctxt, tmp2);
7380 if (mult != NULL)
7381 *mult = tmp;
7382 else
7383 xmlExpFree(ctxt, tmp);
7384 #ifdef DEBUG_DERIV
7385 printf("Divide succeeded %d\n", i);
7386 #endif
7387 return(i);
7388 }
7389 xmlExpFree(ctxt, tmp);
7390 xmlExpFree(ctxt, tmp2);
7391 }
7392 #ifdef DEBUG_DERIV
7393 printf("Divide failed\n");
7394 #endif
7395 return(0);
7396 }
7397
7398 /**
7399 * xmlExpExpDeriveInt:
7400 * @ctxt: the expressions context
7401 * @exp: the englobing expression
7402 * @sub: the subexpression
7403 *
7404 * Try to do a step of Brzozowski derivation but at a higher level
7405 * the input being a subexpression.
7406 *
7407 * Returns the resulting expression or NULL in case of internal error
7408 */
7409 static xmlExpNodePtr
xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub)7410 xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7411 xmlExpNodePtr ret, tmp, tmp2, tmp3;
7412 const xmlChar **tab;
7413 int len, i;
7414
7415 /*
7416 * In case of equality and if the expression can only consume a finite
7417 * amount, then the derivation is empty
7418 */
7419 if ((exp == sub) && (exp->c_max >= 0)) {
7420 #ifdef DEBUG_DERIV
7421 printf("Equal(exp, sub) and finite -> Empty\n");
7422 #endif
7423 return(emptyExp);
7424 }
7425 /*
7426 * decompose sub sequence first
7427 */
7428 if (sub->type == XML_EXP_EMPTY) {
7429 #ifdef DEBUG_DERIV
7430 printf("Empty(sub) -> Empty\n");
7431 #endif
7432 exp->ref++;
7433 return(exp);
7434 }
7435 if (sub->type == XML_EXP_SEQ) {
7436 #ifdef DEBUG_DERIV
7437 printf("Seq(sub) -> decompose\n");
7438 #endif
7439 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7440 if (tmp == NULL)
7441 return(NULL);
7442 if (tmp == forbiddenExp)
7443 return(tmp);
7444 ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right);
7445 xmlExpFree(ctxt, tmp);
7446 return(ret);
7447 }
7448 if (sub->type == XML_EXP_OR) {
7449 #ifdef DEBUG_DERIV
7450 printf("Or(sub) -> decompose\n");
7451 #endif
7452 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7453 if (tmp == forbiddenExp)
7454 return(tmp);
7455 if (tmp == NULL)
7456 return(NULL);
7457 ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right);
7458 if ((ret == NULL) || (ret == forbiddenExp)) {
7459 xmlExpFree(ctxt, tmp);
7460 return(ret);
7461 }
7462 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0));
7463 }
7464 if (!xmlExpCheckCard(exp, sub)) {
7465 #ifdef DEBUG_DERIV
7466 printf("CheckCard(exp, sub) failed -> Forbid\n");
7467 #endif
7468 return(forbiddenExp);
7469 }
7470 switch (exp->type) {
7471 case XML_EXP_EMPTY:
7472 if (sub == emptyExp)
7473 return(emptyExp);
7474 #ifdef DEBUG_DERIV
7475 printf("Empty(exp) -> Forbid\n");
7476 #endif
7477 return(forbiddenExp);
7478 case XML_EXP_FORBID:
7479 #ifdef DEBUG_DERIV
7480 printf("Forbid(exp) -> Forbid\n");
7481 #endif
7482 return(forbiddenExp);
7483 case XML_EXP_ATOM:
7484 if (sub->type == XML_EXP_ATOM) {
7485 /* TODO: handle wildcards */
7486 if (exp->exp_str == sub->exp_str) {
7487 #ifdef DEBUG_DERIV
7488 printf("Atom match -> Empty\n");
7489 #endif
7490 return(emptyExp);
7491 }
7492 #ifdef DEBUG_DERIV
7493 printf("Atom mismatch -> Forbid\n");
7494 #endif
7495 return(forbiddenExp);
7496 }
7497 if ((sub->type == XML_EXP_COUNT) &&
7498 (sub->exp_max == 1) &&
7499 (sub->exp_left->type == XML_EXP_ATOM)) {
7500 /* TODO: handle wildcards */
7501 if (exp->exp_str == sub->exp_left->exp_str) {
7502 #ifdef DEBUG_DERIV
7503 printf("Atom match -> Empty\n");
7504 #endif
7505 return(emptyExp);
7506 }
7507 #ifdef DEBUG_DERIV
7508 printf("Atom mismatch -> Forbid\n");
7509 #endif
7510 return(forbiddenExp);
7511 }
7512 #ifdef DEBUG_DERIV
7513 printf("Complex exp vs Atom -> Forbid\n");
7514 #endif
7515 return(forbiddenExp);
7516 case XML_EXP_SEQ:
7517 /* try to get the sequence consumed only if possible */
7518 if (xmlExpCheckCard(exp->exp_left, sub)) {
7519 /* See if the sequence can be consumed directly */
7520 #ifdef DEBUG_DERIV
7521 printf("Seq trying left only\n");
7522 #endif
7523 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7524 if ((ret != forbiddenExp) && (ret != NULL)) {
7525 #ifdef DEBUG_DERIV
7526 printf("Seq trying left only worked\n");
7527 #endif
7528 /*
7529 * TODO: assumption here that we are determinist
7530 * i.e. we won't get to a nillable exp left
7531 * subset which could be matched by the right
7532 * part too.
7533 * e.g.: (a | b)+,(a | c) and 'a+,a'
7534 */
7535 exp->exp_right->ref++;
7536 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7537 exp->exp_right, NULL, 0, 0));
7538 }
7539 #ifdef DEBUG_DERIV
7540 } else {
7541 printf("Seq: left too short\n");
7542 #endif
7543 }
7544 /* Try instead to decompose */
7545 if (sub->type == XML_EXP_COUNT) {
7546 int min, max;
7547
7548 #ifdef DEBUG_DERIV
7549 printf("Seq: sub is a count\n");
7550 #endif
7551 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7552 if (ret == NULL)
7553 return(NULL);
7554 if (ret != forbiddenExp) {
7555 #ifdef DEBUG_DERIV
7556 printf("Seq , Count match on left\n");
7557 #endif
7558 if (sub->exp_max < 0)
7559 max = -1;
7560 else
7561 max = sub->exp_max -1;
7562 if (sub->exp_min > 0)
7563 min = sub->exp_min -1;
7564 else
7565 min = 0;
7566 exp->exp_right->ref++;
7567 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7568 exp->exp_right, NULL, 0, 0);
7569 if (tmp == NULL)
7570 return(NULL);
7571
7572 sub->exp_left->ref++;
7573 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7574 sub->exp_left, NULL, NULL, min, max);
7575 if (tmp2 == NULL) {
7576 xmlExpFree(ctxt, tmp);
7577 return(NULL);
7578 }
7579 ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7580 xmlExpFree(ctxt, tmp);
7581 xmlExpFree(ctxt, tmp2);
7582 return(ret);
7583 }
7584 }
7585 /* we made no progress on structured operations */
7586 break;
7587 case XML_EXP_OR:
7588 #ifdef DEBUG_DERIV
7589 printf("Or , trying both side\n");
7590 #endif
7591 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7592 if (ret == NULL)
7593 return(NULL);
7594 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub);
7595 if (tmp == NULL) {
7596 xmlExpFree(ctxt, ret);
7597 return(NULL);
7598 }
7599 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0));
7600 case XML_EXP_COUNT: {
7601 int min, max;
7602
7603 if (sub->type == XML_EXP_COUNT) {
7604 /*
7605 * Try to see if the loop is completely subsumed
7606 */
7607 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7608 if (tmp == NULL)
7609 return(NULL);
7610 if (tmp == forbiddenExp) {
7611 int mult;
7612
7613 #ifdef DEBUG_DERIV
7614 printf("Count, Count inner don't subsume\n");
7615 #endif
7616 mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left,
7617 NULL, &tmp);
7618 if (mult <= 0) {
7619 #ifdef DEBUG_DERIV
7620 printf("Count, Count not multiple => forbidden\n");
7621 #endif
7622 return(forbiddenExp);
7623 }
7624 if (sub->exp_max == -1) {
7625 max = -1;
7626 if (exp->exp_max == -1) {
7627 if (exp->exp_min <= sub->exp_min * mult)
7628 min = 0;
7629 else
7630 min = exp->exp_min - sub->exp_min * mult;
7631 } else {
7632 #ifdef DEBUG_DERIV
7633 printf("Count, Count finite can't subsume infinite\n");
7634 #endif
7635 xmlExpFree(ctxt, tmp);
7636 return(forbiddenExp);
7637 }
7638 } else {
7639 if (exp->exp_max == -1) {
7640 #ifdef DEBUG_DERIV
7641 printf("Infinite loop consume mult finite loop\n");
7642 #endif
7643 if (exp->exp_min > sub->exp_min * mult) {
7644 max = -1;
7645 min = exp->exp_min - sub->exp_min * mult;
7646 } else {
7647 max = -1;
7648 min = 0;
7649 }
7650 } else {
7651 if (exp->exp_max < sub->exp_max * mult) {
7652 #ifdef DEBUG_DERIV
7653 printf("loops max mult mismatch => forbidden\n");
7654 #endif
7655 xmlExpFree(ctxt, tmp);
7656 return(forbiddenExp);
7657 }
7658 if (sub->exp_max * mult > exp->exp_min)
7659 min = 0;
7660 else
7661 min = exp->exp_min - sub->exp_max * mult;
7662 max = exp->exp_max - sub->exp_max * mult;
7663 }
7664 }
7665 } else if (!IS_NILLABLE(tmp)) {
7666 /*
7667 * TODO: loop here to try to grow if working on finite
7668 * blocks.
7669 */
7670 #ifdef DEBUG_DERIV
7671 printf("Count, Count remain not nillable => forbidden\n");
7672 #endif
7673 xmlExpFree(ctxt, tmp);
7674 return(forbiddenExp);
7675 } else if (sub->exp_max == -1) {
7676 if (exp->exp_max == -1) {
7677 if (exp->exp_min <= sub->exp_min) {
7678 #ifdef DEBUG_DERIV
7679 printf("Infinite loops Okay => COUNT(0,Inf)\n");
7680 #endif
7681 max = -1;
7682 min = 0;
7683 } else {
7684 #ifdef DEBUG_DERIV
7685 printf("Infinite loops min => Count(X,Inf)\n");
7686 #endif
7687 max = -1;
7688 min = exp->exp_min - sub->exp_min;
7689 }
7690 } else if (exp->exp_min > sub->exp_min) {
7691 #ifdef DEBUG_DERIV
7692 printf("loops min mismatch 1 => forbidden ???\n");
7693 #endif
7694 xmlExpFree(ctxt, tmp);
7695 return(forbiddenExp);
7696 } else {
7697 max = -1;
7698 min = 0;
7699 }
7700 } else {
7701 if (exp->exp_max == -1) {
7702 #ifdef DEBUG_DERIV
7703 printf("Infinite loop consume finite loop\n");
7704 #endif
7705 if (exp->exp_min > sub->exp_min) {
7706 max = -1;
7707 min = exp->exp_min - sub->exp_min;
7708 } else {
7709 max = -1;
7710 min = 0;
7711 }
7712 } else {
7713 if (exp->exp_max < sub->exp_max) {
7714 #ifdef DEBUG_DERIV
7715 printf("loops max mismatch => forbidden\n");
7716 #endif
7717 xmlExpFree(ctxt, tmp);
7718 return(forbiddenExp);
7719 }
7720 if (sub->exp_max > exp->exp_min)
7721 min = 0;
7722 else
7723 min = exp->exp_min - sub->exp_max;
7724 max = exp->exp_max - sub->exp_max;
7725 }
7726 }
7727 #ifdef DEBUG_DERIV
7728 printf("loops match => SEQ(COUNT())\n");
7729 #endif
7730 exp->exp_left->ref++;
7731 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7732 NULL, NULL, min, max);
7733 if (tmp2 == NULL) {
7734 return(NULL);
7735 }
7736 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7737 NULL, 0, 0);
7738 return(ret);
7739 }
7740 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7741 if (tmp == NULL)
7742 return(NULL);
7743 if (tmp == forbiddenExp) {
7744 #ifdef DEBUG_DERIV
7745 printf("loop mismatch => forbidden\n");
7746 #endif
7747 return(forbiddenExp);
7748 }
7749 if (exp->exp_min > 0)
7750 min = exp->exp_min - 1;
7751 else
7752 min = 0;
7753 if (exp->exp_max < 0)
7754 max = -1;
7755 else
7756 max = exp->exp_max - 1;
7757
7758 #ifdef DEBUG_DERIV
7759 printf("loop match => SEQ(COUNT())\n");
7760 #endif
7761 exp->exp_left->ref++;
7762 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7763 NULL, NULL, min, max);
7764 if (tmp2 == NULL)
7765 return(NULL);
7766 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7767 NULL, 0, 0);
7768 return(ret);
7769 }
7770 }
7771
7772 #ifdef DEBUG_DERIV
7773 printf("Fallback to derivative\n");
7774 #endif
7775 if (IS_NILLABLE(sub)) {
7776 if (!(IS_NILLABLE(exp)))
7777 return(forbiddenExp);
7778 else
7779 ret = emptyExp;
7780 } else
7781 ret = NULL;
7782 /*
7783 * here the structured derivation made no progress so
7784 * we use the default token based derivation to force one more step
7785 */
7786 if (ctxt->tabSize == 0)
7787 ctxt->tabSize = 40;
7788
7789 tab = (const xmlChar **) xmlMalloc(ctxt->tabSize *
7790 sizeof(const xmlChar *));
7791 if (tab == NULL) {
7792 return(NULL);
7793 }
7794
7795 /*
7796 * collect all the strings accepted by the subexpression on input
7797 */
7798 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7799 while (len < 0) {
7800 const xmlChar **temp;
7801 temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 *
7802 sizeof(const xmlChar *));
7803 if (temp == NULL) {
7804 xmlFree((xmlChar **) tab);
7805 return(NULL);
7806 }
7807 tab = temp;
7808 ctxt->tabSize *= 2;
7809 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7810 }
7811 for (i = 0;i < len;i++) {
7812 tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]);
7813 if ((tmp == NULL) || (tmp == forbiddenExp)) {
7814 xmlExpFree(ctxt, ret);
7815 xmlFree((xmlChar **) tab);
7816 return(tmp);
7817 }
7818 tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]);
7819 if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) {
7820 xmlExpFree(ctxt, tmp);
7821 xmlExpFree(ctxt, ret);
7822 xmlFree((xmlChar **) tab);
7823 return(tmp);
7824 }
7825 tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7826 xmlExpFree(ctxt, tmp);
7827 xmlExpFree(ctxt, tmp2);
7828
7829 if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) {
7830 xmlExpFree(ctxt, ret);
7831 xmlFree((xmlChar **) tab);
7832 return(tmp3);
7833 }
7834
7835 if (ret == NULL)
7836 ret = tmp3;
7837 else {
7838 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0);
7839 if (ret == NULL) {
7840 xmlFree((xmlChar **) tab);
7841 return(NULL);
7842 }
7843 }
7844 }
7845 xmlFree((xmlChar **) tab);
7846 return(ret);
7847 }
7848
7849 /**
7850 * xmlExpExpDerive:
7851 * @ctxt: the expressions context
7852 * @exp: the englobing expression
7853 * @sub: the subexpression
7854 *
7855 * Evaluates the expression resulting from @exp consuming a sub expression @sub
7856 * Based on algebraic derivation and sometimes direct Brzozowski derivation
7857 * it usually takes less than linear time and can handle expressions generating
7858 * infinite languages.
7859 *
7860 * Returns the resulting expression or NULL in case of internal error, the
7861 * result must be freed
7862 */
7863 xmlExpNodePtr
xmlExpExpDerive(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub)7864 xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7865 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7866 return(NULL);
7867
7868 /*
7869 * O(1) speedups
7870 */
7871 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7872 #ifdef DEBUG_DERIV
7873 printf("Sub nillable and not exp : can't subsume\n");
7874 #endif
7875 return(forbiddenExp);
7876 }
7877 if (xmlExpCheckCard(exp, sub) == 0) {
7878 #ifdef DEBUG_DERIV
7879 printf("sub generate longer sequences than exp : can't subsume\n");
7880 #endif
7881 return(forbiddenExp);
7882 }
7883 return(xmlExpExpDeriveInt(ctxt, exp, sub));
7884 }
7885
7886 /**
7887 * xmlExpSubsume:
7888 * @ctxt: the expressions context
7889 * @exp: the englobing expression
7890 * @sub: the subexpression
7891 *
7892 * Check whether @exp accepts all the languages accepted by @sub
7893 * the input being a subexpression.
7894 *
7895 * Returns 1 if true 0 if false and -1 in case of failure.
7896 */
7897 int
xmlExpSubsume(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub)7898 xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7899 xmlExpNodePtr tmp;
7900
7901 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7902 return(-1);
7903
7904 /*
7905 * TODO: speedup by checking the language of sub is a subset of the
7906 * language of exp
7907 */
7908 /*
7909 * O(1) speedups
7910 */
7911 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7912 #ifdef DEBUG_DERIV
7913 printf("Sub nillable and not exp : can't subsume\n");
7914 #endif
7915 return(0);
7916 }
7917 if (xmlExpCheckCard(exp, sub) == 0) {
7918 #ifdef DEBUG_DERIV
7919 printf("sub generate longer sequences than exp : can't subsume\n");
7920 #endif
7921 return(0);
7922 }
7923 tmp = xmlExpExpDeriveInt(ctxt, exp, sub);
7924 #ifdef DEBUG_DERIV
7925 printf("Result derivation :\n");
7926 PRINT_EXP(tmp);
7927 #endif
7928 if (tmp == NULL)
7929 return(-1);
7930 if (tmp == forbiddenExp)
7931 return(0);
7932 if (tmp == emptyExp)
7933 return(1);
7934 if ((tmp != NULL) && (IS_NILLABLE(tmp))) {
7935 xmlExpFree(ctxt, tmp);
7936 return(1);
7937 }
7938 xmlExpFree(ctxt, tmp);
7939 return(0);
7940 }
7941
7942 /************************************************************************
7943 * *
7944 * Parsing expression *
7945 * *
7946 ************************************************************************/
7947
7948 static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt);
7949
7950 #undef CUR
7951 #define CUR (*ctxt->cur)
7952 #undef NEXT
7953 #define NEXT ctxt->cur++;
7954 #undef IS_BLANK
7955 #define IS_BLANK(c) ((c == ' ') || (c == '\n') || (c == '\r') || (c == '\t'))
7956 #define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++;
7957
7958 static int
xmlExpParseNumber(xmlExpCtxtPtr ctxt)7959 xmlExpParseNumber(xmlExpCtxtPtr ctxt) {
7960 int ret = 0;
7961
7962 SKIP_BLANKS
7963 if (CUR == '*') {
7964 NEXT
7965 return(-1);
7966 }
7967 if ((CUR < '0') || (CUR > '9'))
7968 return(-1);
7969 while ((CUR >= '0') && (CUR <= '9')) {
7970 ret = ret * 10 + (CUR - '0');
7971 NEXT
7972 }
7973 return(ret);
7974 }
7975
7976 static xmlExpNodePtr
xmlExpParseOr(xmlExpCtxtPtr ctxt)7977 xmlExpParseOr(xmlExpCtxtPtr ctxt) {
7978 const char *base;
7979 xmlExpNodePtr ret;
7980 const xmlChar *val;
7981
7982 SKIP_BLANKS
7983 base = ctxt->cur;
7984 if (*ctxt->cur == '(') {
7985 NEXT
7986 ret = xmlExpParseExpr(ctxt);
7987 SKIP_BLANKS
7988 if (*ctxt->cur != ')') {
7989 fprintf(stderr, "unbalanced '(' : %s\n", base);
7990 xmlExpFree(ctxt, ret);
7991 return(NULL);
7992 }
7993 NEXT;
7994 SKIP_BLANKS
7995 goto parse_quantifier;
7996 }
7997 while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') &&
7998 (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') &&
7999 (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}'))
8000 NEXT;
8001 val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base);
8002 if (val == NULL)
8003 return(NULL);
8004 ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0);
8005 if (ret == NULL)
8006 return(NULL);
8007 SKIP_BLANKS
8008 parse_quantifier:
8009 if (CUR == '{') {
8010 int min, max;
8011
8012 NEXT
8013 min = xmlExpParseNumber(ctxt);
8014 if (min < 0) {
8015 xmlExpFree(ctxt, ret);
8016 return(NULL);
8017 }
8018 SKIP_BLANKS
8019 if (CUR == ',') {
8020 NEXT
8021 max = xmlExpParseNumber(ctxt);
8022 SKIP_BLANKS
8023 } else
8024 max = min;
8025 if (CUR != '}') {
8026 xmlExpFree(ctxt, ret);
8027 return(NULL);
8028 }
8029 NEXT
8030 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
8031 min, max);
8032 SKIP_BLANKS
8033 } else if (CUR == '?') {
8034 NEXT
8035 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
8036 0, 1);
8037 SKIP_BLANKS
8038 } else if (CUR == '+') {
8039 NEXT
8040 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
8041 1, -1);
8042 SKIP_BLANKS
8043 } else if (CUR == '*') {
8044 NEXT
8045 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
8046 0, -1);
8047 SKIP_BLANKS
8048 }
8049 return(ret);
8050 }
8051
8052
8053 static xmlExpNodePtr
xmlExpParseSeq(xmlExpCtxtPtr ctxt)8054 xmlExpParseSeq(xmlExpCtxtPtr ctxt) {
8055 xmlExpNodePtr ret, right;
8056
8057 ret = xmlExpParseOr(ctxt);
8058 SKIP_BLANKS
8059 while (CUR == '|') {
8060 NEXT
8061 right = xmlExpParseOr(ctxt);
8062 if (right == NULL) {
8063 xmlExpFree(ctxt, ret);
8064 return(NULL);
8065 }
8066 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0);
8067 if (ret == NULL)
8068 return(NULL);
8069 }
8070 return(ret);
8071 }
8072
8073 static xmlExpNodePtr
xmlExpParseExpr(xmlExpCtxtPtr ctxt)8074 xmlExpParseExpr(xmlExpCtxtPtr ctxt) {
8075 xmlExpNodePtr ret, right;
8076
8077 ret = xmlExpParseSeq(ctxt);
8078 SKIP_BLANKS
8079 while (CUR == ',') {
8080 NEXT
8081 right = xmlExpParseSeq(ctxt);
8082 if (right == NULL) {
8083 xmlExpFree(ctxt, ret);
8084 return(NULL);
8085 }
8086 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0);
8087 if (ret == NULL)
8088 return(NULL);
8089 }
8090 return(ret);
8091 }
8092
8093 /**
8094 * xmlExpParse:
8095 * @ctxt: the expressions context
8096 * @expr: the 0 terminated string
8097 *
8098 * Minimal parser for regexps, it understand the following constructs
8099 * - string terminals
8100 * - choice operator |
8101 * - sequence operator ,
8102 * - subexpressions (...)
8103 * - usual cardinality operators + * and ?
8104 * - finite sequences { min, max }
8105 * - infinite sequences { min, * }
8106 * There is minimal checkings made especially no checking on strings values
8107 *
8108 * Returns a new expression or NULL in case of failure
8109 */
8110 xmlExpNodePtr
xmlExpParse(xmlExpCtxtPtr ctxt,const char * expr)8111 xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) {
8112 xmlExpNodePtr ret;
8113
8114 ctxt->expr = expr;
8115 ctxt->cur = expr;
8116
8117 ret = xmlExpParseExpr(ctxt);
8118 SKIP_BLANKS
8119 if (*ctxt->cur != 0) {
8120 xmlExpFree(ctxt, ret);
8121 return(NULL);
8122 }
8123 return(ret);
8124 }
8125
8126 static void
xmlExpDumpInt(xmlBufferPtr buf,xmlExpNodePtr expr,int glob)8127 xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) {
8128 xmlExpNodePtr c;
8129
8130 if (expr == NULL) return;
8131 if (glob) xmlBufferWriteChar(buf, "(");
8132 switch (expr->type) {
8133 case XML_EXP_EMPTY:
8134 xmlBufferWriteChar(buf, "empty");
8135 break;
8136 case XML_EXP_FORBID:
8137 xmlBufferWriteChar(buf, "forbidden");
8138 break;
8139 case XML_EXP_ATOM:
8140 xmlBufferWriteCHAR(buf, expr->exp_str);
8141 break;
8142 case XML_EXP_SEQ:
8143 c = expr->exp_left;
8144 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8145 xmlExpDumpInt(buf, c, 1);
8146 else
8147 xmlExpDumpInt(buf, c, 0);
8148 xmlBufferWriteChar(buf, " , ");
8149 c = expr->exp_right;
8150 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8151 xmlExpDumpInt(buf, c, 1);
8152 else
8153 xmlExpDumpInt(buf, c, 0);
8154 break;
8155 case XML_EXP_OR:
8156 c = expr->exp_left;
8157 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8158 xmlExpDumpInt(buf, c, 1);
8159 else
8160 xmlExpDumpInt(buf, c, 0);
8161 xmlBufferWriteChar(buf, " | ");
8162 c = expr->exp_right;
8163 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8164 xmlExpDumpInt(buf, c, 1);
8165 else
8166 xmlExpDumpInt(buf, c, 0);
8167 break;
8168 case XML_EXP_COUNT: {
8169 char rep[40];
8170
8171 c = expr->exp_left;
8172 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8173 xmlExpDumpInt(buf, c, 1);
8174 else
8175 xmlExpDumpInt(buf, c, 0);
8176 if ((expr->exp_min == 0) && (expr->exp_max == 1)) {
8177 rep[0] = '?';
8178 rep[1] = 0;
8179 } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) {
8180 rep[0] = '*';
8181 rep[1] = 0;
8182 } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) {
8183 rep[0] = '+';
8184 rep[1] = 0;
8185 } else if (expr->exp_max == expr->exp_min) {
8186 snprintf(rep, 39, "{%d}", expr->exp_min);
8187 } else if (expr->exp_max < 0) {
8188 snprintf(rep, 39, "{%d,inf}", expr->exp_min);
8189 } else {
8190 snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max);
8191 }
8192 rep[39] = 0;
8193 xmlBufferWriteChar(buf, rep);
8194 break;
8195 }
8196 default:
8197 fprintf(stderr, "Error in tree\n");
8198 }
8199 if (glob)
8200 xmlBufferWriteChar(buf, ")");
8201 }
8202 /**
8203 * xmlExpDump:
8204 * @buf: a buffer to receive the output
8205 * @expr: the compiled expression
8206 *
8207 * Serialize the expression as compiled to the buffer
8208 */
8209 void
xmlExpDump(xmlBufferPtr buf,xmlExpNodePtr expr)8210 xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) {
8211 if ((buf == NULL) || (expr == NULL))
8212 return;
8213 xmlExpDumpInt(buf, expr, 0);
8214 }
8215
8216 /**
8217 * xmlExpMaxToken:
8218 * @expr: a compiled expression
8219 *
8220 * Indicate the maximum number of input a expression can accept
8221 *
8222 * Returns the maximum length or -1 in case of error
8223 */
8224 int
xmlExpMaxToken(xmlExpNodePtr expr)8225 xmlExpMaxToken(xmlExpNodePtr expr) {
8226 if (expr == NULL)
8227 return(-1);
8228 return(expr->c_max);
8229 }
8230
8231 /**
8232 * xmlExpCtxtNbNodes:
8233 * @ctxt: an expression context
8234 *
8235 * Debugging facility provides the number of allocated nodes at a that point
8236 *
8237 * Returns the number of nodes in use or -1 in case of error
8238 */
8239 int
xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt)8240 xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) {
8241 if (ctxt == NULL)
8242 return(-1);
8243 return(ctxt->nb_nodes);
8244 }
8245
8246 /**
8247 * xmlExpCtxtNbCons:
8248 * @ctxt: an expression context
8249 *
8250 * Debugging facility provides the number of allocated nodes over lifetime
8251 *
8252 * Returns the number of nodes ever allocated or -1 in case of error
8253 */
8254 int
xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt)8255 xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) {
8256 if (ctxt == NULL)
8257 return(-1);
8258 return(ctxt->nb_cons);
8259 }
8260
8261 #endif /* LIBXML_EXPR_ENABLED */
8262 #define bottom_xmlregexp
8263 #include "elfgcchack.h"
8264 #endif /* LIBXML_REGEXP_ENABLED */
8265