1 /*
2 * regexp.c: generic and extensible Regular Expression engine
3 *
4 * Basically designed with the purpose of compiling regexps for
5 * the variety of validation/schemas mechanisms now available in
6 * XML related specifications these include:
7 * - XML-1.0 DTD validation
8 * - XML Schemas structure part 1
9 * - XML Schemas Datatypes part 2 especially Appendix F
10 * - RELAX-NG/TREX i.e. the counter proposal
11 *
12 * See Copyright for the status of this software.
13 *
14 * Daniel Veillard <veillard@redhat.com>
15 */
16
17 #define IN_LIBXML
18 #include "libxml.h"
19
20 #ifdef LIBXML_REGEXP_ENABLED
21
22 #include <stdio.h>
23 #include <string.h>
24 #include <limits.h>
25
26 #include <libxml/tree.h>
27 #include <libxml/parserInternals.h>
28 #include <libxml/xmlregexp.h>
29 #include <libxml/xmlautomata.h>
30 #include <libxml/xmlunicode.h>
31
32 #include "private/error.h"
33 #include "private/memory.h"
34 #include "private/parser.h"
35 #include "private/regexp.h"
36
37 #ifndef SIZE_MAX
38 #define SIZE_MAX ((size_t) -1)
39 #endif
40
41 #define MAX_PUSH 10000000
42
43 #ifdef ERROR
44 #undef ERROR
45 #endif
46 #define ERROR(str) \
47 ctxt->error = XML_REGEXP_COMPILE_ERROR; \
48 xmlRegexpErrCompile(ctxt, str);
49 #define NEXT ctxt->cur++
50 #define CUR (*(ctxt->cur))
51 #define NXT(index) (ctxt->cur[index])
52
53 #define NEXTL(l) ctxt->cur += l;
54 #define XML_REG_STRING_SEPARATOR '|'
55 /*
56 * Need PREV to check on a '-' within a Character Group. May only be used
57 * when it's guaranteed that cur is not at the beginning of ctxt->string!
58 */
59 #define PREV (ctxt->cur[-1])
60
61 /************************************************************************
62 * *
63 * Datatypes and structures *
64 * *
65 ************************************************************************/
66
67 /*
68 * Note: the order of the enums below is significant, do not shuffle
69 */
70 typedef enum {
71 XML_REGEXP_EPSILON = 1,
72 XML_REGEXP_CHARVAL,
73 XML_REGEXP_RANGES,
74 XML_REGEXP_SUBREG, /* used for () sub regexps */
75 XML_REGEXP_STRING,
76 XML_REGEXP_ANYCHAR, /* . */
77 XML_REGEXP_ANYSPACE, /* \s */
78 XML_REGEXP_NOTSPACE, /* \S */
79 XML_REGEXP_INITNAME, /* \l */
80 XML_REGEXP_NOTINITNAME, /* \L */
81 XML_REGEXP_NAMECHAR, /* \c */
82 XML_REGEXP_NOTNAMECHAR, /* \C */
83 XML_REGEXP_DECIMAL, /* \d */
84 XML_REGEXP_NOTDECIMAL, /* \D */
85 XML_REGEXP_REALCHAR, /* \w */
86 XML_REGEXP_NOTREALCHAR, /* \W */
87 XML_REGEXP_LETTER = 100,
88 XML_REGEXP_LETTER_UPPERCASE,
89 XML_REGEXP_LETTER_LOWERCASE,
90 XML_REGEXP_LETTER_TITLECASE,
91 XML_REGEXP_LETTER_MODIFIER,
92 XML_REGEXP_LETTER_OTHERS,
93 XML_REGEXP_MARK,
94 XML_REGEXP_MARK_NONSPACING,
95 XML_REGEXP_MARK_SPACECOMBINING,
96 XML_REGEXP_MARK_ENCLOSING,
97 XML_REGEXP_NUMBER,
98 XML_REGEXP_NUMBER_DECIMAL,
99 XML_REGEXP_NUMBER_LETTER,
100 XML_REGEXP_NUMBER_OTHERS,
101 XML_REGEXP_PUNCT,
102 XML_REGEXP_PUNCT_CONNECTOR,
103 XML_REGEXP_PUNCT_DASH,
104 XML_REGEXP_PUNCT_OPEN,
105 XML_REGEXP_PUNCT_CLOSE,
106 XML_REGEXP_PUNCT_INITQUOTE,
107 XML_REGEXP_PUNCT_FINQUOTE,
108 XML_REGEXP_PUNCT_OTHERS,
109 XML_REGEXP_SEPAR,
110 XML_REGEXP_SEPAR_SPACE,
111 XML_REGEXP_SEPAR_LINE,
112 XML_REGEXP_SEPAR_PARA,
113 XML_REGEXP_SYMBOL,
114 XML_REGEXP_SYMBOL_MATH,
115 XML_REGEXP_SYMBOL_CURRENCY,
116 XML_REGEXP_SYMBOL_MODIFIER,
117 XML_REGEXP_SYMBOL_OTHERS,
118 XML_REGEXP_OTHER,
119 XML_REGEXP_OTHER_CONTROL,
120 XML_REGEXP_OTHER_FORMAT,
121 XML_REGEXP_OTHER_PRIVATE,
122 XML_REGEXP_OTHER_NA,
123 XML_REGEXP_BLOCK_NAME
124 } xmlRegAtomType;
125
126 typedef enum {
127 XML_REGEXP_QUANT_EPSILON = 1,
128 XML_REGEXP_QUANT_ONCE,
129 XML_REGEXP_QUANT_OPT,
130 XML_REGEXP_QUANT_MULT,
131 XML_REGEXP_QUANT_PLUS,
132 XML_REGEXP_QUANT_ONCEONLY,
133 XML_REGEXP_QUANT_ALL,
134 XML_REGEXP_QUANT_RANGE
135 } xmlRegQuantType;
136
137 typedef enum {
138 XML_REGEXP_START_STATE = 1,
139 XML_REGEXP_FINAL_STATE,
140 XML_REGEXP_TRANS_STATE,
141 XML_REGEXP_SINK_STATE,
142 XML_REGEXP_UNREACH_STATE
143 } xmlRegStateType;
144
145 typedef enum {
146 XML_REGEXP_MARK_NORMAL = 0,
147 XML_REGEXP_MARK_START,
148 XML_REGEXP_MARK_VISITED
149 } xmlRegMarkedType;
150
151 typedef struct _xmlRegRange xmlRegRange;
152 typedef xmlRegRange *xmlRegRangePtr;
153
154 struct _xmlRegRange {
155 int neg; /* 0 normal, 1 not, 2 exclude */
156 xmlRegAtomType type;
157 int start;
158 int end;
159 xmlChar *blockName;
160 };
161
162 typedef struct _xmlRegAtom xmlRegAtom;
163 typedef xmlRegAtom *xmlRegAtomPtr;
164
165 typedef struct _xmlAutomataState xmlRegState;
166 typedef xmlRegState *xmlRegStatePtr;
167
168 struct _xmlRegAtom {
169 int no;
170 xmlRegAtomType type;
171 xmlRegQuantType quant;
172 int min;
173 int max;
174
175 void *valuep;
176 void *valuep2;
177 int neg;
178 int codepoint;
179 xmlRegStatePtr start;
180 xmlRegStatePtr start0;
181 xmlRegStatePtr stop;
182 int maxRanges;
183 int nbRanges;
184 xmlRegRangePtr *ranges;
185 void *data;
186 };
187
188 typedef struct _xmlRegCounter xmlRegCounter;
189 typedef xmlRegCounter *xmlRegCounterPtr;
190
191 struct _xmlRegCounter {
192 int min;
193 int max;
194 };
195
196 typedef struct _xmlRegTrans xmlRegTrans;
197 typedef xmlRegTrans *xmlRegTransPtr;
198
199 struct _xmlRegTrans {
200 xmlRegAtomPtr atom;
201 int to;
202 int counter;
203 int count;
204 int nd;
205 };
206
207 struct _xmlAutomataState {
208 xmlRegStateType type;
209 xmlRegMarkedType mark;
210 xmlRegMarkedType markd;
211 xmlRegMarkedType reached;
212 int no;
213 int maxTrans;
214 int nbTrans;
215 xmlRegTrans *trans;
216 /* knowing states pointing to us can speed things up */
217 int maxTransTo;
218 int nbTransTo;
219 int *transTo;
220 };
221
222 typedef struct _xmlAutomata xmlRegParserCtxt;
223 typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;
224
225 #define AM_AUTOMATA_RNG 1
226
227 struct _xmlAutomata {
228 xmlChar *string;
229 xmlChar *cur;
230
231 int error;
232 int neg;
233
234 xmlRegStatePtr start;
235 xmlRegStatePtr end;
236 xmlRegStatePtr state;
237
238 xmlRegAtomPtr atom;
239
240 int maxAtoms;
241 int nbAtoms;
242 xmlRegAtomPtr *atoms;
243
244 int maxStates;
245 int nbStates;
246 xmlRegStatePtr *states;
247
248 int maxCounters;
249 int nbCounters;
250 xmlRegCounter *counters;
251
252 int determinist;
253 int negs;
254 int flags;
255
256 int depth;
257 };
258
259 struct _xmlRegexp {
260 xmlChar *string;
261 int nbStates;
262 xmlRegStatePtr *states;
263 int nbAtoms;
264 xmlRegAtomPtr *atoms;
265 int nbCounters;
266 xmlRegCounter *counters;
267 int determinist;
268 int flags;
269 /*
270 * That's the compact form for determinists automatas
271 */
272 int nbstates;
273 int *compact;
274 void **transdata;
275 int nbstrings;
276 xmlChar **stringMap;
277 };
278
279 typedef struct _xmlRegExecRollback xmlRegExecRollback;
280 typedef xmlRegExecRollback *xmlRegExecRollbackPtr;
281
282 struct _xmlRegExecRollback {
283 xmlRegStatePtr state;/* the current state */
284 int index; /* the index in the input stack */
285 int nextbranch; /* the next transition to explore in that state */
286 int *counts; /* save the automata state if it has some */
287 };
288
289 typedef struct _xmlRegInputToken xmlRegInputToken;
290 typedef xmlRegInputToken *xmlRegInputTokenPtr;
291
292 struct _xmlRegInputToken {
293 xmlChar *value;
294 void *data;
295 };
296
297 struct _xmlRegExecCtxt {
298 int status; /* execution status != 0 indicate an error */
299 int determinist; /* did we find an indeterministic behaviour */
300 xmlRegexpPtr comp; /* the compiled regexp */
301 xmlRegExecCallbacks callback;
302 void *data;
303
304 xmlRegStatePtr state;/* the current state */
305 int transno; /* the current transition on that state */
306 int transcount; /* the number of chars in char counted transitions */
307
308 /*
309 * A stack of rollback states
310 */
311 int maxRollbacks;
312 int nbRollbacks;
313 xmlRegExecRollback *rollbacks;
314
315 /*
316 * The state of the automata if any
317 */
318 int *counts;
319
320 /*
321 * The input stack
322 */
323 int inputStackMax;
324 int inputStackNr;
325 int index;
326 int *charStack;
327 const xmlChar *inputString; /* when operating on characters */
328 xmlRegInputTokenPtr inputStack;/* when operating on strings */
329
330 /*
331 * error handling
332 */
333 int errStateNo; /* the error state number */
334 xmlRegStatePtr errState; /* the error state */
335 xmlChar *errString; /* the string raising the error */
336 int *errCounts; /* counters at the error state */
337 int nbPush;
338 };
339
340 #define REGEXP_ALL_COUNTER 0x123456
341 #define REGEXP_ALL_LAX_COUNTER 0x123457
342
343 static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
344 static void xmlRegFreeState(xmlRegStatePtr state);
345 static void xmlRegFreeAtom(xmlRegAtomPtr atom);
346 static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr);
347 static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint);
348 static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint,
349 int neg, int start, int end, const xmlChar *blockName);
350
351 /************************************************************************
352 * *
353 * Regexp memory error handler *
354 * *
355 ************************************************************************/
356 /**
357 * xmlRegexpErrMemory:
358 * @extra: extra information
359 *
360 * Handle an out of memory condition
361 */
362 static void
xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt)363 xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt)
364 {
365 if (ctxt != NULL)
366 ctxt->error = XML_ERR_NO_MEMORY;
367
368 xmlRaiseMemoryError(NULL, NULL, NULL, XML_FROM_REGEXP, NULL);
369 }
370
371 /**
372 * xmlRegexpErrCompile:
373 * @extra: extra information
374 *
375 * Handle a compilation failure
376 */
377 static void
xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt,const char * extra)378 xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra)
379 {
380 const char *regexp = NULL;
381 int idx = 0;
382 int res;
383
384 if (ctxt != NULL) {
385 regexp = (const char *) ctxt->string;
386 idx = ctxt->cur - ctxt->string;
387 ctxt->error = XML_REGEXP_COMPILE_ERROR;
388 }
389
390 res = xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
391 XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL,
392 NULL, 0, extra, regexp, NULL, idx, 0,
393 "failed to compile: %s\n", extra);
394 if (res < 0)
395 xmlRegexpErrMemory(ctxt);
396 }
397
398 /************************************************************************
399 * *
400 * Allocation/Deallocation *
401 * *
402 ************************************************************************/
403
404 static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
405
406 /**
407 * xmlRegCalloc2:
408 * @dim1: size of first dimension
409 * @dim2: size of second dimension
410 * @elemSize: size of element
411 *
412 * Allocate a two-dimensional array and set all elements to zero.
413 *
414 * Returns the new array or NULL in case of error.
415 */
416 static void*
xmlRegCalloc2(size_t dim1,size_t dim2,size_t elemSize)417 xmlRegCalloc2(size_t dim1, size_t dim2, size_t elemSize) {
418 size_t totalSize;
419 void *ret;
420
421 /* Check for overflow */
422 if ((dim2 == 0) || (elemSize == 0) ||
423 (dim1 > SIZE_MAX / dim2 / elemSize))
424 return (NULL);
425 totalSize = dim1 * dim2 * elemSize;
426 ret = xmlMalloc(totalSize);
427 if (ret != NULL)
428 memset(ret, 0, totalSize);
429 return (ret);
430 }
431
432 /**
433 * xmlRegEpxFromParse:
434 * @ctxt: the parser context used to build it
435 *
436 * Allocate a new regexp and fill it with the result from the parser
437 *
438 * Returns the new regexp or NULL in case of error
439 */
440 static xmlRegexpPtr
xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt)441 xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
442 xmlRegexpPtr ret;
443
444 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
445 if (ret == NULL) {
446 xmlRegexpErrMemory(ctxt);
447 return(NULL);
448 }
449 memset(ret, 0, sizeof(xmlRegexp));
450 ret->string = ctxt->string;
451 ret->nbStates = ctxt->nbStates;
452 ret->states = ctxt->states;
453 ret->nbAtoms = ctxt->nbAtoms;
454 ret->atoms = ctxt->atoms;
455 ret->nbCounters = ctxt->nbCounters;
456 ret->counters = ctxt->counters;
457 ret->determinist = ctxt->determinist;
458 ret->flags = ctxt->flags;
459 if (ret->determinist == -1) {
460 if (xmlRegexpIsDeterminist(ret) < 0) {
461 xmlRegexpErrMemory(ctxt);
462 xmlFree(ret);
463 return(NULL);
464 }
465 }
466
467 if ((ret->determinist != 0) &&
468 (ret->nbCounters == 0) &&
469 (ctxt->negs == 0) &&
470 (ret->atoms != NULL) &&
471 (ret->atoms[0] != NULL) &&
472 (ret->atoms[0]->type == XML_REGEXP_STRING)) {
473 int i, j, nbstates = 0, nbatoms = 0;
474 int *stateRemap;
475 int *stringRemap;
476 int *transitions;
477 void **transdata;
478 xmlChar **stringMap;
479 xmlChar *value;
480
481 /*
482 * Switch to a compact representation
483 * 1/ counting the effective number of states left
484 * 2/ counting the unique number of atoms, and check that
485 * they are all of the string type
486 * 3/ build a table state x atom for the transitions
487 */
488
489 stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
490 if (stateRemap == NULL) {
491 xmlRegexpErrMemory(ctxt);
492 xmlFree(ret);
493 return(NULL);
494 }
495 for (i = 0;i < ret->nbStates;i++) {
496 if (ret->states[i] != NULL) {
497 stateRemap[i] = nbstates;
498 nbstates++;
499 } else {
500 stateRemap[i] = -1;
501 }
502 }
503 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
504 if (stringMap == NULL) {
505 xmlRegexpErrMemory(ctxt);
506 xmlFree(stateRemap);
507 xmlFree(ret);
508 return(NULL);
509 }
510 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
511 if (stringRemap == NULL) {
512 xmlRegexpErrMemory(ctxt);
513 xmlFree(stringMap);
514 xmlFree(stateRemap);
515 xmlFree(ret);
516 return(NULL);
517 }
518 for (i = 0;i < ret->nbAtoms;i++) {
519 if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
520 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
521 value = ret->atoms[i]->valuep;
522 for (j = 0;j < nbatoms;j++) {
523 if (xmlStrEqual(stringMap[j], value)) {
524 stringRemap[i] = j;
525 break;
526 }
527 }
528 if (j >= nbatoms) {
529 stringRemap[i] = nbatoms;
530 stringMap[nbatoms] = xmlStrdup(value);
531 if (stringMap[nbatoms] == NULL) {
532 for (i = 0;i < nbatoms;i++)
533 xmlFree(stringMap[i]);
534 xmlFree(stringRemap);
535 xmlFree(stringMap);
536 xmlFree(stateRemap);
537 xmlFree(ret);
538 return(NULL);
539 }
540 nbatoms++;
541 }
542 } else {
543 xmlFree(stateRemap);
544 xmlFree(stringRemap);
545 for (i = 0;i < nbatoms;i++)
546 xmlFree(stringMap[i]);
547 xmlFree(stringMap);
548 xmlFree(ret);
549 return(NULL);
550 }
551 }
552 transitions = (int *) xmlRegCalloc2(nbstates + 1, nbatoms + 1,
553 sizeof(int));
554 if (transitions == NULL) {
555 xmlFree(stateRemap);
556 xmlFree(stringRemap);
557 for (i = 0;i < nbatoms;i++)
558 xmlFree(stringMap[i]);
559 xmlFree(stringMap);
560 xmlFree(ret);
561 return(NULL);
562 }
563
564 /*
565 * Allocate the transition table. The first entry for each
566 * state corresponds to the state type.
567 */
568 transdata = NULL;
569
570 for (i = 0;i < ret->nbStates;i++) {
571 int stateno, atomno, targetno, prev;
572 xmlRegStatePtr state;
573 xmlRegTransPtr trans;
574
575 stateno = stateRemap[i];
576 if (stateno == -1)
577 continue;
578 state = ret->states[i];
579
580 transitions[stateno * (nbatoms + 1)] = state->type;
581
582 for (j = 0;j < state->nbTrans;j++) {
583 trans = &(state->trans[j]);
584 if ((trans->to < 0) || (trans->atom == NULL))
585 continue;
586 atomno = stringRemap[trans->atom->no];
587 if ((trans->atom->data != NULL) && (transdata == NULL)) {
588 transdata = (void **) xmlRegCalloc2(nbstates, nbatoms,
589 sizeof(void *));
590 if (transdata == NULL) {
591 xmlRegexpErrMemory(ctxt);
592 break;
593 }
594 }
595 targetno = stateRemap[trans->to];
596 /*
597 * if the same atom can generate transitions to 2 different
598 * states then it means the automata is not deterministic and
599 * the compact form can't be used !
600 */
601 prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
602 if (prev != 0) {
603 if (prev != targetno + 1) {
604 ret->determinist = 0;
605 if (transdata != NULL)
606 xmlFree(transdata);
607 xmlFree(transitions);
608 xmlFree(stateRemap);
609 xmlFree(stringRemap);
610 for (i = 0;i < nbatoms;i++)
611 xmlFree(stringMap[i]);
612 xmlFree(stringMap);
613 goto not_determ;
614 }
615 } else {
616 transitions[stateno * (nbatoms + 1) + atomno + 1] =
617 targetno + 1; /* to avoid 0 */
618 if (transdata != NULL)
619 transdata[stateno * nbatoms + atomno] =
620 trans->atom->data;
621 }
622 }
623 }
624 ret->determinist = 1;
625 /*
626 * Cleanup of the old data
627 */
628 if (ret->states != NULL) {
629 for (i = 0;i < ret->nbStates;i++)
630 xmlRegFreeState(ret->states[i]);
631 xmlFree(ret->states);
632 }
633 ret->states = NULL;
634 ret->nbStates = 0;
635 if (ret->atoms != NULL) {
636 for (i = 0;i < ret->nbAtoms;i++)
637 xmlRegFreeAtom(ret->atoms[i]);
638 xmlFree(ret->atoms);
639 }
640 ret->atoms = NULL;
641 ret->nbAtoms = 0;
642
643 ret->compact = transitions;
644 ret->transdata = transdata;
645 ret->stringMap = stringMap;
646 ret->nbstrings = nbatoms;
647 ret->nbstates = nbstates;
648 xmlFree(stateRemap);
649 xmlFree(stringRemap);
650 }
651 not_determ:
652 ctxt->string = NULL;
653 ctxt->nbStates = 0;
654 ctxt->states = NULL;
655 ctxt->nbAtoms = 0;
656 ctxt->atoms = NULL;
657 ctxt->nbCounters = 0;
658 ctxt->counters = NULL;
659 return(ret);
660 }
661
662 /**
663 * xmlRegNewParserCtxt:
664 * @string: the string to parse
665 *
666 * Allocate a new regexp parser context
667 *
668 * Returns the new context or NULL in case of error
669 */
670 static xmlRegParserCtxtPtr
xmlRegNewParserCtxt(const xmlChar * string)671 xmlRegNewParserCtxt(const xmlChar *string) {
672 xmlRegParserCtxtPtr ret;
673
674 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
675 if (ret == NULL)
676 return(NULL);
677 memset(ret, 0, sizeof(xmlRegParserCtxt));
678 if (string != NULL) {
679 ret->string = xmlStrdup(string);
680 if (ret->string == NULL) {
681 xmlFree(ret);
682 return(NULL);
683 }
684 }
685 ret->cur = ret->string;
686 ret->neg = 0;
687 ret->negs = 0;
688 ret->error = 0;
689 ret->determinist = -1;
690 return(ret);
691 }
692
693 /**
694 * xmlRegNewRange:
695 * @ctxt: the regexp parser context
696 * @neg: is that negative
697 * @type: the type of range
698 * @start: the start codepoint
699 * @end: the end codepoint
700 *
701 * Allocate a new regexp range
702 *
703 * Returns the new range or NULL in case of error
704 */
705 static xmlRegRangePtr
xmlRegNewRange(xmlRegParserCtxtPtr ctxt,int neg,xmlRegAtomType type,int start,int end)706 xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
707 int neg, xmlRegAtomType type, int start, int end) {
708 xmlRegRangePtr ret;
709
710 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
711 if (ret == NULL) {
712 xmlRegexpErrMemory(ctxt);
713 return(NULL);
714 }
715 ret->neg = neg;
716 ret->type = type;
717 ret->start = start;
718 ret->end = end;
719 return(ret);
720 }
721
722 /**
723 * xmlRegFreeRange:
724 * @range: the regexp range
725 *
726 * Free a regexp range
727 */
728 static void
xmlRegFreeRange(xmlRegRangePtr range)729 xmlRegFreeRange(xmlRegRangePtr range) {
730 if (range == NULL)
731 return;
732
733 if (range->blockName != NULL)
734 xmlFree(range->blockName);
735 xmlFree(range);
736 }
737
738 /**
739 * xmlRegCopyRange:
740 * @range: the regexp range
741 *
742 * Copy a regexp range
743 *
744 * Returns the new copy or NULL in case of error.
745 */
746 static xmlRegRangePtr
xmlRegCopyRange(xmlRegParserCtxtPtr ctxt,xmlRegRangePtr range)747 xmlRegCopyRange(xmlRegParserCtxtPtr ctxt, xmlRegRangePtr range) {
748 xmlRegRangePtr ret;
749
750 if (range == NULL)
751 return(NULL);
752
753 ret = xmlRegNewRange(ctxt, range->neg, range->type, range->start,
754 range->end);
755 if (ret == NULL)
756 return(NULL);
757 if (range->blockName != NULL) {
758 ret->blockName = xmlStrdup(range->blockName);
759 if (ret->blockName == NULL) {
760 xmlRegexpErrMemory(ctxt);
761 xmlRegFreeRange(ret);
762 return(NULL);
763 }
764 }
765 return(ret);
766 }
767
768 /**
769 * xmlRegNewAtom:
770 * @ctxt: the regexp parser context
771 * @type: the type of atom
772 *
773 * Allocate a new atom
774 *
775 * Returns the new atom or NULL in case of error
776 */
777 static xmlRegAtomPtr
xmlRegNewAtom(xmlRegParserCtxtPtr ctxt,xmlRegAtomType type)778 xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
779 xmlRegAtomPtr ret;
780
781 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
782 if (ret == NULL) {
783 xmlRegexpErrMemory(ctxt);
784 return(NULL);
785 }
786 memset(ret, 0, sizeof(xmlRegAtom));
787 ret->type = type;
788 ret->quant = XML_REGEXP_QUANT_ONCE;
789 ret->min = 0;
790 ret->max = 0;
791 return(ret);
792 }
793
794 /**
795 * xmlRegFreeAtom:
796 * @atom: the regexp atom
797 *
798 * Free a regexp atom
799 */
800 static void
xmlRegFreeAtom(xmlRegAtomPtr atom)801 xmlRegFreeAtom(xmlRegAtomPtr atom) {
802 int i;
803
804 if (atom == NULL)
805 return;
806
807 for (i = 0;i < atom->nbRanges;i++)
808 xmlRegFreeRange(atom->ranges[i]);
809 if (atom->ranges != NULL)
810 xmlFree(atom->ranges);
811 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL))
812 xmlFree(atom->valuep);
813 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL))
814 xmlFree(atom->valuep2);
815 if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL))
816 xmlFree(atom->valuep);
817 xmlFree(atom);
818 }
819
820 /**
821 * xmlRegCopyAtom:
822 * @ctxt: the regexp parser context
823 * @atom: the original atom
824 *
825 * Allocate a new regexp range
826 *
827 * Returns the new atom or NULL in case of error
828 */
829 static xmlRegAtomPtr
xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt,xmlRegAtomPtr atom)830 xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
831 xmlRegAtomPtr ret;
832
833 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
834 if (ret == NULL) {
835 xmlRegexpErrMemory(ctxt);
836 return(NULL);
837 }
838 memset(ret, 0, sizeof(xmlRegAtom));
839 ret->type = atom->type;
840 ret->quant = atom->quant;
841 ret->min = atom->min;
842 ret->max = atom->max;
843 if (atom->nbRanges > 0) {
844 int i;
845
846 ret->ranges = (xmlRegRangePtr *) xmlMalloc(sizeof(xmlRegRangePtr) *
847 atom->nbRanges);
848 if (ret->ranges == NULL) {
849 xmlRegexpErrMemory(ctxt);
850 goto error;
851 }
852 for (i = 0;i < atom->nbRanges;i++) {
853 ret->ranges[i] = xmlRegCopyRange(ctxt, atom->ranges[i]);
854 if (ret->ranges[i] == NULL)
855 goto error;
856 ret->nbRanges = i + 1;
857 }
858 }
859 return(ret);
860
861 error:
862 xmlRegFreeAtom(ret);
863 return(NULL);
864 }
865
866 static xmlRegStatePtr
xmlRegNewState(xmlRegParserCtxtPtr ctxt)867 xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
868 xmlRegStatePtr ret;
869
870 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
871 if (ret == NULL) {
872 xmlRegexpErrMemory(ctxt);
873 return(NULL);
874 }
875 memset(ret, 0, sizeof(xmlRegState));
876 ret->type = XML_REGEXP_TRANS_STATE;
877 ret->mark = XML_REGEXP_MARK_NORMAL;
878 return(ret);
879 }
880
881 /**
882 * xmlRegFreeState:
883 * @state: the regexp state
884 *
885 * Free a regexp state
886 */
887 static void
xmlRegFreeState(xmlRegStatePtr state)888 xmlRegFreeState(xmlRegStatePtr state) {
889 if (state == NULL)
890 return;
891
892 if (state->trans != NULL)
893 xmlFree(state->trans);
894 if (state->transTo != NULL)
895 xmlFree(state->transTo);
896 xmlFree(state);
897 }
898
899 /**
900 * xmlRegFreeParserCtxt:
901 * @ctxt: the regexp parser context
902 *
903 * Free a regexp parser context
904 */
905 static void
xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt)906 xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
907 int i;
908 if (ctxt == NULL)
909 return;
910
911 if (ctxt->string != NULL)
912 xmlFree(ctxt->string);
913 if (ctxt->states != NULL) {
914 for (i = 0;i < ctxt->nbStates;i++)
915 xmlRegFreeState(ctxt->states[i]);
916 xmlFree(ctxt->states);
917 }
918 if (ctxt->atoms != NULL) {
919 for (i = 0;i < ctxt->nbAtoms;i++)
920 xmlRegFreeAtom(ctxt->atoms[i]);
921 xmlFree(ctxt->atoms);
922 }
923 if (ctxt->counters != NULL)
924 xmlFree(ctxt->counters);
925 xmlFree(ctxt);
926 }
927
928 /************************************************************************
929 * *
930 * Display of Data structures *
931 * *
932 ************************************************************************/
933
934 static void
xmlRegPrintAtomType(FILE * output,xmlRegAtomType type)935 xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
936 switch (type) {
937 case XML_REGEXP_EPSILON:
938 fprintf(output, "epsilon "); break;
939 case XML_REGEXP_CHARVAL:
940 fprintf(output, "charval "); break;
941 case XML_REGEXP_RANGES:
942 fprintf(output, "ranges "); break;
943 case XML_REGEXP_SUBREG:
944 fprintf(output, "subexpr "); break;
945 case XML_REGEXP_STRING:
946 fprintf(output, "string "); break;
947 case XML_REGEXP_ANYCHAR:
948 fprintf(output, "anychar "); break;
949 case XML_REGEXP_ANYSPACE:
950 fprintf(output, "anyspace "); break;
951 case XML_REGEXP_NOTSPACE:
952 fprintf(output, "notspace "); break;
953 case XML_REGEXP_INITNAME:
954 fprintf(output, "initname "); break;
955 case XML_REGEXP_NOTINITNAME:
956 fprintf(output, "notinitname "); break;
957 case XML_REGEXP_NAMECHAR:
958 fprintf(output, "namechar "); break;
959 case XML_REGEXP_NOTNAMECHAR:
960 fprintf(output, "notnamechar "); break;
961 case XML_REGEXP_DECIMAL:
962 fprintf(output, "decimal "); break;
963 case XML_REGEXP_NOTDECIMAL:
964 fprintf(output, "notdecimal "); break;
965 case XML_REGEXP_REALCHAR:
966 fprintf(output, "realchar "); break;
967 case XML_REGEXP_NOTREALCHAR:
968 fprintf(output, "notrealchar "); break;
969 case XML_REGEXP_LETTER:
970 fprintf(output, "LETTER "); break;
971 case XML_REGEXP_LETTER_UPPERCASE:
972 fprintf(output, "LETTER_UPPERCASE "); break;
973 case XML_REGEXP_LETTER_LOWERCASE:
974 fprintf(output, "LETTER_LOWERCASE "); break;
975 case XML_REGEXP_LETTER_TITLECASE:
976 fprintf(output, "LETTER_TITLECASE "); break;
977 case XML_REGEXP_LETTER_MODIFIER:
978 fprintf(output, "LETTER_MODIFIER "); break;
979 case XML_REGEXP_LETTER_OTHERS:
980 fprintf(output, "LETTER_OTHERS "); break;
981 case XML_REGEXP_MARK:
982 fprintf(output, "MARK "); break;
983 case XML_REGEXP_MARK_NONSPACING:
984 fprintf(output, "MARK_NONSPACING "); break;
985 case XML_REGEXP_MARK_SPACECOMBINING:
986 fprintf(output, "MARK_SPACECOMBINING "); break;
987 case XML_REGEXP_MARK_ENCLOSING:
988 fprintf(output, "MARK_ENCLOSING "); break;
989 case XML_REGEXP_NUMBER:
990 fprintf(output, "NUMBER "); break;
991 case XML_REGEXP_NUMBER_DECIMAL:
992 fprintf(output, "NUMBER_DECIMAL "); break;
993 case XML_REGEXP_NUMBER_LETTER:
994 fprintf(output, "NUMBER_LETTER "); break;
995 case XML_REGEXP_NUMBER_OTHERS:
996 fprintf(output, "NUMBER_OTHERS "); break;
997 case XML_REGEXP_PUNCT:
998 fprintf(output, "PUNCT "); break;
999 case XML_REGEXP_PUNCT_CONNECTOR:
1000 fprintf(output, "PUNCT_CONNECTOR "); break;
1001 case XML_REGEXP_PUNCT_DASH:
1002 fprintf(output, "PUNCT_DASH "); break;
1003 case XML_REGEXP_PUNCT_OPEN:
1004 fprintf(output, "PUNCT_OPEN "); break;
1005 case XML_REGEXP_PUNCT_CLOSE:
1006 fprintf(output, "PUNCT_CLOSE "); break;
1007 case XML_REGEXP_PUNCT_INITQUOTE:
1008 fprintf(output, "PUNCT_INITQUOTE "); break;
1009 case XML_REGEXP_PUNCT_FINQUOTE:
1010 fprintf(output, "PUNCT_FINQUOTE "); break;
1011 case XML_REGEXP_PUNCT_OTHERS:
1012 fprintf(output, "PUNCT_OTHERS "); break;
1013 case XML_REGEXP_SEPAR:
1014 fprintf(output, "SEPAR "); break;
1015 case XML_REGEXP_SEPAR_SPACE:
1016 fprintf(output, "SEPAR_SPACE "); break;
1017 case XML_REGEXP_SEPAR_LINE:
1018 fprintf(output, "SEPAR_LINE "); break;
1019 case XML_REGEXP_SEPAR_PARA:
1020 fprintf(output, "SEPAR_PARA "); break;
1021 case XML_REGEXP_SYMBOL:
1022 fprintf(output, "SYMBOL "); break;
1023 case XML_REGEXP_SYMBOL_MATH:
1024 fprintf(output, "SYMBOL_MATH "); break;
1025 case XML_REGEXP_SYMBOL_CURRENCY:
1026 fprintf(output, "SYMBOL_CURRENCY "); break;
1027 case XML_REGEXP_SYMBOL_MODIFIER:
1028 fprintf(output, "SYMBOL_MODIFIER "); break;
1029 case XML_REGEXP_SYMBOL_OTHERS:
1030 fprintf(output, "SYMBOL_OTHERS "); break;
1031 case XML_REGEXP_OTHER:
1032 fprintf(output, "OTHER "); break;
1033 case XML_REGEXP_OTHER_CONTROL:
1034 fprintf(output, "OTHER_CONTROL "); break;
1035 case XML_REGEXP_OTHER_FORMAT:
1036 fprintf(output, "OTHER_FORMAT "); break;
1037 case XML_REGEXP_OTHER_PRIVATE:
1038 fprintf(output, "OTHER_PRIVATE "); break;
1039 case XML_REGEXP_OTHER_NA:
1040 fprintf(output, "OTHER_NA "); break;
1041 case XML_REGEXP_BLOCK_NAME:
1042 fprintf(output, "BLOCK "); break;
1043 }
1044 }
1045
1046 static void
xmlRegPrintQuantType(FILE * output,xmlRegQuantType type)1047 xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
1048 switch (type) {
1049 case XML_REGEXP_QUANT_EPSILON:
1050 fprintf(output, "epsilon "); break;
1051 case XML_REGEXP_QUANT_ONCE:
1052 fprintf(output, "once "); break;
1053 case XML_REGEXP_QUANT_OPT:
1054 fprintf(output, "? "); break;
1055 case XML_REGEXP_QUANT_MULT:
1056 fprintf(output, "* "); break;
1057 case XML_REGEXP_QUANT_PLUS:
1058 fprintf(output, "+ "); break;
1059 case XML_REGEXP_QUANT_RANGE:
1060 fprintf(output, "range "); break;
1061 case XML_REGEXP_QUANT_ONCEONLY:
1062 fprintf(output, "onceonly "); break;
1063 case XML_REGEXP_QUANT_ALL:
1064 fprintf(output, "all "); break;
1065 }
1066 }
1067 static void
xmlRegPrintRange(FILE * output,xmlRegRangePtr range)1068 xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
1069 fprintf(output, " range: ");
1070 if (range->neg)
1071 fprintf(output, "negative ");
1072 xmlRegPrintAtomType(output, range->type);
1073 fprintf(output, "%c - %c\n", range->start, range->end);
1074 }
1075
1076 static void
xmlRegPrintAtom(FILE * output,xmlRegAtomPtr atom)1077 xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
1078 fprintf(output, " atom: ");
1079 if (atom == NULL) {
1080 fprintf(output, "NULL\n");
1081 return;
1082 }
1083 if (atom->neg)
1084 fprintf(output, "not ");
1085 xmlRegPrintAtomType(output, atom->type);
1086 xmlRegPrintQuantType(output, atom->quant);
1087 if (atom->quant == XML_REGEXP_QUANT_RANGE)
1088 fprintf(output, "%d-%d ", atom->min, atom->max);
1089 if (atom->type == XML_REGEXP_STRING)
1090 fprintf(output, "'%s' ", (char *) atom->valuep);
1091 if (atom->type == XML_REGEXP_CHARVAL)
1092 fprintf(output, "char %c\n", atom->codepoint);
1093 else if (atom->type == XML_REGEXP_RANGES) {
1094 int i;
1095 fprintf(output, "%d entries\n", atom->nbRanges);
1096 for (i = 0; i < atom->nbRanges;i++)
1097 xmlRegPrintRange(output, atom->ranges[i]);
1098 } else if (atom->type == XML_REGEXP_SUBREG) {
1099 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
1100 } else {
1101 fprintf(output, "\n");
1102 }
1103 }
1104
1105 static void
xmlRegPrintTrans(FILE * output,xmlRegTransPtr trans)1106 xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
1107 fprintf(output, " trans: ");
1108 if (trans == NULL) {
1109 fprintf(output, "NULL\n");
1110 return;
1111 }
1112 if (trans->to < 0) {
1113 fprintf(output, "removed\n");
1114 return;
1115 }
1116 if (trans->nd != 0) {
1117 if (trans->nd == 2)
1118 fprintf(output, "last not determinist, ");
1119 else
1120 fprintf(output, "not determinist, ");
1121 }
1122 if (trans->counter >= 0) {
1123 fprintf(output, "counted %d, ", trans->counter);
1124 }
1125 if (trans->count == REGEXP_ALL_COUNTER) {
1126 fprintf(output, "all transition, ");
1127 } else if (trans->count >= 0) {
1128 fprintf(output, "count based %d, ", trans->count);
1129 }
1130 if (trans->atom == NULL) {
1131 fprintf(output, "epsilon to %d\n", trans->to);
1132 return;
1133 }
1134 if (trans->atom->type == XML_REGEXP_CHARVAL)
1135 fprintf(output, "char %c ", trans->atom->codepoint);
1136 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
1137 }
1138
1139 static void
xmlRegPrintState(FILE * output,xmlRegStatePtr state)1140 xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
1141 int i;
1142
1143 fprintf(output, " state: ");
1144 if (state == NULL) {
1145 fprintf(output, "NULL\n");
1146 return;
1147 }
1148 if (state->type == XML_REGEXP_START_STATE)
1149 fprintf(output, "START ");
1150 if (state->type == XML_REGEXP_FINAL_STATE)
1151 fprintf(output, "FINAL ");
1152
1153 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
1154 for (i = 0;i < state->nbTrans; i++) {
1155 xmlRegPrintTrans(output, &(state->trans[i]));
1156 }
1157 }
1158
1159 /************************************************************************
1160 * *
1161 * Finite Automata structures manipulations *
1162 * *
1163 ************************************************************************/
1164
1165 static xmlRegRangePtr
xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt,xmlRegAtomPtr atom,int neg,xmlRegAtomType type,int start,int end,xmlChar * blockName)1166 xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
1167 int neg, xmlRegAtomType type, int start, int end,
1168 xmlChar *blockName) {
1169 xmlRegRangePtr range;
1170
1171 if (atom == NULL) {
1172 ERROR("add range: atom is NULL");
1173 return(NULL);
1174 }
1175 if (atom->type != XML_REGEXP_RANGES) {
1176 ERROR("add range: atom is not ranges");
1177 return(NULL);
1178 }
1179 if (atom->nbRanges >= atom->maxRanges) {
1180 xmlRegRangePtr *tmp;
1181 int newSize;
1182
1183 newSize = xmlGrowCapacity(atom->maxRanges, sizeof(tmp[0]),
1184 4, XML_MAX_ITEMS);
1185 if (newSize < 0) {
1186 xmlRegexpErrMemory(ctxt);
1187 return(NULL);
1188 }
1189 tmp = xmlRealloc(atom->ranges, newSize * sizeof(tmp[0]));
1190 if (tmp == NULL) {
1191 xmlRegexpErrMemory(ctxt);
1192 return(NULL);
1193 }
1194 atom->ranges = tmp;
1195 atom->maxRanges = newSize;
1196 }
1197 range = xmlRegNewRange(ctxt, neg, type, start, end);
1198 if (range == NULL)
1199 return(NULL);
1200 range->blockName = blockName;
1201 atom->ranges[atom->nbRanges++] = range;
1202
1203 return(range);
1204 }
1205
1206 static int
xmlRegGetCounter(xmlRegParserCtxtPtr ctxt)1207 xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
1208 if (ctxt->nbCounters >= ctxt->maxCounters) {
1209 xmlRegCounter *tmp;
1210 int newSize;
1211
1212 newSize = xmlGrowCapacity(ctxt->maxCounters, sizeof(tmp[0]),
1213 4, XML_MAX_ITEMS);
1214 if (newSize < 0) {
1215 xmlRegexpErrMemory(ctxt);
1216 return(-1);
1217 }
1218 tmp = xmlRealloc(ctxt->counters, newSize * sizeof(tmp[0]));
1219 if (tmp == NULL) {
1220 xmlRegexpErrMemory(ctxt);
1221 return(-1);
1222 }
1223 ctxt->counters = tmp;
1224 ctxt->maxCounters = newSize;
1225 }
1226 ctxt->counters[ctxt->nbCounters].min = -1;
1227 ctxt->counters[ctxt->nbCounters].max = -1;
1228 return(ctxt->nbCounters++);
1229 }
1230
1231 static int
xmlRegAtomPush(xmlRegParserCtxtPtr ctxt,xmlRegAtomPtr atom)1232 xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
1233 if (atom == NULL) {
1234 ERROR("atom push: atom is NULL");
1235 return(-1);
1236 }
1237 if (ctxt->nbAtoms >= ctxt->maxAtoms) {
1238 xmlRegAtomPtr *tmp;
1239 int newSize;
1240
1241 newSize = xmlGrowCapacity(ctxt->maxAtoms, sizeof(tmp[0]),
1242 4, XML_MAX_ITEMS);
1243 if (newSize < 0) {
1244 xmlRegexpErrMemory(ctxt);
1245 return(-1);
1246 }
1247 tmp = xmlRealloc(ctxt->atoms, newSize * sizeof(tmp[0]));
1248 if (tmp == NULL) {
1249 xmlRegexpErrMemory(ctxt);
1250 return(-1);
1251 }
1252 ctxt->atoms = tmp;
1253 ctxt->maxAtoms = newSize;
1254 }
1255 atom->no = ctxt->nbAtoms;
1256 ctxt->atoms[ctxt->nbAtoms++] = atom;
1257 return(0);
1258 }
1259
1260 static void
xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr target,int from)1261 xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target,
1262 int from) {
1263 if (target->nbTransTo >= target->maxTransTo) {
1264 int *tmp;
1265 int newSize;
1266
1267 newSize = xmlGrowCapacity(target->maxTransTo, sizeof(tmp[0]),
1268 8, XML_MAX_ITEMS);
1269 if (newSize < 0) {
1270 xmlRegexpErrMemory(ctxt);
1271 return;
1272 }
1273 tmp = xmlRealloc(target->transTo, newSize * sizeof(tmp[0]));
1274 if (tmp == NULL) {
1275 xmlRegexpErrMemory(ctxt);
1276 return;
1277 }
1278 target->transTo = tmp;
1279 target->maxTransTo = newSize;
1280 }
1281 target->transTo[target->nbTransTo] = from;
1282 target->nbTransTo++;
1283 }
1284
1285 static void
xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr state,xmlRegAtomPtr atom,xmlRegStatePtr target,int counter,int count)1286 xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1287 xmlRegAtomPtr atom, xmlRegStatePtr target,
1288 int counter, int count) {
1289
1290 int nrtrans;
1291
1292 if (state == NULL) {
1293 ERROR("add state: state is NULL");
1294 return;
1295 }
1296 if (target == NULL) {
1297 ERROR("add state: target is NULL");
1298 return;
1299 }
1300 /*
1301 * Other routines follow the philosophy 'When in doubt, add a transition'
1302 * so we check here whether such a transition is already present and, if
1303 * so, silently ignore this request.
1304 */
1305
1306 for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) {
1307 xmlRegTransPtr trans = &(state->trans[nrtrans]);
1308 if ((trans->atom == atom) &&
1309 (trans->to == target->no) &&
1310 (trans->counter == counter) &&
1311 (trans->count == count)) {
1312 return;
1313 }
1314 }
1315
1316 if (state->nbTrans >= state->maxTrans) {
1317 xmlRegTrans *tmp;
1318 int newSize;
1319
1320 newSize = xmlGrowCapacity(state->maxTrans, sizeof(tmp[0]),
1321 8, XML_MAX_ITEMS);
1322 if (newSize < 0) {
1323 xmlRegexpErrMemory(ctxt);
1324 return;
1325 }
1326 tmp = xmlRealloc(state->trans, newSize * sizeof(tmp[0]));
1327 if (tmp == NULL) {
1328 xmlRegexpErrMemory(ctxt);
1329 return;
1330 }
1331 state->trans = tmp;
1332 state->maxTrans = newSize;
1333 }
1334
1335 state->trans[state->nbTrans].atom = atom;
1336 state->trans[state->nbTrans].to = target->no;
1337 state->trans[state->nbTrans].counter = counter;
1338 state->trans[state->nbTrans].count = count;
1339 state->trans[state->nbTrans].nd = 0;
1340 state->nbTrans++;
1341 xmlRegStateAddTransTo(ctxt, target, state->no);
1342 }
1343
1344 static xmlRegStatePtr
xmlRegStatePush(xmlRegParserCtxtPtr ctxt)1345 xmlRegStatePush(xmlRegParserCtxtPtr ctxt) {
1346 xmlRegStatePtr state;
1347
1348 if (ctxt->nbStates >= ctxt->maxStates) {
1349 xmlRegStatePtr *tmp;
1350 int newSize;
1351
1352 newSize = xmlGrowCapacity(ctxt->maxStates, sizeof(tmp[0]),
1353 4, XML_MAX_ITEMS);
1354 if (newSize < 0) {
1355 xmlRegexpErrMemory(ctxt);
1356 return(NULL);
1357 }
1358 tmp = xmlRealloc(ctxt->states, newSize * sizeof(tmp[0]));
1359 if (tmp == NULL) {
1360 xmlRegexpErrMemory(ctxt);
1361 return(NULL);
1362 }
1363 ctxt->states = tmp;
1364 ctxt->maxStates = newSize;
1365 }
1366
1367 state = xmlRegNewState(ctxt);
1368 if (state == NULL)
1369 return(NULL);
1370
1371 state->no = ctxt->nbStates;
1372 ctxt->states[ctxt->nbStates++] = state;
1373
1374 return(state);
1375 }
1376
1377 /**
1378 * xmlFAGenerateAllTransition:
1379 * @ctxt: a regexp parser context
1380 * @from: the from state
1381 * @to: the target state or NULL for building a new one
1382 * @lax:
1383 *
1384 */
1385 static int
xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,int lax)1386 xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
1387 xmlRegStatePtr from, xmlRegStatePtr to,
1388 int lax) {
1389 if (to == NULL) {
1390 to = xmlRegStatePush(ctxt);
1391 if (to == NULL)
1392 return(-1);
1393 ctxt->state = to;
1394 }
1395 if (lax)
1396 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
1397 else
1398 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
1399 return(0);
1400 }
1401
1402 /**
1403 * xmlFAGenerateEpsilonTransition:
1404 * @ctxt: a regexp parser context
1405 * @from: the from state
1406 * @to: the target state or NULL for building a new one
1407 *
1408 */
1409 static int
xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to)1410 xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1411 xmlRegStatePtr from, xmlRegStatePtr to) {
1412 if (to == NULL) {
1413 to = xmlRegStatePush(ctxt);
1414 if (to == NULL)
1415 return(-1);
1416 ctxt->state = to;
1417 }
1418 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
1419 return(0);
1420 }
1421
1422 /**
1423 * xmlFAGenerateCountedEpsilonTransition:
1424 * @ctxt: a regexp parser context
1425 * @from: the from state
1426 * @to: the target state or NULL for building a new one
1427 * counter: the counter for that transition
1428 *
1429 */
1430 static int
xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,int counter)1431 xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1432 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1433 if (to == NULL) {
1434 to = xmlRegStatePush(ctxt);
1435 if (to == NULL)
1436 return(-1);
1437 ctxt->state = to;
1438 }
1439 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
1440 return(0);
1441 }
1442
1443 /**
1444 * xmlFAGenerateCountedTransition:
1445 * @ctxt: a regexp parser context
1446 * @from: the from state
1447 * @to: the target state or NULL for building a new one
1448 * counter: the counter for that transition
1449 *
1450 */
1451 static int
xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,int counter)1452 xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
1453 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1454 if (to == NULL) {
1455 to = xmlRegStatePush(ctxt);
1456 if (to == NULL)
1457 return(-1);
1458 ctxt->state = to;
1459 }
1460 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
1461 return(0);
1462 }
1463
1464 /**
1465 * xmlFAGenerateTransitions:
1466 * @ctxt: a regexp parser context
1467 * @from: the from state
1468 * @to: the target state or NULL for building a new one
1469 * @atom: the atom generating the transition
1470 *
1471 * Returns 0 if success and -1 in case of error.
1472 */
1473 static int
xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,xmlRegAtomPtr atom)1474 xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
1475 xmlRegStatePtr to, xmlRegAtomPtr atom) {
1476 xmlRegStatePtr end;
1477 int nullable = 0;
1478
1479 if (atom == NULL) {
1480 ERROR("generate transition: atom == NULL");
1481 return(-1);
1482 }
1483 if (atom->type == XML_REGEXP_SUBREG) {
1484 /*
1485 * this is a subexpression handling one should not need to
1486 * create a new node except for XML_REGEXP_QUANT_RANGE.
1487 */
1488 if ((to != NULL) && (atom->stop != to) &&
1489 (atom->quant != XML_REGEXP_QUANT_RANGE)) {
1490 /*
1491 * Generate an epsilon transition to link to the target
1492 */
1493 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1494 #ifdef DV
1495 } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) &&
1496 (atom->quant != XML_REGEXP_QUANT_ONCE)) {
1497 to = xmlRegStatePush(ctxt, to);
1498 if (to == NULL)
1499 return(-1);
1500 ctxt->state = to;
1501 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1502 #endif
1503 }
1504 switch (atom->quant) {
1505 case XML_REGEXP_QUANT_OPT:
1506 atom->quant = XML_REGEXP_QUANT_ONCE;
1507 /*
1508 * transition done to the state after end of atom.
1509 * 1. set transition from atom start to new state
1510 * 2. set transition from atom end to this state.
1511 */
1512 if (to == NULL) {
1513 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0);
1514 xmlFAGenerateEpsilonTransition(ctxt, atom->stop,
1515 ctxt->state);
1516 } else {
1517 xmlFAGenerateEpsilonTransition(ctxt, atom->start, to);
1518 }
1519 break;
1520 case XML_REGEXP_QUANT_MULT:
1521 atom->quant = XML_REGEXP_QUANT_ONCE;
1522 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1523 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1524 break;
1525 case XML_REGEXP_QUANT_PLUS:
1526 atom->quant = XML_REGEXP_QUANT_ONCE;
1527 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1528 break;
1529 case XML_REGEXP_QUANT_RANGE: {
1530 int counter;
1531 xmlRegStatePtr inter, newstate;
1532
1533 /*
1534 * create the final state now if needed
1535 */
1536 if (to != NULL) {
1537 newstate = to;
1538 } else {
1539 newstate = xmlRegStatePush(ctxt);
1540 if (newstate == NULL)
1541 return(-1);
1542 }
1543
1544 /*
1545 * The principle here is to use counted transition
1546 * to avoid explosion in the number of states in the
1547 * graph. This is clearly more complex but should not
1548 * be exploitable at runtime.
1549 */
1550 if ((atom->min == 0) && (atom->start0 == NULL)) {
1551 xmlRegAtomPtr copy;
1552 /*
1553 * duplicate a transition based on atom to count next
1554 * occurrences after 1. We cannot loop to atom->start
1555 * directly because we need an epsilon transition to
1556 * newstate.
1557 */
1558 /* ???? For some reason it seems we never reach that
1559 case, I suppose this got optimized out before when
1560 building the automata */
1561 copy = xmlRegCopyAtom(ctxt, atom);
1562 if (copy == NULL)
1563 return(-1);
1564 copy->quant = XML_REGEXP_QUANT_ONCE;
1565 copy->min = 0;
1566 copy->max = 0;
1567
1568 if (xmlFAGenerateTransitions(ctxt, atom->start, NULL, copy)
1569 < 0) {
1570 xmlRegFreeAtom(copy);
1571 return(-1);
1572 }
1573 inter = ctxt->state;
1574 counter = xmlRegGetCounter(ctxt);
1575 if (counter < 0)
1576 return(-1);
1577 ctxt->counters[counter].min = atom->min - 1;
1578 ctxt->counters[counter].max = atom->max - 1;
1579 /* count the number of times we see it again */
1580 xmlFAGenerateCountedEpsilonTransition(ctxt, inter,
1581 atom->stop, counter);
1582 /* allow a way out based on the count */
1583 xmlFAGenerateCountedTransition(ctxt, inter,
1584 newstate, counter);
1585 /* and also allow a direct exit for 0 */
1586 xmlFAGenerateEpsilonTransition(ctxt, atom->start,
1587 newstate);
1588 } else {
1589 /*
1590 * either we need the atom at least once or there
1591 * is an atom->start0 allowing to easily plug the
1592 * epsilon transition.
1593 */
1594 counter = xmlRegGetCounter(ctxt);
1595 if (counter < 0)
1596 return(-1);
1597 ctxt->counters[counter].min = atom->min - 1;
1598 ctxt->counters[counter].max = atom->max - 1;
1599 /* allow a way out based on the count */
1600 xmlFAGenerateCountedTransition(ctxt, atom->stop,
1601 newstate, counter);
1602 /* count the number of times we see it again */
1603 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
1604 atom->start, counter);
1605 /* and if needed allow a direct exit for 0 */
1606 if (atom->min == 0)
1607 xmlFAGenerateEpsilonTransition(ctxt, atom->start0,
1608 newstate);
1609
1610 }
1611 atom->min = 0;
1612 atom->max = 0;
1613 atom->quant = XML_REGEXP_QUANT_ONCE;
1614 ctxt->state = newstate;
1615 }
1616 default:
1617 break;
1618 }
1619 if (xmlRegAtomPush(ctxt, atom) < 0)
1620 return(-1);
1621 return(0);
1622 }
1623 if ((atom->min == 0) && (atom->max == 0) &&
1624 (atom->quant == XML_REGEXP_QUANT_RANGE)) {
1625 /*
1626 * we can discard the atom and generate an epsilon transition instead
1627 */
1628 if (to == NULL) {
1629 to = xmlRegStatePush(ctxt);
1630 if (to == NULL)
1631 return(-1);
1632 }
1633 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1634 ctxt->state = to;
1635 xmlRegFreeAtom(atom);
1636 return(0);
1637 }
1638 if (to == NULL) {
1639 to = xmlRegStatePush(ctxt);
1640 if (to == NULL)
1641 return(-1);
1642 }
1643 end = to;
1644 if ((atom->quant == XML_REGEXP_QUANT_MULT) ||
1645 (atom->quant == XML_REGEXP_QUANT_PLUS)) {
1646 /*
1647 * Do not pollute the target state by adding transitions from
1648 * it as it is likely to be the shared target of multiple branches.
1649 * So isolate with an epsilon transition.
1650 */
1651 xmlRegStatePtr tmp;
1652
1653 tmp = xmlRegStatePush(ctxt);
1654 if (tmp == NULL)
1655 return(-1);
1656 xmlFAGenerateEpsilonTransition(ctxt, tmp, to);
1657 to = tmp;
1658 }
1659 if ((atom->quant == XML_REGEXP_QUANT_RANGE) &&
1660 (atom->min == 0) && (atom->max > 0)) {
1661 nullable = 1;
1662 atom->min = 1;
1663 if (atom->max == 1)
1664 atom->quant = XML_REGEXP_QUANT_OPT;
1665 }
1666 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
1667 ctxt->state = end;
1668 switch (atom->quant) {
1669 case XML_REGEXP_QUANT_OPT:
1670 atom->quant = XML_REGEXP_QUANT_ONCE;
1671 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1672 break;
1673 case XML_REGEXP_QUANT_MULT:
1674 atom->quant = XML_REGEXP_QUANT_ONCE;
1675 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1676 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1677 break;
1678 case XML_REGEXP_QUANT_PLUS:
1679 atom->quant = XML_REGEXP_QUANT_ONCE;
1680 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1681 break;
1682 case XML_REGEXP_QUANT_RANGE:
1683 if (nullable)
1684 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1685 break;
1686 default:
1687 break;
1688 }
1689 if (xmlRegAtomPush(ctxt, atom) < 0)
1690 return(-1);
1691 return(0);
1692 }
1693
1694 /**
1695 * xmlFAReduceEpsilonTransitions:
1696 * @ctxt: a regexp parser context
1697 * @fromnr: the from state
1698 * @tonr: the to state
1699 * @counter: should that transition be associated to a counted
1700 *
1701 */
1702 static void
xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt,int fromnr,int tonr,int counter)1703 xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
1704 int tonr, int counter) {
1705 int transnr;
1706 xmlRegStatePtr from;
1707 xmlRegStatePtr to;
1708
1709 from = ctxt->states[fromnr];
1710 if (from == NULL)
1711 return;
1712 to = ctxt->states[tonr];
1713 if (to == NULL)
1714 return;
1715 if ((to->mark == XML_REGEXP_MARK_START) ||
1716 (to->mark == XML_REGEXP_MARK_VISITED))
1717 return;
1718
1719 to->mark = XML_REGEXP_MARK_VISITED;
1720 if (to->type == XML_REGEXP_FINAL_STATE) {
1721 from->type = XML_REGEXP_FINAL_STATE;
1722 }
1723 for (transnr = 0;transnr < to->nbTrans;transnr++) {
1724 xmlRegTransPtr t1 = &to->trans[transnr];
1725 int tcounter;
1726
1727 if (t1->to < 0)
1728 continue;
1729 if (t1->counter >= 0) {
1730 /* assert(counter < 0); */
1731 tcounter = t1->counter;
1732 } else {
1733 tcounter = counter;
1734 }
1735 if (t1->atom == NULL) {
1736 /*
1737 * Don't remove counted transitions
1738 * Don't loop either
1739 */
1740 if (t1->to != fromnr) {
1741 if (t1->count >= 0) {
1742 xmlRegStateAddTrans(ctxt, from, NULL, ctxt->states[t1->to],
1743 -1, t1->count);
1744 } else {
1745 xmlFAReduceEpsilonTransitions(ctxt, fromnr, t1->to,
1746 tcounter);
1747 }
1748 }
1749 } else {
1750 xmlRegStateAddTrans(ctxt, from, t1->atom,
1751 ctxt->states[t1->to], tcounter, -1);
1752 }
1753 }
1754 }
1755
1756 /**
1757 * xmlFAFinishReduceEpsilonTransitions:
1758 * @ctxt: a regexp parser context
1759 * @fromnr: the from state
1760 * @tonr: the to state
1761 * @counter: should that transition be associated to a counted
1762 *
1763 */
1764 static void
xmlFAFinishReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt,int tonr)1765 xmlFAFinishReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int tonr) {
1766 int transnr;
1767 xmlRegStatePtr to;
1768
1769 to = ctxt->states[tonr];
1770 if (to == NULL)
1771 return;
1772 if ((to->mark == XML_REGEXP_MARK_START) ||
1773 (to->mark == XML_REGEXP_MARK_NORMAL))
1774 return;
1775
1776 to->mark = XML_REGEXP_MARK_NORMAL;
1777 for (transnr = 0;transnr < to->nbTrans;transnr++) {
1778 xmlRegTransPtr t1 = &to->trans[transnr];
1779 if ((t1->to >= 0) && (t1->atom == NULL))
1780 xmlFAFinishReduceEpsilonTransitions(ctxt, t1->to);
1781 }
1782 }
1783
1784 /**
1785 * xmlFAEliminateSimpleEpsilonTransitions:
1786 * @ctxt: a regexp parser context
1787 *
1788 * Eliminating general epsilon transitions can get costly in the general
1789 * algorithm due to the large amount of generated new transitions and
1790 * associated comparisons. However for simple epsilon transition used just
1791 * to separate building blocks when generating the automata this can be
1792 * reduced to state elimination:
1793 * - if there exists an epsilon from X to Y
1794 * - if there is no other transition from X
1795 * then X and Y are semantically equivalent and X can be eliminated
1796 * If X is the start state then make Y the start state, else replace the
1797 * target of all transitions to X by transitions to Y.
1798 *
1799 * If X is a final state, skip it.
1800 * Otherwise it would be necessary to manipulate counters for this case when
1801 * eliminating state 2:
1802 * State 1 has a transition with an atom to state 2.
1803 * State 2 is final and has an epsilon transition to state 1.
1804 */
1805 static void
xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt)1806 xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1807 int statenr, i, j, newto;
1808 xmlRegStatePtr state, tmp;
1809
1810 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1811 state = ctxt->states[statenr];
1812 if (state == NULL)
1813 continue;
1814 if (state->nbTrans != 1)
1815 continue;
1816 if (state->type == XML_REGEXP_UNREACH_STATE ||
1817 state->type == XML_REGEXP_FINAL_STATE)
1818 continue;
1819 /* is the only transition out a basic transition */
1820 if ((state->trans[0].atom == NULL) &&
1821 (state->trans[0].to >= 0) &&
1822 (state->trans[0].to != statenr) &&
1823 (state->trans[0].counter < 0) &&
1824 (state->trans[0].count < 0)) {
1825 newto = state->trans[0].to;
1826
1827 if (state->type == XML_REGEXP_START_STATE) {
1828 } else {
1829 for (i = 0;i < state->nbTransTo;i++) {
1830 tmp = ctxt->states[state->transTo[i]];
1831 for (j = 0;j < tmp->nbTrans;j++) {
1832 if (tmp->trans[j].to == statenr) {
1833 tmp->trans[j].to = -1;
1834 xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom,
1835 ctxt->states[newto],
1836 tmp->trans[j].counter,
1837 tmp->trans[j].count);
1838 }
1839 }
1840 }
1841 if (state->type == XML_REGEXP_FINAL_STATE)
1842 ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE;
1843 /* eliminate the transition completely */
1844 state->nbTrans = 0;
1845
1846 state->type = XML_REGEXP_UNREACH_STATE;
1847
1848 }
1849
1850 }
1851 }
1852 }
1853 /**
1854 * xmlFAEliminateEpsilonTransitions:
1855 * @ctxt: a regexp parser context
1856 *
1857 */
1858 static void
xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt)1859 xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1860 int statenr, transnr;
1861 xmlRegStatePtr state;
1862 int has_epsilon;
1863
1864 if (ctxt->states == NULL) return;
1865
1866 /*
1867 * Eliminate simple epsilon transition and the associated unreachable
1868 * states.
1869 */
1870 xmlFAEliminateSimpleEpsilonTransitions(ctxt);
1871 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1872 state = ctxt->states[statenr];
1873 if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) {
1874 xmlRegFreeState(state);
1875 ctxt->states[statenr] = NULL;
1876 }
1877 }
1878
1879 has_epsilon = 0;
1880
1881 /*
1882 * Build the completed transitions bypassing the epsilons
1883 * Use a marking algorithm to avoid loops
1884 * Mark sink states too.
1885 * Process from the latest states backward to the start when
1886 * there is long cascading epsilon chains this minimize the
1887 * recursions and transition compares when adding the new ones
1888 */
1889 for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) {
1890 state = ctxt->states[statenr];
1891 if (state == NULL)
1892 continue;
1893 if ((state->nbTrans == 0) &&
1894 (state->type != XML_REGEXP_FINAL_STATE)) {
1895 state->type = XML_REGEXP_SINK_STATE;
1896 }
1897 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1898 if ((state->trans[transnr].atom == NULL) &&
1899 (state->trans[transnr].to >= 0)) {
1900 if (state->trans[transnr].to == statenr) {
1901 state->trans[transnr].to = -1;
1902 } else if (state->trans[transnr].count < 0) {
1903 int newto = state->trans[transnr].to;
1904
1905 has_epsilon = 1;
1906 state->trans[transnr].to = -2;
1907 state->mark = XML_REGEXP_MARK_START;
1908 xmlFAReduceEpsilonTransitions(ctxt, statenr,
1909 newto, state->trans[transnr].counter);
1910 xmlFAFinishReduceEpsilonTransitions(ctxt, newto);
1911 state->mark = XML_REGEXP_MARK_NORMAL;
1912 }
1913 }
1914 }
1915 }
1916 /*
1917 * Eliminate the epsilon transitions
1918 */
1919 if (has_epsilon) {
1920 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1921 state = ctxt->states[statenr];
1922 if (state == NULL)
1923 continue;
1924 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1925 xmlRegTransPtr trans = &(state->trans[transnr]);
1926 if ((trans->atom == NULL) &&
1927 (trans->count < 0) &&
1928 (trans->to >= 0)) {
1929 trans->to = -1;
1930 }
1931 }
1932 }
1933 }
1934
1935 /*
1936 * Use this pass to detect unreachable states too
1937 */
1938 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1939 state = ctxt->states[statenr];
1940 if (state != NULL)
1941 state->reached = XML_REGEXP_MARK_NORMAL;
1942 }
1943 state = ctxt->states[0];
1944 if (state != NULL)
1945 state->reached = XML_REGEXP_MARK_START;
1946 while (state != NULL) {
1947 xmlRegStatePtr target = NULL;
1948 state->reached = XML_REGEXP_MARK_VISITED;
1949 /*
1950 * Mark all states reachable from the current reachable state
1951 */
1952 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1953 if ((state->trans[transnr].to >= 0) &&
1954 ((state->trans[transnr].atom != NULL) ||
1955 (state->trans[transnr].count >= 0))) {
1956 int newto = state->trans[transnr].to;
1957
1958 if (ctxt->states[newto] == NULL)
1959 continue;
1960 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) {
1961 ctxt->states[newto]->reached = XML_REGEXP_MARK_START;
1962 target = ctxt->states[newto];
1963 }
1964 }
1965 }
1966
1967 /*
1968 * find the next accessible state not explored
1969 */
1970 if (target == NULL) {
1971 for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
1972 state = ctxt->states[statenr];
1973 if ((state != NULL) && (state->reached ==
1974 XML_REGEXP_MARK_START)) {
1975 target = state;
1976 break;
1977 }
1978 }
1979 }
1980 state = target;
1981 }
1982 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1983 state = ctxt->states[statenr];
1984 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) {
1985 xmlRegFreeState(state);
1986 ctxt->states[statenr] = NULL;
1987 }
1988 }
1989
1990 }
1991
1992 static int
xmlFACompareRanges(xmlRegRangePtr range1,xmlRegRangePtr range2)1993 xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) {
1994 int ret = 0;
1995
1996 if ((range1->type == XML_REGEXP_RANGES) ||
1997 (range2->type == XML_REGEXP_RANGES) ||
1998 (range2->type == XML_REGEXP_SUBREG) ||
1999 (range1->type == XML_REGEXP_SUBREG) ||
2000 (range1->type == XML_REGEXP_STRING) ||
2001 (range2->type == XML_REGEXP_STRING))
2002 return(-1);
2003
2004 /* put them in order */
2005 if (range1->type > range2->type) {
2006 xmlRegRangePtr tmp;
2007
2008 tmp = range1;
2009 range1 = range2;
2010 range2 = tmp;
2011 }
2012 if ((range1->type == XML_REGEXP_ANYCHAR) ||
2013 (range2->type == XML_REGEXP_ANYCHAR)) {
2014 ret = 1;
2015 } else if ((range1->type == XML_REGEXP_EPSILON) ||
2016 (range2->type == XML_REGEXP_EPSILON)) {
2017 return(0);
2018 } else if (range1->type == range2->type) {
2019 if (range1->type != XML_REGEXP_CHARVAL)
2020 ret = 1;
2021 else if ((range1->end < range2->start) ||
2022 (range2->end < range1->start))
2023 ret = 0;
2024 else
2025 ret = 1;
2026 } else if (range1->type == XML_REGEXP_CHARVAL) {
2027 int codepoint;
2028 int neg = 0;
2029
2030 /*
2031 * just check all codepoints in the range for acceptance,
2032 * this is usually way cheaper since done only once at
2033 * compilation than testing over and over at runtime or
2034 * pushing too many states when evaluating.
2035 */
2036 if (((range1->neg == 0) && (range2->neg != 0)) ||
2037 ((range1->neg != 0) && (range2->neg == 0)))
2038 neg = 1;
2039
2040 for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) {
2041 ret = xmlRegCheckCharacterRange(range2->type, codepoint,
2042 0, range2->start, range2->end,
2043 range2->blockName);
2044 if (ret < 0)
2045 return(-1);
2046 if (((neg == 1) && (ret == 0)) ||
2047 ((neg == 0) && (ret == 1)))
2048 return(1);
2049 }
2050 return(0);
2051 } else if ((range1->type == XML_REGEXP_BLOCK_NAME) ||
2052 (range2->type == XML_REGEXP_BLOCK_NAME)) {
2053 if (range1->type == range2->type) {
2054 ret = xmlStrEqual(range1->blockName, range2->blockName);
2055 } else {
2056 /*
2057 * comparing a block range with anything else is way
2058 * too costly, and maintaining the table is like too much
2059 * memory too, so let's force the automata to save state
2060 * here.
2061 */
2062 return(1);
2063 }
2064 } else if ((range1->type < XML_REGEXP_LETTER) ||
2065 (range2->type < XML_REGEXP_LETTER)) {
2066 if ((range1->type == XML_REGEXP_ANYSPACE) &&
2067 (range2->type == XML_REGEXP_NOTSPACE))
2068 ret = 0;
2069 else if ((range1->type == XML_REGEXP_INITNAME) &&
2070 (range2->type == XML_REGEXP_NOTINITNAME))
2071 ret = 0;
2072 else if ((range1->type == XML_REGEXP_NAMECHAR) &&
2073 (range2->type == XML_REGEXP_NOTNAMECHAR))
2074 ret = 0;
2075 else if ((range1->type == XML_REGEXP_DECIMAL) &&
2076 (range2->type == XML_REGEXP_NOTDECIMAL))
2077 ret = 0;
2078 else if ((range1->type == XML_REGEXP_REALCHAR) &&
2079 (range2->type == XML_REGEXP_NOTREALCHAR))
2080 ret = 0;
2081 else {
2082 /* same thing to limit complexity */
2083 return(1);
2084 }
2085 } else {
2086 ret = 0;
2087 /* range1->type < range2->type here */
2088 switch (range1->type) {
2089 case XML_REGEXP_LETTER:
2090 /* all disjoint except in the subgroups */
2091 if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) ||
2092 (range2->type == XML_REGEXP_LETTER_LOWERCASE) ||
2093 (range2->type == XML_REGEXP_LETTER_TITLECASE) ||
2094 (range2->type == XML_REGEXP_LETTER_MODIFIER) ||
2095 (range2->type == XML_REGEXP_LETTER_OTHERS))
2096 ret = 1;
2097 break;
2098 case XML_REGEXP_MARK:
2099 if ((range2->type == XML_REGEXP_MARK_NONSPACING) ||
2100 (range2->type == XML_REGEXP_MARK_SPACECOMBINING) ||
2101 (range2->type == XML_REGEXP_MARK_ENCLOSING))
2102 ret = 1;
2103 break;
2104 case XML_REGEXP_NUMBER:
2105 if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) ||
2106 (range2->type == XML_REGEXP_NUMBER_LETTER) ||
2107 (range2->type == XML_REGEXP_NUMBER_OTHERS))
2108 ret = 1;
2109 break;
2110 case XML_REGEXP_PUNCT:
2111 if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) ||
2112 (range2->type == XML_REGEXP_PUNCT_DASH) ||
2113 (range2->type == XML_REGEXP_PUNCT_OPEN) ||
2114 (range2->type == XML_REGEXP_PUNCT_CLOSE) ||
2115 (range2->type == XML_REGEXP_PUNCT_INITQUOTE) ||
2116 (range2->type == XML_REGEXP_PUNCT_FINQUOTE) ||
2117 (range2->type == XML_REGEXP_PUNCT_OTHERS))
2118 ret = 1;
2119 break;
2120 case XML_REGEXP_SEPAR:
2121 if ((range2->type == XML_REGEXP_SEPAR_SPACE) ||
2122 (range2->type == XML_REGEXP_SEPAR_LINE) ||
2123 (range2->type == XML_REGEXP_SEPAR_PARA))
2124 ret = 1;
2125 break;
2126 case XML_REGEXP_SYMBOL:
2127 if ((range2->type == XML_REGEXP_SYMBOL_MATH) ||
2128 (range2->type == XML_REGEXP_SYMBOL_CURRENCY) ||
2129 (range2->type == XML_REGEXP_SYMBOL_MODIFIER) ||
2130 (range2->type == XML_REGEXP_SYMBOL_OTHERS))
2131 ret = 1;
2132 break;
2133 case XML_REGEXP_OTHER:
2134 if ((range2->type == XML_REGEXP_OTHER_CONTROL) ||
2135 (range2->type == XML_REGEXP_OTHER_FORMAT) ||
2136 (range2->type == XML_REGEXP_OTHER_PRIVATE))
2137 ret = 1;
2138 break;
2139 default:
2140 if ((range2->type >= XML_REGEXP_LETTER) &&
2141 (range2->type < XML_REGEXP_BLOCK_NAME))
2142 ret = 0;
2143 else {
2144 /* safety net ! */
2145 return(1);
2146 }
2147 }
2148 }
2149 if (((range1->neg == 0) && (range2->neg != 0)) ||
2150 ((range1->neg != 0) && (range2->neg == 0)))
2151 ret = !ret;
2152 return(ret);
2153 }
2154
2155 /**
2156 * xmlFACompareAtomTypes:
2157 * @type1: an atom type
2158 * @type2: an atom type
2159 *
2160 * Compares two atoms type to check whether they intersect in some ways,
2161 * this is used by xmlFACompareAtoms only
2162 *
2163 * Returns 1 if they may intersect and 0 otherwise
2164 */
2165 static int
xmlFACompareAtomTypes(xmlRegAtomType type1,xmlRegAtomType type2)2166 xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) {
2167 if ((type1 == XML_REGEXP_EPSILON) ||
2168 (type1 == XML_REGEXP_CHARVAL) ||
2169 (type1 == XML_REGEXP_RANGES) ||
2170 (type1 == XML_REGEXP_SUBREG) ||
2171 (type1 == XML_REGEXP_STRING) ||
2172 (type1 == XML_REGEXP_ANYCHAR))
2173 return(1);
2174 if ((type2 == XML_REGEXP_EPSILON) ||
2175 (type2 == XML_REGEXP_CHARVAL) ||
2176 (type2 == XML_REGEXP_RANGES) ||
2177 (type2 == XML_REGEXP_SUBREG) ||
2178 (type2 == XML_REGEXP_STRING) ||
2179 (type2 == XML_REGEXP_ANYCHAR))
2180 return(1);
2181
2182 if (type1 == type2) return(1);
2183
2184 /* simplify subsequent compares by making sure type1 < type2 */
2185 if (type1 > type2) {
2186 xmlRegAtomType tmp = type1;
2187 type1 = type2;
2188 type2 = tmp;
2189 }
2190 switch (type1) {
2191 case XML_REGEXP_ANYSPACE: /* \s */
2192 /* can't be a letter, number, mark, punctuation, symbol */
2193 if ((type2 == XML_REGEXP_NOTSPACE) ||
2194 ((type2 >= XML_REGEXP_LETTER) &&
2195 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2196 ((type2 >= XML_REGEXP_NUMBER) &&
2197 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2198 ((type2 >= XML_REGEXP_MARK) &&
2199 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2200 ((type2 >= XML_REGEXP_PUNCT) &&
2201 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2202 ((type2 >= XML_REGEXP_SYMBOL) &&
2203 (type2 <= XML_REGEXP_SYMBOL_OTHERS))
2204 ) return(0);
2205 break;
2206 case XML_REGEXP_NOTSPACE: /* \S */
2207 break;
2208 case XML_REGEXP_INITNAME: /* \l */
2209 /* can't be a number, mark, separator, punctuation, symbol or other */
2210 if ((type2 == XML_REGEXP_NOTINITNAME) ||
2211 ((type2 >= XML_REGEXP_NUMBER) &&
2212 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2213 ((type2 >= XML_REGEXP_MARK) &&
2214 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2215 ((type2 >= XML_REGEXP_SEPAR) &&
2216 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2217 ((type2 >= XML_REGEXP_PUNCT) &&
2218 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2219 ((type2 >= XML_REGEXP_SYMBOL) &&
2220 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2221 ((type2 >= XML_REGEXP_OTHER) &&
2222 (type2 <= XML_REGEXP_OTHER_NA))
2223 ) return(0);
2224 break;
2225 case XML_REGEXP_NOTINITNAME: /* \L */
2226 break;
2227 case XML_REGEXP_NAMECHAR: /* \c */
2228 /* can't be a mark, separator, punctuation, symbol or other */
2229 if ((type2 == XML_REGEXP_NOTNAMECHAR) ||
2230 ((type2 >= XML_REGEXP_MARK) &&
2231 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2232 ((type2 >= XML_REGEXP_PUNCT) &&
2233 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2234 ((type2 >= XML_REGEXP_SEPAR) &&
2235 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2236 ((type2 >= XML_REGEXP_SYMBOL) &&
2237 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2238 ((type2 >= XML_REGEXP_OTHER) &&
2239 (type2 <= XML_REGEXP_OTHER_NA))
2240 ) return(0);
2241 break;
2242 case XML_REGEXP_NOTNAMECHAR: /* \C */
2243 break;
2244 case XML_REGEXP_DECIMAL: /* \d */
2245 /* can't be a letter, mark, separator, punctuation, symbol or other */
2246 if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2247 (type2 == XML_REGEXP_REALCHAR) ||
2248 ((type2 >= XML_REGEXP_LETTER) &&
2249 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2250 ((type2 >= XML_REGEXP_MARK) &&
2251 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2252 ((type2 >= XML_REGEXP_PUNCT) &&
2253 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2254 ((type2 >= XML_REGEXP_SEPAR) &&
2255 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2256 ((type2 >= XML_REGEXP_SYMBOL) &&
2257 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2258 ((type2 >= XML_REGEXP_OTHER) &&
2259 (type2 <= XML_REGEXP_OTHER_NA))
2260 )return(0);
2261 break;
2262 case XML_REGEXP_NOTDECIMAL: /* \D */
2263 break;
2264 case XML_REGEXP_REALCHAR: /* \w */
2265 /* can't be a mark, separator, punctuation, symbol or other */
2266 if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2267 ((type2 >= XML_REGEXP_MARK) &&
2268 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2269 ((type2 >= XML_REGEXP_PUNCT) &&
2270 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2271 ((type2 >= XML_REGEXP_SEPAR) &&
2272 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2273 ((type2 >= XML_REGEXP_SYMBOL) &&
2274 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2275 ((type2 >= XML_REGEXP_OTHER) &&
2276 (type2 <= XML_REGEXP_OTHER_NA))
2277 )return(0);
2278 break;
2279 case XML_REGEXP_NOTREALCHAR: /* \W */
2280 break;
2281 /*
2282 * at that point we know both type 1 and type2 are from
2283 * character categories are ordered and are different,
2284 * it becomes simple because this is a partition
2285 */
2286 case XML_REGEXP_LETTER:
2287 if (type2 <= XML_REGEXP_LETTER_OTHERS)
2288 return(1);
2289 return(0);
2290 case XML_REGEXP_LETTER_UPPERCASE:
2291 case XML_REGEXP_LETTER_LOWERCASE:
2292 case XML_REGEXP_LETTER_TITLECASE:
2293 case XML_REGEXP_LETTER_MODIFIER:
2294 case XML_REGEXP_LETTER_OTHERS:
2295 return(0);
2296 case XML_REGEXP_MARK:
2297 if (type2 <= XML_REGEXP_MARK_ENCLOSING)
2298 return(1);
2299 return(0);
2300 case XML_REGEXP_MARK_NONSPACING:
2301 case XML_REGEXP_MARK_SPACECOMBINING:
2302 case XML_REGEXP_MARK_ENCLOSING:
2303 return(0);
2304 case XML_REGEXP_NUMBER:
2305 if (type2 <= XML_REGEXP_NUMBER_OTHERS)
2306 return(1);
2307 return(0);
2308 case XML_REGEXP_NUMBER_DECIMAL:
2309 case XML_REGEXP_NUMBER_LETTER:
2310 case XML_REGEXP_NUMBER_OTHERS:
2311 return(0);
2312 case XML_REGEXP_PUNCT:
2313 if (type2 <= XML_REGEXP_PUNCT_OTHERS)
2314 return(1);
2315 return(0);
2316 case XML_REGEXP_PUNCT_CONNECTOR:
2317 case XML_REGEXP_PUNCT_DASH:
2318 case XML_REGEXP_PUNCT_OPEN:
2319 case XML_REGEXP_PUNCT_CLOSE:
2320 case XML_REGEXP_PUNCT_INITQUOTE:
2321 case XML_REGEXP_PUNCT_FINQUOTE:
2322 case XML_REGEXP_PUNCT_OTHERS:
2323 return(0);
2324 case XML_REGEXP_SEPAR:
2325 if (type2 <= XML_REGEXP_SEPAR_PARA)
2326 return(1);
2327 return(0);
2328 case XML_REGEXP_SEPAR_SPACE:
2329 case XML_REGEXP_SEPAR_LINE:
2330 case XML_REGEXP_SEPAR_PARA:
2331 return(0);
2332 case XML_REGEXP_SYMBOL:
2333 if (type2 <= XML_REGEXP_SYMBOL_OTHERS)
2334 return(1);
2335 return(0);
2336 case XML_REGEXP_SYMBOL_MATH:
2337 case XML_REGEXP_SYMBOL_CURRENCY:
2338 case XML_REGEXP_SYMBOL_MODIFIER:
2339 case XML_REGEXP_SYMBOL_OTHERS:
2340 return(0);
2341 case XML_REGEXP_OTHER:
2342 if (type2 <= XML_REGEXP_OTHER_NA)
2343 return(1);
2344 return(0);
2345 case XML_REGEXP_OTHER_CONTROL:
2346 case XML_REGEXP_OTHER_FORMAT:
2347 case XML_REGEXP_OTHER_PRIVATE:
2348 case XML_REGEXP_OTHER_NA:
2349 return(0);
2350 default:
2351 break;
2352 }
2353 return(1);
2354 }
2355
2356 /**
2357 * xmlFAEqualAtoms:
2358 * @atom1: an atom
2359 * @atom2: an atom
2360 * @deep: if not set only compare string pointers
2361 *
2362 * Compares two atoms to check whether they are the same exactly
2363 * this is used to remove equivalent transitions
2364 *
2365 * Returns 1 if same and 0 otherwise
2366 */
2367 static int
xmlFAEqualAtoms(xmlRegAtomPtr atom1,xmlRegAtomPtr atom2,int deep)2368 xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
2369 int ret = 0;
2370
2371 if (atom1 == atom2)
2372 return(1);
2373 if ((atom1 == NULL) || (atom2 == NULL))
2374 return(0);
2375
2376 if (atom1->type != atom2->type)
2377 return(0);
2378 switch (atom1->type) {
2379 case XML_REGEXP_EPSILON:
2380 ret = 0;
2381 break;
2382 case XML_REGEXP_STRING:
2383 if (!deep)
2384 ret = (atom1->valuep == atom2->valuep);
2385 else
2386 ret = xmlStrEqual((xmlChar *)atom1->valuep,
2387 (xmlChar *)atom2->valuep);
2388 break;
2389 case XML_REGEXP_CHARVAL:
2390 ret = (atom1->codepoint == atom2->codepoint);
2391 break;
2392 case XML_REGEXP_RANGES:
2393 /* too hard to do in the general case */
2394 ret = 0;
2395 default:
2396 break;
2397 }
2398 return(ret);
2399 }
2400
2401 /**
2402 * xmlFACompareAtoms:
2403 * @atom1: an atom
2404 * @atom2: an atom
2405 * @deep: if not set only compare string pointers
2406 *
2407 * Compares two atoms to check whether they intersect in some ways,
2408 * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only
2409 *
2410 * Returns 1 if yes and 0 otherwise
2411 */
2412 static int
xmlFACompareAtoms(xmlRegAtomPtr atom1,xmlRegAtomPtr atom2,int deep)2413 xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
2414 int ret = 1;
2415
2416 if (atom1 == atom2)
2417 return(1);
2418 if ((atom1 == NULL) || (atom2 == NULL))
2419 return(0);
2420
2421 if ((atom1->type == XML_REGEXP_ANYCHAR) ||
2422 (atom2->type == XML_REGEXP_ANYCHAR))
2423 return(1);
2424
2425 if (atom1->type > atom2->type) {
2426 xmlRegAtomPtr tmp;
2427 tmp = atom1;
2428 atom1 = atom2;
2429 atom2 = tmp;
2430 }
2431 if (atom1->type != atom2->type) {
2432 ret = xmlFACompareAtomTypes(atom1->type, atom2->type);
2433 /* if they can't intersect at the type level break now */
2434 if (ret == 0)
2435 return(0);
2436 }
2437 switch (atom1->type) {
2438 case XML_REGEXP_STRING:
2439 if (!deep)
2440 ret = (atom1->valuep != atom2->valuep);
2441 else {
2442 xmlChar *val1 = (xmlChar *)atom1->valuep;
2443 xmlChar *val2 = (xmlChar *)atom2->valuep;
2444 int compound1 = (xmlStrchr(val1, '|') != NULL);
2445 int compound2 = (xmlStrchr(val2, '|') != NULL);
2446
2447 /* Ignore negative match flag for ##other namespaces */
2448 if (compound1 != compound2)
2449 return(0);
2450
2451 ret = xmlRegStrEqualWildcard(val1, val2);
2452 }
2453 break;
2454 case XML_REGEXP_EPSILON:
2455 goto not_determinist;
2456 case XML_REGEXP_CHARVAL:
2457 if (atom2->type == XML_REGEXP_CHARVAL) {
2458 ret = (atom1->codepoint == atom2->codepoint);
2459 } else {
2460 ret = xmlRegCheckCharacter(atom2, atom1->codepoint);
2461 if (ret < 0)
2462 ret = 1;
2463 }
2464 break;
2465 case XML_REGEXP_RANGES:
2466 if (atom2->type == XML_REGEXP_RANGES) {
2467 int i, j, res;
2468 xmlRegRangePtr r1, r2;
2469
2470 /*
2471 * need to check that none of the ranges eventually matches
2472 */
2473 for (i = 0;i < atom1->nbRanges;i++) {
2474 for (j = 0;j < atom2->nbRanges;j++) {
2475 r1 = atom1->ranges[i];
2476 r2 = atom2->ranges[j];
2477 res = xmlFACompareRanges(r1, r2);
2478 if (res == 1) {
2479 ret = 1;
2480 goto done;
2481 }
2482 }
2483 }
2484 ret = 0;
2485 }
2486 break;
2487 default:
2488 goto not_determinist;
2489 }
2490 done:
2491 if (atom1->neg != atom2->neg) {
2492 ret = !ret;
2493 }
2494 if (ret == 0)
2495 return(0);
2496 not_determinist:
2497 return(1);
2498 }
2499
2500 /**
2501 * xmlFARecurseDeterminism:
2502 * @ctxt: a regexp parser context
2503 *
2504 * Check whether the associated regexp is determinist,
2505 * should be called after xmlFAEliminateEpsilonTransitions()
2506 *
2507 */
2508 static int
xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr state,int fromnr,int tonr,xmlRegAtomPtr atom)2509 xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
2510 int fromnr, int tonr, xmlRegAtomPtr atom) {
2511 int ret = 1;
2512 int res;
2513 int transnr, nbTrans;
2514 xmlRegTransPtr t1;
2515 int deep = 1;
2516
2517 if (state == NULL)
2518 return(ret);
2519 if (state->markd == XML_REGEXP_MARK_VISITED)
2520 return(ret);
2521
2522 if (ctxt->flags & AM_AUTOMATA_RNG)
2523 deep = 0;
2524
2525 /*
2526 * don't recurse on transitions potentially added in the course of
2527 * the elimination.
2528 */
2529 nbTrans = state->nbTrans;
2530 for (transnr = 0;transnr < nbTrans;transnr++) {
2531 t1 = &(state->trans[transnr]);
2532 /*
2533 * check transitions conflicting with the one looked at
2534 */
2535 if ((t1->to < 0) || (t1->to == fromnr))
2536 continue;
2537 if (t1->atom == NULL) {
2538 state->markd = XML_REGEXP_MARK_VISITED;
2539 res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2540 fromnr, tonr, atom);
2541 if (res == 0) {
2542 ret = 0;
2543 /* t1->nd = 1; */
2544 }
2545 continue;
2546 }
2547 if (xmlFACompareAtoms(t1->atom, atom, deep)) {
2548 /* Treat equal transitions as deterministic. */
2549 if ((t1->to != tonr) ||
2550 (!xmlFAEqualAtoms(t1->atom, atom, deep)))
2551 ret = 0;
2552 /* mark the transition as non-deterministic */
2553 t1->nd = 1;
2554 }
2555 }
2556 return(ret);
2557 }
2558
2559 /**
2560 * xmlFAFinishRecurseDeterminism:
2561 * @ctxt: a regexp parser context
2562 *
2563 * Reset flags after checking determinism.
2564 */
2565 static void
xmlFAFinishRecurseDeterminism(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr state)2566 xmlFAFinishRecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
2567 int transnr, nbTrans;
2568
2569 if (state == NULL)
2570 return;
2571 if (state->markd != XML_REGEXP_MARK_VISITED)
2572 return;
2573 state->markd = 0;
2574
2575 nbTrans = state->nbTrans;
2576 for (transnr = 0; transnr < nbTrans; transnr++) {
2577 xmlRegTransPtr t1 = &state->trans[transnr];
2578 if ((t1->atom == NULL) && (t1->to >= 0))
2579 xmlFAFinishRecurseDeterminism(ctxt, ctxt->states[t1->to]);
2580 }
2581 }
2582
2583 /**
2584 * xmlFAComputesDeterminism:
2585 * @ctxt: a regexp parser context
2586 *
2587 * Check whether the associated regexp is determinist,
2588 * should be called after xmlFAEliminateEpsilonTransitions()
2589 *
2590 */
2591 static int
xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt)2592 xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
2593 int statenr, transnr;
2594 xmlRegStatePtr state;
2595 xmlRegTransPtr t1, t2, last;
2596 int i;
2597 int ret = 1;
2598 int deep = 1;
2599
2600 if (ctxt->determinist != -1)
2601 return(ctxt->determinist);
2602
2603 if (ctxt->flags & AM_AUTOMATA_RNG)
2604 deep = 0;
2605
2606 /*
2607 * First cleanup the automata removing cancelled transitions
2608 */
2609 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2610 state = ctxt->states[statenr];
2611 if (state == NULL)
2612 continue;
2613 if (state->nbTrans < 2)
2614 continue;
2615 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2616 t1 = &(state->trans[transnr]);
2617 /*
2618 * Determinism checks in case of counted or all transitions
2619 * will have to be handled separately
2620 */
2621 if (t1->atom == NULL) {
2622 /* t1->nd = 1; */
2623 continue;
2624 }
2625 if (t1->to < 0) /* eliminated */
2626 continue;
2627 for (i = 0;i < transnr;i++) {
2628 t2 = &(state->trans[i]);
2629 if (t2->to < 0) /* eliminated */
2630 continue;
2631 if (t2->atom != NULL) {
2632 if (t1->to == t2->to) {
2633 /*
2634 * Here we use deep because we want to keep the
2635 * transitions which indicate a conflict
2636 */
2637 if (xmlFAEqualAtoms(t1->atom, t2->atom, deep) &&
2638 (t1->counter == t2->counter) &&
2639 (t1->count == t2->count))
2640 t2->to = -1; /* eliminated */
2641 }
2642 }
2643 }
2644 }
2645 }
2646
2647 /*
2648 * Check for all states that there aren't 2 transitions
2649 * with the same atom and a different target.
2650 */
2651 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2652 state = ctxt->states[statenr];
2653 if (state == NULL)
2654 continue;
2655 if (state->nbTrans < 2)
2656 continue;
2657 last = NULL;
2658 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2659 t1 = &(state->trans[transnr]);
2660 /*
2661 * Determinism checks in case of counted or all transitions
2662 * will have to be handled separately
2663 */
2664 if (t1->atom == NULL) {
2665 continue;
2666 }
2667 if (t1->to < 0) /* eliminated */
2668 continue;
2669 for (i = 0;i < transnr;i++) {
2670 t2 = &(state->trans[i]);
2671 if (t2->to < 0) /* eliminated */
2672 continue;
2673 if (t2->atom != NULL) {
2674 /*
2675 * But here we don't use deep because we want to
2676 * find transitions which indicate a conflict
2677 */
2678 if (xmlFACompareAtoms(t1->atom, t2->atom, 1)) {
2679 /*
2680 * Treat equal counter transitions that couldn't be
2681 * eliminated as deterministic.
2682 */
2683 if ((t1->to != t2->to) ||
2684 (t1->counter == t2->counter) ||
2685 (!xmlFAEqualAtoms(t1->atom, t2->atom, deep)))
2686 ret = 0;
2687 /* mark the transitions as non-deterministic ones */
2688 t1->nd = 1;
2689 t2->nd = 1;
2690 last = t1;
2691 }
2692 } else {
2693 int res;
2694
2695 /*
2696 * do the closure in case of remaining specific
2697 * epsilon transitions like choices or all
2698 */
2699 res = xmlFARecurseDeterminism(ctxt, ctxt->states[t2->to],
2700 statenr, t1->to, t1->atom);
2701 xmlFAFinishRecurseDeterminism(ctxt, ctxt->states[t2->to]);
2702 /* don't shortcut the computation so all non deterministic
2703 transition get marked down
2704 if (ret == 0)
2705 return(0);
2706 */
2707 if (res == 0) {
2708 t1->nd = 1;
2709 /* t2->nd = 1; */
2710 last = t1;
2711 ret = 0;
2712 }
2713 }
2714 }
2715 /* don't shortcut the computation so all non deterministic
2716 transition get marked down
2717 if (ret == 0)
2718 break; */
2719 }
2720
2721 /*
2722 * mark specifically the last non-deterministic transition
2723 * from a state since there is no need to set-up rollback
2724 * from it
2725 */
2726 if (last != NULL) {
2727 last->nd = 2;
2728 }
2729
2730 /* don't shortcut the computation so all non deterministic
2731 transition get marked down
2732 if (ret == 0)
2733 break; */
2734 }
2735
2736 ctxt->determinist = ret;
2737 return(ret);
2738 }
2739
2740 /************************************************************************
2741 * *
2742 * Routines to check input against transition atoms *
2743 * *
2744 ************************************************************************/
2745
2746 static int
xmlRegCheckCharacterRange(xmlRegAtomType type,int codepoint,int neg,int start,int end,const xmlChar * blockName)2747 xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
2748 int start, int end, const xmlChar *blockName) {
2749 int ret = 0;
2750
2751 switch (type) {
2752 case XML_REGEXP_STRING:
2753 case XML_REGEXP_SUBREG:
2754 case XML_REGEXP_RANGES:
2755 case XML_REGEXP_EPSILON:
2756 return(-1);
2757 case XML_REGEXP_ANYCHAR:
2758 ret = ((codepoint != '\n') && (codepoint != '\r'));
2759 break;
2760 case XML_REGEXP_CHARVAL:
2761 ret = ((codepoint >= start) && (codepoint <= end));
2762 break;
2763 case XML_REGEXP_NOTSPACE:
2764 neg = !neg;
2765 /* Falls through. */
2766 case XML_REGEXP_ANYSPACE:
2767 ret = ((codepoint == '\n') || (codepoint == '\r') ||
2768 (codepoint == '\t') || (codepoint == ' '));
2769 break;
2770 case XML_REGEXP_NOTINITNAME:
2771 neg = !neg;
2772 /* Falls through. */
2773 case XML_REGEXP_INITNAME:
2774 ret = (IS_LETTER(codepoint) ||
2775 (codepoint == '_') || (codepoint == ':'));
2776 break;
2777 case XML_REGEXP_NOTNAMECHAR:
2778 neg = !neg;
2779 /* Falls through. */
2780 case XML_REGEXP_NAMECHAR:
2781 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) ||
2782 (codepoint == '.') || (codepoint == '-') ||
2783 (codepoint == '_') || (codepoint == ':') ||
2784 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint));
2785 break;
2786 case XML_REGEXP_NOTDECIMAL:
2787 neg = !neg;
2788 /* Falls through. */
2789 case XML_REGEXP_DECIMAL:
2790 ret = xmlUCSIsCatNd(codepoint);
2791 break;
2792 case XML_REGEXP_REALCHAR:
2793 neg = !neg;
2794 /* Falls through. */
2795 case XML_REGEXP_NOTREALCHAR:
2796 ret = xmlUCSIsCatP(codepoint);
2797 if (ret == 0)
2798 ret = xmlUCSIsCatZ(codepoint);
2799 if (ret == 0)
2800 ret = xmlUCSIsCatC(codepoint);
2801 break;
2802 case XML_REGEXP_LETTER:
2803 ret = xmlUCSIsCatL(codepoint);
2804 break;
2805 case XML_REGEXP_LETTER_UPPERCASE:
2806 ret = xmlUCSIsCatLu(codepoint);
2807 break;
2808 case XML_REGEXP_LETTER_LOWERCASE:
2809 ret = xmlUCSIsCatLl(codepoint);
2810 break;
2811 case XML_REGEXP_LETTER_TITLECASE:
2812 ret = xmlUCSIsCatLt(codepoint);
2813 break;
2814 case XML_REGEXP_LETTER_MODIFIER:
2815 ret = xmlUCSIsCatLm(codepoint);
2816 break;
2817 case XML_REGEXP_LETTER_OTHERS:
2818 ret = xmlUCSIsCatLo(codepoint);
2819 break;
2820 case XML_REGEXP_MARK:
2821 ret = xmlUCSIsCatM(codepoint);
2822 break;
2823 case XML_REGEXP_MARK_NONSPACING:
2824 ret = xmlUCSIsCatMn(codepoint);
2825 break;
2826 case XML_REGEXP_MARK_SPACECOMBINING:
2827 ret = xmlUCSIsCatMc(codepoint);
2828 break;
2829 case XML_REGEXP_MARK_ENCLOSING:
2830 ret = xmlUCSIsCatMe(codepoint);
2831 break;
2832 case XML_REGEXP_NUMBER:
2833 ret = xmlUCSIsCatN(codepoint);
2834 break;
2835 case XML_REGEXP_NUMBER_DECIMAL:
2836 ret = xmlUCSIsCatNd(codepoint);
2837 break;
2838 case XML_REGEXP_NUMBER_LETTER:
2839 ret = xmlUCSIsCatNl(codepoint);
2840 break;
2841 case XML_REGEXP_NUMBER_OTHERS:
2842 ret = xmlUCSIsCatNo(codepoint);
2843 break;
2844 case XML_REGEXP_PUNCT:
2845 ret = xmlUCSIsCatP(codepoint);
2846 break;
2847 case XML_REGEXP_PUNCT_CONNECTOR:
2848 ret = xmlUCSIsCatPc(codepoint);
2849 break;
2850 case XML_REGEXP_PUNCT_DASH:
2851 ret = xmlUCSIsCatPd(codepoint);
2852 break;
2853 case XML_REGEXP_PUNCT_OPEN:
2854 ret = xmlUCSIsCatPs(codepoint);
2855 break;
2856 case XML_REGEXP_PUNCT_CLOSE:
2857 ret = xmlUCSIsCatPe(codepoint);
2858 break;
2859 case XML_REGEXP_PUNCT_INITQUOTE:
2860 ret = xmlUCSIsCatPi(codepoint);
2861 break;
2862 case XML_REGEXP_PUNCT_FINQUOTE:
2863 ret = xmlUCSIsCatPf(codepoint);
2864 break;
2865 case XML_REGEXP_PUNCT_OTHERS:
2866 ret = xmlUCSIsCatPo(codepoint);
2867 break;
2868 case XML_REGEXP_SEPAR:
2869 ret = xmlUCSIsCatZ(codepoint);
2870 break;
2871 case XML_REGEXP_SEPAR_SPACE:
2872 ret = xmlUCSIsCatZs(codepoint);
2873 break;
2874 case XML_REGEXP_SEPAR_LINE:
2875 ret = xmlUCSIsCatZl(codepoint);
2876 break;
2877 case XML_REGEXP_SEPAR_PARA:
2878 ret = xmlUCSIsCatZp(codepoint);
2879 break;
2880 case XML_REGEXP_SYMBOL:
2881 ret = xmlUCSIsCatS(codepoint);
2882 break;
2883 case XML_REGEXP_SYMBOL_MATH:
2884 ret = xmlUCSIsCatSm(codepoint);
2885 break;
2886 case XML_REGEXP_SYMBOL_CURRENCY:
2887 ret = xmlUCSIsCatSc(codepoint);
2888 break;
2889 case XML_REGEXP_SYMBOL_MODIFIER:
2890 ret = xmlUCSIsCatSk(codepoint);
2891 break;
2892 case XML_REGEXP_SYMBOL_OTHERS:
2893 ret = xmlUCSIsCatSo(codepoint);
2894 break;
2895 case XML_REGEXP_OTHER:
2896 ret = xmlUCSIsCatC(codepoint);
2897 break;
2898 case XML_REGEXP_OTHER_CONTROL:
2899 ret = xmlUCSIsCatCc(codepoint);
2900 break;
2901 case XML_REGEXP_OTHER_FORMAT:
2902 ret = xmlUCSIsCatCf(codepoint);
2903 break;
2904 case XML_REGEXP_OTHER_PRIVATE:
2905 ret = xmlUCSIsCatCo(codepoint);
2906 break;
2907 case XML_REGEXP_OTHER_NA:
2908 /* ret = xmlUCSIsCatCn(codepoint); */
2909 /* Seems it doesn't exist anymore in recent Unicode releases */
2910 ret = 0;
2911 break;
2912 case XML_REGEXP_BLOCK_NAME:
2913 ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
2914 break;
2915 }
2916 if (neg)
2917 return(!ret);
2918 return(ret);
2919 }
2920
2921 static int
xmlRegCheckCharacter(xmlRegAtomPtr atom,int codepoint)2922 xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
2923 int i, ret = 0;
2924 xmlRegRangePtr range;
2925
2926 if ((atom == NULL) || (!IS_CHAR(codepoint)))
2927 return(-1);
2928
2929 switch (atom->type) {
2930 case XML_REGEXP_SUBREG:
2931 case XML_REGEXP_EPSILON:
2932 return(-1);
2933 case XML_REGEXP_CHARVAL:
2934 return(codepoint == atom->codepoint);
2935 case XML_REGEXP_RANGES: {
2936 int accept = 0;
2937
2938 for (i = 0;i < atom->nbRanges;i++) {
2939 range = atom->ranges[i];
2940 if (range->neg == 2) {
2941 ret = xmlRegCheckCharacterRange(range->type, codepoint,
2942 0, range->start, range->end,
2943 range->blockName);
2944 if (ret != 0)
2945 return(0); /* excluded char */
2946 } else if (range->neg) {
2947 ret = xmlRegCheckCharacterRange(range->type, codepoint,
2948 0, range->start, range->end,
2949 range->blockName);
2950 if (ret == 0)
2951 accept = 1;
2952 else
2953 return(0);
2954 } else {
2955 ret = xmlRegCheckCharacterRange(range->type, codepoint,
2956 0, range->start, range->end,
2957 range->blockName);
2958 if (ret != 0)
2959 accept = 1; /* might still be excluded */
2960 }
2961 }
2962 return(accept);
2963 }
2964 case XML_REGEXP_STRING:
2965 return(-1);
2966 case XML_REGEXP_ANYCHAR:
2967 case XML_REGEXP_ANYSPACE:
2968 case XML_REGEXP_NOTSPACE:
2969 case XML_REGEXP_INITNAME:
2970 case XML_REGEXP_NOTINITNAME:
2971 case XML_REGEXP_NAMECHAR:
2972 case XML_REGEXP_NOTNAMECHAR:
2973 case XML_REGEXP_DECIMAL:
2974 case XML_REGEXP_NOTDECIMAL:
2975 case XML_REGEXP_REALCHAR:
2976 case XML_REGEXP_NOTREALCHAR:
2977 case XML_REGEXP_LETTER:
2978 case XML_REGEXP_LETTER_UPPERCASE:
2979 case XML_REGEXP_LETTER_LOWERCASE:
2980 case XML_REGEXP_LETTER_TITLECASE:
2981 case XML_REGEXP_LETTER_MODIFIER:
2982 case XML_REGEXP_LETTER_OTHERS:
2983 case XML_REGEXP_MARK:
2984 case XML_REGEXP_MARK_NONSPACING:
2985 case XML_REGEXP_MARK_SPACECOMBINING:
2986 case XML_REGEXP_MARK_ENCLOSING:
2987 case XML_REGEXP_NUMBER:
2988 case XML_REGEXP_NUMBER_DECIMAL:
2989 case XML_REGEXP_NUMBER_LETTER:
2990 case XML_REGEXP_NUMBER_OTHERS:
2991 case XML_REGEXP_PUNCT:
2992 case XML_REGEXP_PUNCT_CONNECTOR:
2993 case XML_REGEXP_PUNCT_DASH:
2994 case XML_REGEXP_PUNCT_OPEN:
2995 case XML_REGEXP_PUNCT_CLOSE:
2996 case XML_REGEXP_PUNCT_INITQUOTE:
2997 case XML_REGEXP_PUNCT_FINQUOTE:
2998 case XML_REGEXP_PUNCT_OTHERS:
2999 case XML_REGEXP_SEPAR:
3000 case XML_REGEXP_SEPAR_SPACE:
3001 case XML_REGEXP_SEPAR_LINE:
3002 case XML_REGEXP_SEPAR_PARA:
3003 case XML_REGEXP_SYMBOL:
3004 case XML_REGEXP_SYMBOL_MATH:
3005 case XML_REGEXP_SYMBOL_CURRENCY:
3006 case XML_REGEXP_SYMBOL_MODIFIER:
3007 case XML_REGEXP_SYMBOL_OTHERS:
3008 case XML_REGEXP_OTHER:
3009 case XML_REGEXP_OTHER_CONTROL:
3010 case XML_REGEXP_OTHER_FORMAT:
3011 case XML_REGEXP_OTHER_PRIVATE:
3012 case XML_REGEXP_OTHER_NA:
3013 case XML_REGEXP_BLOCK_NAME:
3014 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
3015 (const xmlChar *)atom->valuep);
3016 if (atom->neg)
3017 ret = !ret;
3018 break;
3019 }
3020 return(ret);
3021 }
3022
3023 /************************************************************************
3024 * *
3025 * Saving and restoring state of an execution context *
3026 * *
3027 ************************************************************************/
3028
3029 static void
xmlFARegExecSave(xmlRegExecCtxtPtr exec)3030 xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
3031 #ifdef MAX_PUSH
3032 if (exec->nbPush > MAX_PUSH) {
3033 exec->status = XML_REGEXP_INTERNAL_LIMIT;
3034 return;
3035 }
3036 exec->nbPush++;
3037 #endif
3038
3039 if (exec->nbRollbacks >= exec->maxRollbacks) {
3040 xmlRegExecRollback *tmp;
3041 int newSize;
3042 int len = exec->nbRollbacks;
3043
3044 newSize = xmlGrowCapacity(exec->maxRollbacks, sizeof(tmp[0]),
3045 4, XML_MAX_ITEMS);
3046 if (newSize < 0) {
3047 exec->status = XML_REGEXP_OUT_OF_MEMORY;
3048 return;
3049 }
3050 tmp = xmlRealloc(exec->rollbacks, newSize * sizeof(tmp[0]));
3051 if (tmp == NULL) {
3052 exec->status = XML_REGEXP_OUT_OF_MEMORY;
3053 return;
3054 }
3055 exec->rollbacks = tmp;
3056 exec->maxRollbacks = newSize;
3057 tmp = &exec->rollbacks[len];
3058 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
3059 }
3060 exec->rollbacks[exec->nbRollbacks].state = exec->state;
3061 exec->rollbacks[exec->nbRollbacks].index = exec->index;
3062 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
3063 if (exec->comp->nbCounters > 0) {
3064 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3065 exec->rollbacks[exec->nbRollbacks].counts = (int *)
3066 xmlMalloc(exec->comp->nbCounters * sizeof(int));
3067 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3068 exec->status = XML_REGEXP_OUT_OF_MEMORY;
3069 return;
3070 }
3071 }
3072 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
3073 exec->comp->nbCounters * sizeof(int));
3074 }
3075 exec->nbRollbacks++;
3076 }
3077
3078 static void
xmlFARegExecRollBack(xmlRegExecCtxtPtr exec)3079 xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
3080 if (exec->status != XML_REGEXP_OK)
3081 return;
3082 if (exec->nbRollbacks <= 0) {
3083 exec->status = XML_REGEXP_NOT_FOUND;
3084 return;
3085 }
3086 exec->nbRollbacks--;
3087 exec->state = exec->rollbacks[exec->nbRollbacks].state;
3088 exec->index = exec->rollbacks[exec->nbRollbacks].index;
3089 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
3090 if (exec->comp->nbCounters > 0) {
3091 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3092 exec->status = XML_REGEXP_INTERNAL_ERROR;
3093 return;
3094 }
3095 if (exec->counts) {
3096 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
3097 exec->comp->nbCounters * sizeof(int));
3098 }
3099 }
3100 }
3101
3102 /************************************************************************
3103 * *
3104 * Verifier, running an input against a compiled regexp *
3105 * *
3106 ************************************************************************/
3107
3108 static int
xmlFARegExec(xmlRegexpPtr comp,const xmlChar * content)3109 xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
3110 xmlRegExecCtxt execval;
3111 xmlRegExecCtxtPtr exec = &execval;
3112 int ret, codepoint = 0, len, deter;
3113
3114 exec->inputString = content;
3115 exec->index = 0;
3116 exec->nbPush = 0;
3117 exec->determinist = 1;
3118 exec->maxRollbacks = 0;
3119 exec->nbRollbacks = 0;
3120 exec->rollbacks = NULL;
3121 exec->status = XML_REGEXP_OK;
3122 exec->comp = comp;
3123 exec->state = comp->states[0];
3124 exec->transno = 0;
3125 exec->transcount = 0;
3126 exec->inputStack = NULL;
3127 exec->inputStackMax = 0;
3128 if (comp->nbCounters > 0) {
3129 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
3130 if (exec->counts == NULL) {
3131 return(XML_REGEXP_OUT_OF_MEMORY);
3132 }
3133 memset(exec->counts, 0, comp->nbCounters * sizeof(int));
3134 } else
3135 exec->counts = NULL;
3136 while ((exec->status == XML_REGEXP_OK) && (exec->state != NULL) &&
3137 ((exec->inputString[exec->index] != 0) ||
3138 ((exec->state != NULL) &&
3139 (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3140 xmlRegTransPtr trans;
3141 xmlRegAtomPtr atom;
3142
3143 /*
3144 * If end of input on non-terminal state, rollback, however we may
3145 * still have epsilon like transition for counted transitions
3146 * on counters, in that case don't break too early. Additionally,
3147 * if we are working on a range like "AB{0,2}", where B is not present,
3148 * we don't want to break.
3149 */
3150 len = 1;
3151 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) {
3152 /*
3153 * if there is a transition, we must check if
3154 * atom allows minOccurs of 0
3155 */
3156 if (exec->transno < exec->state->nbTrans) {
3157 trans = &exec->state->trans[exec->transno];
3158 if (trans->to >=0) {
3159 atom = trans->atom;
3160 if (!((atom->min == 0) && (atom->max > 0)))
3161 goto rollback;
3162 }
3163 } else
3164 goto rollback;
3165 }
3166
3167 exec->transcount = 0;
3168 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3169 trans = &exec->state->trans[exec->transno];
3170 if (trans->to < 0)
3171 continue;
3172 atom = trans->atom;
3173 ret = 0;
3174 deter = 1;
3175 if (trans->count >= 0) {
3176 int count;
3177 xmlRegCounterPtr counter;
3178
3179 if (exec->counts == NULL) {
3180 exec->status = XML_REGEXP_INTERNAL_ERROR;
3181 goto error;
3182 }
3183 /*
3184 * A counted transition.
3185 */
3186
3187 count = exec->counts[trans->count];
3188 counter = &exec->comp->counters[trans->count];
3189 ret = ((count >= counter->min) && (count <= counter->max));
3190 if ((ret) && (counter->min != counter->max))
3191 deter = 0;
3192 } else if (atom == NULL) {
3193 exec->status = XML_REGEXP_INTERNAL_ERROR;
3194 break;
3195 } else if (exec->inputString[exec->index] != 0) {
3196 len = 4;
3197 codepoint = xmlGetUTF8Char(&exec->inputString[exec->index],
3198 &len);
3199 if (codepoint < 0) {
3200 exec->status = XML_REGEXP_INVALID_UTF8;
3201 goto error;
3202 }
3203 ret = xmlRegCheckCharacter(atom, codepoint);
3204 if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) {
3205 xmlRegStatePtr to = comp->states[trans->to];
3206
3207 /*
3208 * this is a multiple input sequence
3209 * If there is a counter associated increment it now.
3210 * do not increment if the counter is already over the
3211 * maximum limit in which case get to next transition
3212 */
3213 if (trans->counter >= 0) {
3214 xmlRegCounterPtr counter;
3215
3216 if ((exec->counts == NULL) ||
3217 (exec->comp == NULL) ||
3218 (exec->comp->counters == NULL)) {
3219 exec->status = XML_REGEXP_INTERNAL_ERROR;
3220 goto error;
3221 }
3222 counter = &exec->comp->counters[trans->counter];
3223 if (exec->counts[trans->counter] >= counter->max)
3224 continue; /* for loop on transitions */
3225 }
3226 /* Save before incrementing */
3227 if (exec->state->nbTrans > exec->transno + 1) {
3228 xmlFARegExecSave(exec);
3229 if (exec->status != XML_REGEXP_OK)
3230 goto error;
3231 }
3232 if (trans->counter >= 0) {
3233 exec->counts[trans->counter]++;
3234 }
3235 exec->transcount = 1;
3236 do {
3237 /*
3238 * Try to progress as much as possible on the input
3239 */
3240 if (exec->transcount == atom->max) {
3241 break;
3242 }
3243 exec->index += len;
3244 /*
3245 * End of input: stop here
3246 */
3247 if (exec->inputString[exec->index] == 0) {
3248 exec->index -= len;
3249 break;
3250 }
3251 if (exec->transcount >= atom->min) {
3252 int transno = exec->transno;
3253 xmlRegStatePtr state = exec->state;
3254
3255 /*
3256 * The transition is acceptable save it
3257 */
3258 exec->transno = -1; /* trick */
3259 exec->state = to;
3260 xmlFARegExecSave(exec);
3261 if (exec->status != XML_REGEXP_OK)
3262 goto error;
3263 exec->transno = transno;
3264 exec->state = state;
3265 }
3266 len = 4;
3267 codepoint = xmlGetUTF8Char(
3268 &exec->inputString[exec->index], &len);
3269 if (codepoint < 0) {
3270 exec->status = XML_REGEXP_INVALID_UTF8;
3271 goto error;
3272 }
3273 ret = xmlRegCheckCharacter(atom, codepoint);
3274 exec->transcount++;
3275 } while (ret == 1);
3276 if (exec->transcount < atom->min)
3277 ret = 0;
3278
3279 /*
3280 * If the last check failed but one transition was found
3281 * possible, rollback
3282 */
3283 if (ret < 0)
3284 ret = 0;
3285 if (ret == 0) {
3286 goto rollback;
3287 }
3288 if (trans->counter >= 0) {
3289 if (exec->counts == NULL) {
3290 exec->status = XML_REGEXP_INTERNAL_ERROR;
3291 goto error;
3292 }
3293 exec->counts[trans->counter]--;
3294 }
3295 } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) {
3296 /*
3297 * we don't match on the codepoint, but minOccurs of 0
3298 * says that's ok. Setting len to 0 inhibits stepping
3299 * over the codepoint.
3300 */
3301 exec->transcount = 1;
3302 len = 0;
3303 ret = 1;
3304 }
3305 } else if ((atom->min == 0) && (atom->max > 0)) {
3306 /* another spot to match when minOccurs is 0 */
3307 exec->transcount = 1;
3308 len = 0;
3309 ret = 1;
3310 }
3311 if (ret == 1) {
3312 if ((trans->nd == 1) ||
3313 ((trans->count >= 0) && (deter == 0) &&
3314 (exec->state->nbTrans > exec->transno + 1))) {
3315 xmlFARegExecSave(exec);
3316 if (exec->status != XML_REGEXP_OK)
3317 goto error;
3318 }
3319 if (trans->counter >= 0) {
3320 xmlRegCounterPtr counter;
3321
3322 /* make sure we don't go over the counter maximum value */
3323 if ((exec->counts == NULL) ||
3324 (exec->comp == NULL) ||
3325 (exec->comp->counters == NULL)) {
3326 exec->status = XML_REGEXP_INTERNAL_ERROR;
3327 goto error;
3328 }
3329 counter = &exec->comp->counters[trans->counter];
3330 if (exec->counts[trans->counter] >= counter->max)
3331 continue; /* for loop on transitions */
3332 exec->counts[trans->counter]++;
3333 }
3334 if ((trans->count >= 0) &&
3335 (trans->count < REGEXP_ALL_COUNTER)) {
3336 if (exec->counts == NULL) {
3337 exec->status = XML_REGEXP_INTERNAL_ERROR;
3338 goto error;
3339 }
3340 exec->counts[trans->count] = 0;
3341 }
3342 exec->state = comp->states[trans->to];
3343 exec->transno = 0;
3344 if (trans->atom != NULL) {
3345 exec->index += len;
3346 }
3347 goto progress;
3348 } else if (ret < 0) {
3349 exec->status = XML_REGEXP_INTERNAL_ERROR;
3350 break;
3351 }
3352 }
3353 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3354 rollback:
3355 /*
3356 * Failed to find a way out
3357 */
3358 exec->determinist = 0;
3359 xmlFARegExecRollBack(exec);
3360 }
3361 progress:
3362 continue;
3363 }
3364 error:
3365 if (exec->rollbacks != NULL) {
3366 if (exec->counts != NULL) {
3367 int i;
3368
3369 for (i = 0;i < exec->maxRollbacks;i++)
3370 if (exec->rollbacks[i].counts != NULL)
3371 xmlFree(exec->rollbacks[i].counts);
3372 }
3373 xmlFree(exec->rollbacks);
3374 }
3375 if (exec->state == NULL)
3376 return(XML_REGEXP_INTERNAL_ERROR);
3377 if (exec->counts != NULL)
3378 xmlFree(exec->counts);
3379 if (exec->status == XML_REGEXP_OK)
3380 return(1);
3381 if (exec->status == XML_REGEXP_NOT_FOUND)
3382 return(0);
3383 return(exec->status);
3384 }
3385
3386 /************************************************************************
3387 * *
3388 * Progressive interface to the verifier one atom at a time *
3389 * *
3390 ************************************************************************/
3391
3392 /**
3393 * xmlRegNewExecCtxt:
3394 * @comp: a precompiled regular expression
3395 * @callback: a callback function used for handling progresses in the
3396 * automata matching phase
3397 * @data: the context data associated to the callback in this context
3398 *
3399 * Build a context used for progressive evaluation of a regexp.
3400 *
3401 * Returns the new context
3402 */
3403 xmlRegExecCtxtPtr
xmlRegNewExecCtxt(xmlRegexpPtr comp,xmlRegExecCallbacks callback,void * data)3404 xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
3405 xmlRegExecCtxtPtr exec;
3406
3407 if (comp == NULL)
3408 return(NULL);
3409 if ((comp->compact == NULL) && (comp->states == NULL))
3410 return(NULL);
3411 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
3412 if (exec == NULL)
3413 return(NULL);
3414 memset(exec, 0, sizeof(xmlRegExecCtxt));
3415 exec->inputString = NULL;
3416 exec->index = 0;
3417 exec->determinist = 1;
3418 exec->maxRollbacks = 0;
3419 exec->nbRollbacks = 0;
3420 exec->rollbacks = NULL;
3421 exec->status = XML_REGEXP_OK;
3422 exec->comp = comp;
3423 if (comp->compact == NULL)
3424 exec->state = comp->states[0];
3425 exec->transno = 0;
3426 exec->transcount = 0;
3427 exec->callback = callback;
3428 exec->data = data;
3429 if (comp->nbCounters > 0) {
3430 /*
3431 * For error handling, exec->counts is allocated twice the size
3432 * the second half is used to store the data in case of rollback
3433 */
3434 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)
3435 * 2);
3436 if (exec->counts == NULL) {
3437 xmlFree(exec);
3438 return(NULL);
3439 }
3440 memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2);
3441 exec->errCounts = &exec->counts[comp->nbCounters];
3442 } else {
3443 exec->counts = NULL;
3444 exec->errCounts = NULL;
3445 }
3446 exec->inputStackMax = 0;
3447 exec->inputStackNr = 0;
3448 exec->inputStack = NULL;
3449 exec->errStateNo = -1;
3450 exec->errString = NULL;
3451 exec->nbPush = 0;
3452 return(exec);
3453 }
3454
3455 /**
3456 * xmlRegFreeExecCtxt:
3457 * @exec: a regular expression evaluation context
3458 *
3459 * Free the structures associated to a regular expression evaluation context.
3460 */
3461 void
xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec)3462 xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
3463 if (exec == NULL)
3464 return;
3465
3466 if (exec->rollbacks != NULL) {
3467 if (exec->counts != NULL) {
3468 int i;
3469
3470 for (i = 0;i < exec->maxRollbacks;i++)
3471 if (exec->rollbacks[i].counts != NULL)
3472 xmlFree(exec->rollbacks[i].counts);
3473 }
3474 xmlFree(exec->rollbacks);
3475 }
3476 if (exec->counts != NULL)
3477 xmlFree(exec->counts);
3478 if (exec->inputStack != NULL) {
3479 int i;
3480
3481 for (i = 0;i < exec->inputStackNr;i++) {
3482 if (exec->inputStack[i].value != NULL)
3483 xmlFree(exec->inputStack[i].value);
3484 }
3485 xmlFree(exec->inputStack);
3486 }
3487 if (exec->errString != NULL)
3488 xmlFree(exec->errString);
3489 xmlFree(exec);
3490 }
3491
3492 static int
xmlRegExecSetErrString(xmlRegExecCtxtPtr exec,const xmlChar * value)3493 xmlRegExecSetErrString(xmlRegExecCtxtPtr exec, const xmlChar *value) {
3494 if (exec->errString != NULL)
3495 xmlFree(exec->errString);
3496 if (value == NULL) {
3497 exec->errString = NULL;
3498 } else {
3499 exec->errString = xmlStrdup(value);
3500 if (exec->errString == NULL) {
3501 exec->status = XML_REGEXP_OUT_OF_MEMORY;
3502 return(-1);
3503 }
3504 }
3505 return(0);
3506 }
3507
3508 static void
xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec,const xmlChar * value,void * data)3509 xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
3510 void *data) {
3511 if (exec->inputStackNr + 1 >= exec->inputStackMax) {
3512 xmlRegInputTokenPtr tmp;
3513 int newSize;
3514
3515 newSize = xmlGrowCapacity(exec->inputStackMax, sizeof(tmp[0]),
3516 4, XML_MAX_ITEMS);
3517 if (newSize < 0) {
3518 exec->status = XML_REGEXP_OUT_OF_MEMORY;
3519 return;
3520 }
3521 #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
3522 if (newSize < 2)
3523 newSize = 2;
3524 #endif
3525 tmp = xmlRealloc(exec->inputStack, newSize * sizeof(tmp[0]));
3526 if (tmp == NULL) {
3527 exec->status = XML_REGEXP_OUT_OF_MEMORY;
3528 return;
3529 }
3530 exec->inputStack = tmp;
3531 exec->inputStackMax = newSize;
3532 }
3533 if (value == NULL) {
3534 exec->inputStack[exec->inputStackNr].value = NULL;
3535 } else {
3536 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
3537 if (exec->inputStack[exec->inputStackNr].value == NULL) {
3538 exec->status = XML_REGEXP_OUT_OF_MEMORY;
3539 return;
3540 }
3541 }
3542 exec->inputStack[exec->inputStackNr].data = data;
3543 exec->inputStackNr++;
3544 exec->inputStack[exec->inputStackNr].value = NULL;
3545 exec->inputStack[exec->inputStackNr].data = NULL;
3546 }
3547
3548 /**
3549 * xmlRegStrEqualWildcard:
3550 * @expStr: the string to be evaluated
3551 * @valStr: the validation string
3552 *
3553 * Checks if both strings are equal or have the same content. "*"
3554 * can be used as a wildcard in @valStr; "|" is used as a separator of
3555 * substrings in both @expStr and @valStr.
3556 *
3557 * Returns 1 if the comparison is satisfied and the number of substrings
3558 * is equal, 0 otherwise.
3559 */
3560
3561 static int
xmlRegStrEqualWildcard(const xmlChar * expStr,const xmlChar * valStr)3562 xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) {
3563 if (expStr == valStr) return(1);
3564 if (expStr == NULL) return(0);
3565 if (valStr == NULL) return(0);
3566 do {
3567 /*
3568 * Eval if we have a wildcard for the current item.
3569 */
3570 if (*expStr != *valStr) {
3571 /* if one of them starts with a wildcard make valStr be it */
3572 if (*valStr == '*') {
3573 const xmlChar *tmp;
3574
3575 tmp = valStr;
3576 valStr = expStr;
3577 expStr = tmp;
3578 }
3579 if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) {
3580 do {
3581 if (*valStr == XML_REG_STRING_SEPARATOR)
3582 break;
3583 valStr++;
3584 } while (*valStr != 0);
3585 continue;
3586 } else
3587 return(0);
3588 }
3589 expStr++;
3590 valStr++;
3591 } while (*valStr != 0);
3592 if (*expStr != 0)
3593 return (0);
3594 else
3595 return (1);
3596 }
3597
3598 /**
3599 * xmlRegCompactPushString:
3600 * @exec: a regexp execution context
3601 * @comp: the precompiled exec with a compact table
3602 * @value: a string token input
3603 * @data: data associated to the token to reuse in callbacks
3604 *
3605 * Push one input token in the execution context
3606 *
3607 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3608 * a negative value in case of error.
3609 */
3610 static int
xmlRegCompactPushString(xmlRegExecCtxtPtr exec,xmlRegexpPtr comp,const xmlChar * value,void * data)3611 xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
3612 xmlRegexpPtr comp,
3613 const xmlChar *value,
3614 void *data) {
3615 int state = exec->index;
3616 int i, target;
3617
3618 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
3619 return(-1);
3620
3621 if (value == NULL) {
3622 /*
3623 * are we at a final state ?
3624 */
3625 if (comp->compact[state * (comp->nbstrings + 1)] ==
3626 XML_REGEXP_FINAL_STATE)
3627 return(1);
3628 return(0);
3629 }
3630
3631 /*
3632 * Examine all outside transitions from current state
3633 */
3634 for (i = 0;i < comp->nbstrings;i++) {
3635 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
3636 if ((target > 0) && (target <= comp->nbstates)) {
3637 target--; /* to avoid 0 */
3638 if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) {
3639 exec->index = target;
3640 if ((exec->callback != NULL) && (comp->transdata != NULL)) {
3641 exec->callback(exec->data, value,
3642 comp->transdata[state * comp->nbstrings + i], data);
3643 }
3644 if (comp->compact[target * (comp->nbstrings + 1)] ==
3645 XML_REGEXP_SINK_STATE)
3646 goto error;
3647
3648 if (comp->compact[target * (comp->nbstrings + 1)] ==
3649 XML_REGEXP_FINAL_STATE)
3650 return(1);
3651 return(0);
3652 }
3653 }
3654 }
3655 /*
3656 * Failed to find an exit transition out from current state for the
3657 * current token
3658 */
3659 error:
3660 exec->errStateNo = state;
3661 exec->status = XML_REGEXP_NOT_FOUND;
3662 xmlRegExecSetErrString(exec, value);
3663 return(exec->status);
3664 }
3665
3666 /**
3667 * xmlRegExecPushStringInternal:
3668 * @exec: a regexp execution context or NULL to indicate the end
3669 * @value: a string token input
3670 * @data: data associated to the token to reuse in callbacks
3671 * @compound: value was assembled from 2 strings
3672 *
3673 * Push one input token in the execution context
3674 *
3675 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3676 * a negative value in case of error.
3677 */
3678 static int
xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec,const xmlChar * value,void * data,int compound)3679 xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value,
3680 void *data, int compound) {
3681 xmlRegTransPtr trans;
3682 xmlRegAtomPtr atom;
3683 int ret;
3684 int final = 0;
3685 int progress = 1;
3686
3687 if (exec == NULL)
3688 return(-1);
3689 if (exec->comp == NULL)
3690 return(-1);
3691 if (exec->status != XML_REGEXP_OK)
3692 return(exec->status);
3693
3694 if (exec->comp->compact != NULL)
3695 return(xmlRegCompactPushString(exec, exec->comp, value, data));
3696
3697 if (value == NULL) {
3698 if (exec->state->type == XML_REGEXP_FINAL_STATE)
3699 return(1);
3700 final = 1;
3701 }
3702
3703 /*
3704 * If we have an active rollback stack push the new value there
3705 * and get back to where we were left
3706 */
3707 if ((value != NULL) && (exec->inputStackNr > 0)) {
3708 xmlFARegExecSaveInputString(exec, value, data);
3709 value = exec->inputStack[exec->index].value;
3710 data = exec->inputStack[exec->index].data;
3711 }
3712
3713 while ((exec->status == XML_REGEXP_OK) &&
3714 ((value != NULL) ||
3715 ((final == 1) &&
3716 (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3717
3718 /*
3719 * End of input on non-terminal state, rollback, however we may
3720 * still have epsilon like transition for counted transitions
3721 * on counters, in that case don't break too early.
3722 */
3723 if ((value == NULL) && (exec->counts == NULL))
3724 goto rollback;
3725
3726 exec->transcount = 0;
3727 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3728 trans = &exec->state->trans[exec->transno];
3729 if (trans->to < 0)
3730 continue;
3731 atom = trans->atom;
3732 ret = 0;
3733 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
3734 int i;
3735 int count;
3736 xmlRegTransPtr t;
3737 xmlRegCounterPtr counter;
3738
3739 ret = 0;
3740
3741 /*
3742 * Check all counted transitions from the current state
3743 */
3744 if ((value == NULL) && (final)) {
3745 ret = 1;
3746 } else if (value != NULL) {
3747 for (i = 0;i < exec->state->nbTrans;i++) {
3748 t = &exec->state->trans[i];
3749 if ((t->counter < 0) || (t == trans))
3750 continue;
3751 counter = &exec->comp->counters[t->counter];
3752 count = exec->counts[t->counter];
3753 if ((count < counter->max) &&
3754 (t->atom != NULL) &&
3755 (xmlStrEqual(value, t->atom->valuep))) {
3756 ret = 0;
3757 break;
3758 }
3759 if ((count >= counter->min) &&
3760 (count < counter->max) &&
3761 (t->atom != NULL) &&
3762 (xmlStrEqual(value, t->atom->valuep))) {
3763 ret = 1;
3764 break;
3765 }
3766 }
3767 }
3768 } else if (trans->count == REGEXP_ALL_COUNTER) {
3769 int i;
3770 int count;
3771 xmlRegTransPtr t;
3772 xmlRegCounterPtr counter;
3773
3774 ret = 1;
3775
3776 /*
3777 * Check all counted transitions from the current state
3778 */
3779 for (i = 0;i < exec->state->nbTrans;i++) {
3780 t = &exec->state->trans[i];
3781 if ((t->counter < 0) || (t == trans))
3782 continue;
3783 counter = &exec->comp->counters[t->counter];
3784 count = exec->counts[t->counter];
3785 if ((count < counter->min) || (count > counter->max)) {
3786 ret = 0;
3787 break;
3788 }
3789 }
3790 } else if (trans->count >= 0) {
3791 int count;
3792 xmlRegCounterPtr counter;
3793
3794 /*
3795 * A counted transition.
3796 */
3797
3798 count = exec->counts[trans->count];
3799 counter = &exec->comp->counters[trans->count];
3800 ret = ((count >= counter->min) && (count <= counter->max));
3801 } else if (atom == NULL) {
3802 exec->status = XML_REGEXP_INTERNAL_ERROR;
3803 break;
3804 } else if (value != NULL) {
3805 ret = xmlRegStrEqualWildcard(atom->valuep, value);
3806 if (atom->neg) {
3807 ret = !ret;
3808 if (!compound)
3809 ret = 0;
3810 }
3811 if ((ret == 1) && (trans->counter >= 0)) {
3812 xmlRegCounterPtr counter;
3813 int count;
3814
3815 count = exec->counts[trans->counter];
3816 counter = &exec->comp->counters[trans->counter];
3817 if (count >= counter->max)
3818 ret = 0;
3819 }
3820
3821 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
3822 xmlRegStatePtr to = exec->comp->states[trans->to];
3823
3824 /*
3825 * this is a multiple input sequence
3826 */
3827 if (exec->state->nbTrans > exec->transno + 1) {
3828 if (exec->inputStackNr <= 0) {
3829 xmlFARegExecSaveInputString(exec, value, data);
3830 }
3831 xmlFARegExecSave(exec);
3832 }
3833 exec->transcount = 1;
3834 do {
3835 /*
3836 * Try to progress as much as possible on the input
3837 */
3838 if (exec->transcount == atom->max) {
3839 break;
3840 }
3841 exec->index++;
3842 value = exec->inputStack[exec->index].value;
3843 data = exec->inputStack[exec->index].data;
3844
3845 /*
3846 * End of input: stop here
3847 */
3848 if (value == NULL) {
3849 exec->index --;
3850 break;
3851 }
3852 if (exec->transcount >= atom->min) {
3853 int transno = exec->transno;
3854 xmlRegStatePtr state = exec->state;
3855
3856 /*
3857 * The transition is acceptable save it
3858 */
3859 exec->transno = -1; /* trick */
3860 exec->state = to;
3861 if (exec->inputStackNr <= 0) {
3862 xmlFARegExecSaveInputString(exec, value, data);
3863 }
3864 xmlFARegExecSave(exec);
3865 exec->transno = transno;
3866 exec->state = state;
3867 }
3868 ret = xmlStrEqual(value, atom->valuep);
3869 exec->transcount++;
3870 } while (ret == 1);
3871 if (exec->transcount < atom->min)
3872 ret = 0;
3873
3874 /*
3875 * If the last check failed but one transition was found
3876 * possible, rollback
3877 */
3878 if (ret < 0)
3879 ret = 0;
3880 if (ret == 0) {
3881 goto rollback;
3882 }
3883 }
3884 }
3885 if (ret == 1) {
3886 if ((exec->callback != NULL) && (atom != NULL) &&
3887 (data != NULL)) {
3888 exec->callback(exec->data, atom->valuep,
3889 atom->data, data);
3890 }
3891 if (exec->state->nbTrans > exec->transno + 1) {
3892 if (exec->inputStackNr <= 0) {
3893 xmlFARegExecSaveInputString(exec, value, data);
3894 }
3895 xmlFARegExecSave(exec);
3896 }
3897 if (trans->counter >= 0) {
3898 exec->counts[trans->counter]++;
3899 }
3900 if ((trans->count >= 0) &&
3901 (trans->count < REGEXP_ALL_COUNTER)) {
3902 exec->counts[trans->count] = 0;
3903 }
3904 if ((exec->comp->states[trans->to] != NULL) &&
3905 (exec->comp->states[trans->to]->type ==
3906 XML_REGEXP_SINK_STATE)) {
3907 /*
3908 * entering a sink state, save the current state as error
3909 * state.
3910 */
3911 if (xmlRegExecSetErrString(exec, value) < 0)
3912 break;
3913 exec->errState = exec->state;
3914 memcpy(exec->errCounts, exec->counts,
3915 exec->comp->nbCounters * sizeof(int));
3916 }
3917 exec->state = exec->comp->states[trans->to];
3918 exec->transno = 0;
3919 if (trans->atom != NULL) {
3920 if (exec->inputStack != NULL) {
3921 exec->index++;
3922 if (exec->index < exec->inputStackNr) {
3923 value = exec->inputStack[exec->index].value;
3924 data = exec->inputStack[exec->index].data;
3925 } else {
3926 value = NULL;
3927 data = NULL;
3928 }
3929 } else {
3930 value = NULL;
3931 data = NULL;
3932 }
3933 }
3934 goto progress;
3935 } else if (ret < 0) {
3936 exec->status = XML_REGEXP_INTERNAL_ERROR;
3937 break;
3938 }
3939 }
3940 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3941 rollback:
3942 /*
3943 * if we didn't yet rollback on the current input
3944 * store the current state as the error state.
3945 */
3946 if ((progress) && (exec->state != NULL) &&
3947 (exec->state->type != XML_REGEXP_SINK_STATE)) {
3948 progress = 0;
3949 if (xmlRegExecSetErrString(exec, value) < 0)
3950 break;
3951 exec->errState = exec->state;
3952 if (exec->comp->nbCounters)
3953 memcpy(exec->errCounts, exec->counts,
3954 exec->comp->nbCounters * sizeof(int));
3955 }
3956
3957 /*
3958 * Failed to find a way out
3959 */
3960 exec->determinist = 0;
3961 xmlFARegExecRollBack(exec);
3962 if ((exec->inputStack != NULL ) &&
3963 (exec->status == XML_REGEXP_OK)) {
3964 value = exec->inputStack[exec->index].value;
3965 data = exec->inputStack[exec->index].data;
3966 }
3967 }
3968 continue;
3969 progress:
3970 progress = 1;
3971 }
3972 if (exec->status == XML_REGEXP_OK) {
3973 return(exec->state->type == XML_REGEXP_FINAL_STATE);
3974 }
3975 return(exec->status);
3976 }
3977
3978 /**
3979 * xmlRegExecPushString:
3980 * @exec: a regexp execution context or NULL to indicate the end
3981 * @value: a string token input
3982 * @data: data associated to the token to reuse in callbacks
3983 *
3984 * Push one input token in the execution context
3985 *
3986 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3987 * a negative value in case of error.
3988 */
3989 int
xmlRegExecPushString(xmlRegExecCtxtPtr exec,const xmlChar * value,void * data)3990 xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
3991 void *data) {
3992 return(xmlRegExecPushStringInternal(exec, value, data, 0));
3993 }
3994
3995 /**
3996 * xmlRegExecPushString2:
3997 * @exec: a regexp execution context or NULL to indicate the end
3998 * @value: the first string token input
3999 * @value2: the second string token input
4000 * @data: data associated to the token to reuse in callbacks
4001 *
4002 * Push one input token in the execution context
4003 *
4004 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4005 * a negative value in case of error.
4006 */
4007 int
xmlRegExecPushString2(xmlRegExecCtxtPtr exec,const xmlChar * value,const xmlChar * value2,void * data)4008 xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
4009 const xmlChar *value2, void *data) {
4010 xmlChar buf[150];
4011 int lenn, lenp, ret;
4012 xmlChar *str;
4013
4014 if (exec == NULL)
4015 return(-1);
4016 if (exec->comp == NULL)
4017 return(-1);
4018 if (exec->status != XML_REGEXP_OK)
4019 return(exec->status);
4020
4021 if (value2 == NULL)
4022 return(xmlRegExecPushString(exec, value, data));
4023
4024 lenn = strlen((char *) value2);
4025 lenp = strlen((char *) value);
4026
4027 if (150 < lenn + lenp + 2) {
4028 str = xmlMalloc(lenn + lenp + 2);
4029 if (str == NULL) {
4030 exec->status = XML_REGEXP_OUT_OF_MEMORY;
4031 return(-1);
4032 }
4033 } else {
4034 str = buf;
4035 }
4036 memcpy(&str[0], value, lenp);
4037 str[lenp] = XML_REG_STRING_SEPARATOR;
4038 memcpy(&str[lenp + 1], value2, lenn);
4039 str[lenn + lenp + 1] = 0;
4040
4041 if (exec->comp->compact != NULL)
4042 ret = xmlRegCompactPushString(exec, exec->comp, str, data);
4043 else
4044 ret = xmlRegExecPushStringInternal(exec, str, data, 1);
4045
4046 if (str != buf)
4047 xmlFree(str);
4048 return(ret);
4049 }
4050
4051 /**
4052 * xmlRegExecGetValues:
4053 * @exec: a regexp execution context
4054 * @err: error extraction or normal one
4055 * @nbval: pointer to the number of accepted values IN/OUT
4056 * @nbneg: return number of negative transitions
4057 * @values: pointer to the array of acceptable values
4058 * @terminal: return value if this was a terminal state
4059 *
4060 * Extract information from the regexp execution, internal routine to
4061 * implement xmlRegExecNextValues() and xmlRegExecErrInfo()
4062 *
4063 * Returns: 0 in case of success or -1 in case of error.
4064 */
4065 static int
xmlRegExecGetValues(xmlRegExecCtxtPtr exec,int err,int * nbval,int * nbneg,xmlChar ** values,int * terminal)4066 xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err,
4067 int *nbval, int *nbneg,
4068 xmlChar **values, int *terminal) {
4069 int maxval;
4070 int nb = 0;
4071
4072 if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) ||
4073 (values == NULL) || (*nbval <= 0))
4074 return(-1);
4075
4076 maxval = *nbval;
4077 *nbval = 0;
4078 *nbneg = 0;
4079 if ((exec->comp != NULL) && (exec->comp->compact != NULL)) {
4080 xmlRegexpPtr comp;
4081 int target, i, state;
4082
4083 comp = exec->comp;
4084
4085 if (err) {
4086 if (exec->errStateNo == -1) return(-1);
4087 state = exec->errStateNo;
4088 } else {
4089 state = exec->index;
4090 }
4091 if (terminal != NULL) {
4092 if (comp->compact[state * (comp->nbstrings + 1)] ==
4093 XML_REGEXP_FINAL_STATE)
4094 *terminal = 1;
4095 else
4096 *terminal = 0;
4097 }
4098 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4099 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4100 if ((target > 0) && (target <= comp->nbstates) &&
4101 (comp->compact[(target - 1) * (comp->nbstrings + 1)] !=
4102 XML_REGEXP_SINK_STATE)) {
4103 values[nb++] = comp->stringMap[i];
4104 (*nbval)++;
4105 }
4106 }
4107 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4108 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4109 if ((target > 0) && (target <= comp->nbstates) &&
4110 (comp->compact[(target - 1) * (comp->nbstrings + 1)] ==
4111 XML_REGEXP_SINK_STATE)) {
4112 values[nb++] = comp->stringMap[i];
4113 (*nbneg)++;
4114 }
4115 }
4116 } else {
4117 int transno;
4118 xmlRegTransPtr trans;
4119 xmlRegAtomPtr atom;
4120 xmlRegStatePtr state;
4121
4122 if (terminal != NULL) {
4123 if (exec->state->type == XML_REGEXP_FINAL_STATE)
4124 *terminal = 1;
4125 else
4126 *terminal = 0;
4127 }
4128
4129 if (err) {
4130 if (exec->errState == NULL) return(-1);
4131 state = exec->errState;
4132 } else {
4133 if (exec->state == NULL) return(-1);
4134 state = exec->state;
4135 }
4136 for (transno = 0;
4137 (transno < state->nbTrans) && (nb < maxval);
4138 transno++) {
4139 trans = &state->trans[transno];
4140 if (trans->to < 0)
4141 continue;
4142 atom = trans->atom;
4143 if ((atom == NULL) || (atom->valuep == NULL))
4144 continue;
4145 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4146 /* this should not be reached but ... */
4147 } else if (trans->count == REGEXP_ALL_COUNTER) {
4148 /* this should not be reached but ... */
4149 } else if (trans->counter >= 0) {
4150 xmlRegCounterPtr counter = NULL;
4151 int count;
4152
4153 if (err)
4154 count = exec->errCounts[trans->counter];
4155 else
4156 count = exec->counts[trans->counter];
4157 if (exec->comp != NULL)
4158 counter = &exec->comp->counters[trans->counter];
4159 if ((counter == NULL) || (count < counter->max)) {
4160 if (atom->neg)
4161 values[nb++] = (xmlChar *) atom->valuep2;
4162 else
4163 values[nb++] = (xmlChar *) atom->valuep;
4164 (*nbval)++;
4165 }
4166 } else {
4167 if ((exec->comp != NULL) && (exec->comp->states[trans->to] != NULL) &&
4168 (exec->comp->states[trans->to]->type !=
4169 XML_REGEXP_SINK_STATE)) {
4170 if (atom->neg)
4171 values[nb++] = (xmlChar *) atom->valuep2;
4172 else
4173 values[nb++] = (xmlChar *) atom->valuep;
4174 (*nbval)++;
4175 }
4176 }
4177 }
4178 for (transno = 0;
4179 (transno < state->nbTrans) && (nb < maxval);
4180 transno++) {
4181 trans = &state->trans[transno];
4182 if (trans->to < 0)
4183 continue;
4184 atom = trans->atom;
4185 if ((atom == NULL) || (atom->valuep == NULL))
4186 continue;
4187 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4188 continue;
4189 } else if (trans->count == REGEXP_ALL_COUNTER) {
4190 continue;
4191 } else if (trans->counter >= 0) {
4192 continue;
4193 } else {
4194 if ((exec->comp->states[trans->to] != NULL) &&
4195 (exec->comp->states[trans->to]->type ==
4196 XML_REGEXP_SINK_STATE)) {
4197 if (atom->neg)
4198 values[nb++] = (xmlChar *) atom->valuep2;
4199 else
4200 values[nb++] = (xmlChar *) atom->valuep;
4201 (*nbneg)++;
4202 }
4203 }
4204 }
4205 }
4206 return(0);
4207 }
4208
4209 /**
4210 * xmlRegExecNextValues:
4211 * @exec: a regexp execution context
4212 * @nbval: pointer to the number of accepted values IN/OUT
4213 * @nbneg: return number of negative transitions
4214 * @values: pointer to the array of acceptable values
4215 * @terminal: return value if this was a terminal state
4216 *
4217 * Extract information from the regexp execution,
4218 * the parameter @values must point to an array of @nbval string pointers
4219 * on return nbval will contain the number of possible strings in that
4220 * state and the @values array will be updated with them. The string values
4221 * returned will be freed with the @exec context and don't need to be
4222 * deallocated.
4223 *
4224 * Returns: 0 in case of success or -1 in case of error.
4225 */
4226 int
xmlRegExecNextValues(xmlRegExecCtxtPtr exec,int * nbval,int * nbneg,xmlChar ** values,int * terminal)4227 xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg,
4228 xmlChar **values, int *terminal) {
4229 return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal));
4230 }
4231
4232 /**
4233 * xmlRegExecErrInfo:
4234 * @exec: a regexp execution context generating an error
4235 * @string: return value for the error string
4236 * @nbval: pointer to the number of accepted values IN/OUT
4237 * @nbneg: return number of negative transitions
4238 * @values: pointer to the array of acceptable values
4239 * @terminal: return value if this was a terminal state
4240 *
4241 * Extract error information from the regexp execution, the parameter
4242 * @string will be updated with the value pushed and not accepted,
4243 * the parameter @values must point to an array of @nbval string pointers
4244 * on return nbval will contain the number of possible strings in that
4245 * state and the @values array will be updated with them. The string values
4246 * returned will be freed with the @exec context and don't need to be
4247 * deallocated.
4248 *
4249 * Returns: 0 in case of success or -1 in case of error.
4250 */
4251 int
xmlRegExecErrInfo(xmlRegExecCtxtPtr exec,const xmlChar ** string,int * nbval,int * nbneg,xmlChar ** values,int * terminal)4252 xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string,
4253 int *nbval, int *nbneg, xmlChar **values, int *terminal) {
4254 if (exec == NULL)
4255 return(-1);
4256 if (string != NULL) {
4257 if (exec->status != XML_REGEXP_OK)
4258 *string = exec->errString;
4259 else
4260 *string = NULL;
4261 }
4262 return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal));
4263 }
4264
4265 /************************************************************************
4266 * *
4267 * Parser for the Schemas Datatype Regular Expressions *
4268 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs *
4269 * *
4270 ************************************************************************/
4271
4272 /**
4273 * xmlFAIsChar:
4274 * @ctxt: a regexp parser context
4275 *
4276 * [10] Char ::= [^.\?*+()|#x5B#x5D]
4277 */
4278 static int
xmlFAIsChar(xmlRegParserCtxtPtr ctxt)4279 xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
4280 int cur;
4281 int len;
4282
4283 len = 4;
4284 cur = xmlGetUTF8Char(ctxt->cur, &len);
4285 if (cur < 0) {
4286 ERROR("Invalid UTF-8");
4287 return(0);
4288 }
4289 if ((cur == '.') || (cur == '\\') || (cur == '?') ||
4290 (cur == '*') || (cur == '+') || (cur == '(') ||
4291 (cur == ')') || (cur == '|') || (cur == 0x5B) ||
4292 (cur == 0x5D) || (cur == 0))
4293 return(-1);
4294 return(cur);
4295 }
4296
4297 /**
4298 * xmlFAParseCharProp:
4299 * @ctxt: a regexp parser context
4300 *
4301 * [27] charProp ::= IsCategory | IsBlock
4302 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation |
4303 * Separators | Symbols | Others
4304 * [29] Letters ::= 'L' [ultmo]?
4305 * [30] Marks ::= 'M' [nce]?
4306 * [31] Numbers ::= 'N' [dlo]?
4307 * [32] Punctuation ::= 'P' [cdseifo]?
4308 * [33] Separators ::= 'Z' [slp]?
4309 * [34] Symbols ::= 'S' [mcko]?
4310 * [35] Others ::= 'C' [cfon]?
4311 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+
4312 */
4313 static void
xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt)4314 xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
4315 int cur;
4316 xmlRegAtomType type = (xmlRegAtomType) 0;
4317 xmlChar *blockName = NULL;
4318
4319 cur = CUR;
4320 if (cur == 'L') {
4321 NEXT;
4322 cur = CUR;
4323 if (cur == 'u') {
4324 NEXT;
4325 type = XML_REGEXP_LETTER_UPPERCASE;
4326 } else if (cur == 'l') {
4327 NEXT;
4328 type = XML_REGEXP_LETTER_LOWERCASE;
4329 } else if (cur == 't') {
4330 NEXT;
4331 type = XML_REGEXP_LETTER_TITLECASE;
4332 } else if (cur == 'm') {
4333 NEXT;
4334 type = XML_REGEXP_LETTER_MODIFIER;
4335 } else if (cur == 'o') {
4336 NEXT;
4337 type = XML_REGEXP_LETTER_OTHERS;
4338 } else {
4339 type = XML_REGEXP_LETTER;
4340 }
4341 } else if (cur == 'M') {
4342 NEXT;
4343 cur = CUR;
4344 if (cur == 'n') {
4345 NEXT;
4346 /* nonspacing */
4347 type = XML_REGEXP_MARK_NONSPACING;
4348 } else if (cur == 'c') {
4349 NEXT;
4350 /* spacing combining */
4351 type = XML_REGEXP_MARK_SPACECOMBINING;
4352 } else if (cur == 'e') {
4353 NEXT;
4354 /* enclosing */
4355 type = XML_REGEXP_MARK_ENCLOSING;
4356 } else {
4357 /* all marks */
4358 type = XML_REGEXP_MARK;
4359 }
4360 } else if (cur == 'N') {
4361 NEXT;
4362 cur = CUR;
4363 if (cur == 'd') {
4364 NEXT;
4365 /* digital */
4366 type = XML_REGEXP_NUMBER_DECIMAL;
4367 } else if (cur == 'l') {
4368 NEXT;
4369 /* letter */
4370 type = XML_REGEXP_NUMBER_LETTER;
4371 } else if (cur == 'o') {
4372 NEXT;
4373 /* other */
4374 type = XML_REGEXP_NUMBER_OTHERS;
4375 } else {
4376 /* all numbers */
4377 type = XML_REGEXP_NUMBER;
4378 }
4379 } else if (cur == 'P') {
4380 NEXT;
4381 cur = CUR;
4382 if (cur == 'c') {
4383 NEXT;
4384 /* connector */
4385 type = XML_REGEXP_PUNCT_CONNECTOR;
4386 } else if (cur == 'd') {
4387 NEXT;
4388 /* dash */
4389 type = XML_REGEXP_PUNCT_DASH;
4390 } else if (cur == 's') {
4391 NEXT;
4392 /* open */
4393 type = XML_REGEXP_PUNCT_OPEN;
4394 } else if (cur == 'e') {
4395 NEXT;
4396 /* close */
4397 type = XML_REGEXP_PUNCT_CLOSE;
4398 } else if (cur == 'i') {
4399 NEXT;
4400 /* initial quote */
4401 type = XML_REGEXP_PUNCT_INITQUOTE;
4402 } else if (cur == 'f') {
4403 NEXT;
4404 /* final quote */
4405 type = XML_REGEXP_PUNCT_FINQUOTE;
4406 } else if (cur == 'o') {
4407 NEXT;
4408 /* other */
4409 type = XML_REGEXP_PUNCT_OTHERS;
4410 } else {
4411 /* all punctuation */
4412 type = XML_REGEXP_PUNCT;
4413 }
4414 } else if (cur == 'Z') {
4415 NEXT;
4416 cur = CUR;
4417 if (cur == 's') {
4418 NEXT;
4419 /* space */
4420 type = XML_REGEXP_SEPAR_SPACE;
4421 } else if (cur == 'l') {
4422 NEXT;
4423 /* line */
4424 type = XML_REGEXP_SEPAR_LINE;
4425 } else if (cur == 'p') {
4426 NEXT;
4427 /* paragraph */
4428 type = XML_REGEXP_SEPAR_PARA;
4429 } else {
4430 /* all separators */
4431 type = XML_REGEXP_SEPAR;
4432 }
4433 } else if (cur == 'S') {
4434 NEXT;
4435 cur = CUR;
4436 if (cur == 'm') {
4437 NEXT;
4438 type = XML_REGEXP_SYMBOL_MATH;
4439 /* math */
4440 } else if (cur == 'c') {
4441 NEXT;
4442 type = XML_REGEXP_SYMBOL_CURRENCY;
4443 /* currency */
4444 } else if (cur == 'k') {
4445 NEXT;
4446 type = XML_REGEXP_SYMBOL_MODIFIER;
4447 /* modifiers */
4448 } else if (cur == 'o') {
4449 NEXT;
4450 type = XML_REGEXP_SYMBOL_OTHERS;
4451 /* other */
4452 } else {
4453 /* all symbols */
4454 type = XML_REGEXP_SYMBOL;
4455 }
4456 } else if (cur == 'C') {
4457 NEXT;
4458 cur = CUR;
4459 if (cur == 'c') {
4460 NEXT;
4461 /* control */
4462 type = XML_REGEXP_OTHER_CONTROL;
4463 } else if (cur == 'f') {
4464 NEXT;
4465 /* format */
4466 type = XML_REGEXP_OTHER_FORMAT;
4467 } else if (cur == 'o') {
4468 NEXT;
4469 /* private use */
4470 type = XML_REGEXP_OTHER_PRIVATE;
4471 } else if (cur == 'n') {
4472 NEXT;
4473 /* not assigned */
4474 type = XML_REGEXP_OTHER_NA;
4475 } else {
4476 /* all others */
4477 type = XML_REGEXP_OTHER;
4478 }
4479 } else if (cur == 'I') {
4480 const xmlChar *start;
4481 NEXT;
4482 cur = CUR;
4483 if (cur != 's') {
4484 ERROR("IsXXXX expected");
4485 return;
4486 }
4487 NEXT;
4488 start = ctxt->cur;
4489 cur = CUR;
4490 if (((cur >= 'a') && (cur <= 'z')) ||
4491 ((cur >= 'A') && (cur <= 'Z')) ||
4492 ((cur >= '0') && (cur <= '9')) ||
4493 (cur == 0x2D)) {
4494 NEXT;
4495 cur = CUR;
4496 while (((cur >= 'a') && (cur <= 'z')) ||
4497 ((cur >= 'A') && (cur <= 'Z')) ||
4498 ((cur >= '0') && (cur <= '9')) ||
4499 (cur == 0x2D)) {
4500 NEXT;
4501 cur = CUR;
4502 }
4503 }
4504 type = XML_REGEXP_BLOCK_NAME;
4505 blockName = xmlStrndup(start, ctxt->cur - start);
4506 if (blockName == NULL)
4507 xmlRegexpErrMemory(ctxt);
4508 } else {
4509 ERROR("Unknown char property");
4510 return;
4511 }
4512 if (ctxt->atom == NULL) {
4513 ctxt->atom = xmlRegNewAtom(ctxt, type);
4514 if (ctxt->atom == NULL) {
4515 xmlFree(blockName);
4516 return;
4517 }
4518 ctxt->atom->valuep = blockName;
4519 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4520 if (xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4521 type, 0, 0, blockName) == NULL) {
4522 xmlFree(blockName);
4523 }
4524 }
4525 }
4526
parse_escaped_codeunit(xmlRegParserCtxtPtr ctxt)4527 static int parse_escaped_codeunit(xmlRegParserCtxtPtr ctxt)
4528 {
4529 int val = 0, i, cur;
4530 for (i = 0; i < 4; i++) {
4531 NEXT;
4532 val *= 16;
4533 cur = CUR;
4534 if (cur >= '0' && cur <= '9') {
4535 val += cur - '0';
4536 } else if (cur >= 'A' && cur <= 'F') {
4537 val += cur - 'A' + 10;
4538 } else if (cur >= 'a' && cur <= 'f') {
4539 val += cur - 'a' + 10;
4540 } else {
4541 ERROR("Expecting hex digit");
4542 return -1;
4543 }
4544 }
4545 return val;
4546 }
4547
parse_escaped_codepoint(xmlRegParserCtxtPtr ctxt)4548 static int parse_escaped_codepoint(xmlRegParserCtxtPtr ctxt)
4549 {
4550 int val = parse_escaped_codeunit(ctxt);
4551 if (0xD800 <= val && val <= 0xDBFF) {
4552 NEXT;
4553 if (CUR == '\\') {
4554 NEXT;
4555 if (CUR == 'u') {
4556 int low = parse_escaped_codeunit(ctxt);
4557 if (0xDC00 <= low && low <= 0xDFFF) {
4558 return (val - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000;
4559 }
4560 }
4561 }
4562 ERROR("Invalid low surrogate pair code unit");
4563 val = -1;
4564 }
4565 return val;
4566 }
4567
4568 /**
4569 * xmlFAParseCharClassEsc:
4570 * @ctxt: a regexp parser context
4571 *
4572 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
4573 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
4574 * [25] catEsc ::= '\p{' charProp '}'
4575 * [26] complEsc ::= '\P{' charProp '}'
4576 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
4577 */
4578 static void
xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt)4579 xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
4580 int cur;
4581
4582 if (CUR == '.') {
4583 if (ctxt->atom == NULL) {
4584 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
4585 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4586 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4587 XML_REGEXP_ANYCHAR, 0, 0, NULL);
4588 }
4589 NEXT;
4590 return;
4591 }
4592 if (CUR != '\\') {
4593 ERROR("Escaped sequence: expecting \\");
4594 return;
4595 }
4596 NEXT;
4597 cur = CUR;
4598 if (cur == 'p') {
4599 NEXT;
4600 if (CUR != '{') {
4601 ERROR("Expecting '{'");
4602 return;
4603 }
4604 NEXT;
4605 xmlFAParseCharProp(ctxt);
4606 if (CUR != '}') {
4607 ERROR("Expecting '}'");
4608 return;
4609 }
4610 NEXT;
4611 } else if (cur == 'P') {
4612 NEXT;
4613 if (CUR != '{') {
4614 ERROR("Expecting '{'");
4615 return;
4616 }
4617 NEXT;
4618 xmlFAParseCharProp(ctxt);
4619 if (ctxt->atom != NULL)
4620 ctxt->atom->neg = 1;
4621 if (CUR != '}') {
4622 ERROR("Expecting '}'");
4623 return;
4624 }
4625 NEXT;
4626 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
4627 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
4628 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
4629 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
4630 (cur == 0x5E) ||
4631
4632 /* Non-standard escape sequences:
4633 * Java 1.8|.NET Core 3.1|MSXML 6 */
4634 (cur == '!') || /* + | + | + */
4635 (cur == '"') || /* + | + | + */
4636 (cur == '#') || /* + | + | + */
4637 (cur == '$') || /* + | + | + */
4638 (cur == '%') || /* + | + | + */
4639 (cur == ',') || /* + | + | + */
4640 (cur == '/') || /* + | + | + */
4641 (cur == ':') || /* + | + | + */
4642 (cur == ';') || /* + | + | + */
4643 (cur == '=') || /* + | + | + */
4644 (cur == '>') || /* | + | + */
4645 (cur == '@') || /* + | + | + */
4646 (cur == '`') || /* + | + | + */
4647 (cur == '~') || /* + | + | + */
4648 (cur == 'u')) { /* | + | + */
4649 if (ctxt->atom == NULL) {
4650 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
4651 if (ctxt->atom != NULL) {
4652 switch (cur) {
4653 case 'n':
4654 ctxt->atom->codepoint = '\n';
4655 break;
4656 case 'r':
4657 ctxt->atom->codepoint = '\r';
4658 break;
4659 case 't':
4660 ctxt->atom->codepoint = '\t';
4661 break;
4662 case 'u':
4663 cur = parse_escaped_codepoint(ctxt);
4664 if (cur < 0) {
4665 return;
4666 }
4667 ctxt->atom->codepoint = cur;
4668 break;
4669 default:
4670 ctxt->atom->codepoint = cur;
4671 }
4672 }
4673 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4674 switch (cur) {
4675 case 'n':
4676 cur = '\n';
4677 break;
4678 case 'r':
4679 cur = '\r';
4680 break;
4681 case 't':
4682 cur = '\t';
4683 break;
4684 }
4685 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4686 XML_REGEXP_CHARVAL, cur, cur, NULL);
4687 }
4688 NEXT;
4689 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
4690 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
4691 (cur == 'w') || (cur == 'W')) {
4692 xmlRegAtomType type = XML_REGEXP_ANYSPACE;
4693
4694 switch (cur) {
4695 case 's':
4696 type = XML_REGEXP_ANYSPACE;
4697 break;
4698 case 'S':
4699 type = XML_REGEXP_NOTSPACE;
4700 break;
4701 case 'i':
4702 type = XML_REGEXP_INITNAME;
4703 break;
4704 case 'I':
4705 type = XML_REGEXP_NOTINITNAME;
4706 break;
4707 case 'c':
4708 type = XML_REGEXP_NAMECHAR;
4709 break;
4710 case 'C':
4711 type = XML_REGEXP_NOTNAMECHAR;
4712 break;
4713 case 'd':
4714 type = XML_REGEXP_DECIMAL;
4715 break;
4716 case 'D':
4717 type = XML_REGEXP_NOTDECIMAL;
4718 break;
4719 case 'w':
4720 type = XML_REGEXP_REALCHAR;
4721 break;
4722 case 'W':
4723 type = XML_REGEXP_NOTREALCHAR;
4724 break;
4725 }
4726 NEXT;
4727 if (ctxt->atom == NULL) {
4728 ctxt->atom = xmlRegNewAtom(ctxt, type);
4729 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4730 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4731 type, 0, 0, NULL);
4732 }
4733 } else {
4734 ERROR("Wrong escape sequence, misuse of character '\\'");
4735 }
4736 }
4737
4738 /**
4739 * xmlFAParseCharRange:
4740 * @ctxt: a regexp parser context
4741 *
4742 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash
4743 * [18] seRange ::= charOrEsc '-' charOrEsc
4744 * [20] charOrEsc ::= XmlChar | SingleCharEsc
4745 * [21] XmlChar ::= [^\#x2D#x5B#x5D]
4746 * [22] XmlCharIncDash ::= [^\#x5B#x5D]
4747 */
4748 static void
xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt)4749 xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
4750 int cur, len;
4751 int start = -1;
4752 int end = -1;
4753
4754 if (CUR == '\0') {
4755 ERROR("Expecting ']'");
4756 return;
4757 }
4758
4759 cur = CUR;
4760 if (cur == '\\') {
4761 NEXT;
4762 cur = CUR;
4763 switch (cur) {
4764 case 'n': start = 0xA; break;
4765 case 'r': start = 0xD; break;
4766 case 't': start = 0x9; break;
4767 case '\\': case '|': case '.': case '-': case '^': case '?':
4768 case '*': case '+': case '{': case '}': case '(': case ')':
4769 case '[': case ']':
4770 start = cur; break;
4771 default:
4772 ERROR("Invalid escape value");
4773 return;
4774 }
4775 end = start;
4776 len = 1;
4777 } else if ((cur != 0x5B) && (cur != 0x5D)) {
4778 len = 4;
4779 end = start = xmlGetUTF8Char(ctxt->cur, &len);
4780 if (start < 0) {
4781 ERROR("Invalid UTF-8");
4782 return;
4783 }
4784 } else {
4785 ERROR("Expecting a char range");
4786 return;
4787 }
4788 /*
4789 * Since we are "inside" a range, we can assume ctxt->cur is past
4790 * the start of ctxt->string, and PREV should be safe
4791 */
4792 if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) {
4793 NEXTL(len);
4794 return;
4795 }
4796 NEXTL(len);
4797 cur = CUR;
4798 if ((cur != '-') || (NXT(1) == '[') || (NXT(1) == ']')) {
4799 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4800 XML_REGEXP_CHARVAL, start, end, NULL);
4801 return;
4802 }
4803 NEXT;
4804 cur = CUR;
4805 if (cur == '\\') {
4806 NEXT;
4807 cur = CUR;
4808 switch (cur) {
4809 case 'n': end = 0xA; break;
4810 case 'r': end = 0xD; break;
4811 case 't': end = 0x9; break;
4812 case '\\': case '|': case '.': case '-': case '^': case '?':
4813 case '*': case '+': case '{': case '}': case '(': case ')':
4814 case '[': case ']':
4815 end = cur; break;
4816 default:
4817 ERROR("Invalid escape value");
4818 return;
4819 }
4820 len = 1;
4821 } else if ((cur != '\0') && (cur != 0x5B) && (cur != 0x5D)) {
4822 len = 4;
4823 end = xmlGetUTF8Char(ctxt->cur, &len);
4824 if (end < 0) {
4825 ERROR("Invalid UTF-8");
4826 return;
4827 }
4828 } else {
4829 ERROR("Expecting the end of a char range");
4830 return;
4831 }
4832
4833 /* TODO check that the values are acceptable character ranges for XML */
4834 if (end < start) {
4835 ERROR("End of range is before start of range");
4836 } else {
4837 NEXTL(len);
4838 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4839 XML_REGEXP_CHARVAL, start, end, NULL);
4840 }
4841 }
4842
4843 /**
4844 * xmlFAParsePosCharGroup:
4845 * @ctxt: a regexp parser context
4846 *
4847 * [14] posCharGroup ::= ( charRange | charClassEsc )+
4848 */
4849 static void
xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt)4850 xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
4851 do {
4852 if (CUR == '\\') {
4853 xmlFAParseCharClassEsc(ctxt);
4854 } else {
4855 xmlFAParseCharRange(ctxt);
4856 }
4857 } while ((CUR != ']') && (CUR != '-') &&
4858 (CUR != 0) && (ctxt->error == 0));
4859 }
4860
4861 /**
4862 * xmlFAParseCharGroup:
4863 * @ctxt: a regexp parser context
4864 *
4865 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub
4866 * [15] negCharGroup ::= '^' posCharGroup
4867 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
4868 * [12] charClassExpr ::= '[' charGroup ']'
4869 */
4870 static void
xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt)4871 xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
4872 int neg = ctxt->neg;
4873
4874 if (CUR == '^') {
4875 NEXT;
4876 ctxt->neg = !ctxt->neg;
4877 xmlFAParsePosCharGroup(ctxt);
4878 ctxt->neg = neg;
4879 }
4880 while ((CUR != ']') && (ctxt->error == 0)) {
4881 if ((CUR == '-') && (NXT(1) == '[')) {
4882 NEXT; /* eat the '-' */
4883 NEXT; /* eat the '[' */
4884 ctxt->neg = 2;
4885 xmlFAParseCharGroup(ctxt);
4886 ctxt->neg = neg;
4887 if (CUR == ']') {
4888 NEXT;
4889 } else {
4890 ERROR("charClassExpr: ']' expected");
4891 }
4892 break;
4893 } else {
4894 xmlFAParsePosCharGroup(ctxt);
4895 }
4896 }
4897 }
4898
4899 /**
4900 * xmlFAParseCharClass:
4901 * @ctxt: a regexp parser context
4902 *
4903 * [11] charClass ::= charClassEsc | charClassExpr
4904 * [12] charClassExpr ::= '[' charGroup ']'
4905 */
4906 static void
xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt)4907 xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
4908 if (CUR == '[') {
4909 NEXT;
4910 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
4911 if (ctxt->atom == NULL)
4912 return;
4913 xmlFAParseCharGroup(ctxt);
4914 if (CUR == ']') {
4915 NEXT;
4916 } else {
4917 ERROR("xmlFAParseCharClass: ']' expected");
4918 }
4919 } else {
4920 xmlFAParseCharClassEsc(ctxt);
4921 }
4922 }
4923
4924 /**
4925 * xmlFAParseQuantExact:
4926 * @ctxt: a regexp parser context
4927 *
4928 * [8] QuantExact ::= [0-9]+
4929 *
4930 * Returns 0 if success or -1 in case of error
4931 */
4932 static int
xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt)4933 xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
4934 int ret = 0;
4935 int ok = 0;
4936 int overflow = 0;
4937
4938 while ((CUR >= '0') && (CUR <= '9')) {
4939 if (ret > INT_MAX / 10) {
4940 overflow = 1;
4941 } else {
4942 int digit = CUR - '0';
4943
4944 ret *= 10;
4945 if (ret > INT_MAX - digit)
4946 overflow = 1;
4947 else
4948 ret += digit;
4949 }
4950 ok = 1;
4951 NEXT;
4952 }
4953 if ((ok != 1) || (overflow == 1)) {
4954 return(-1);
4955 }
4956 return(ret);
4957 }
4958
4959 /**
4960 * xmlFAParseQuantifier:
4961 * @ctxt: a regexp parser context
4962 *
4963 * [4] quantifier ::= [?*+] | ( '{' quantity '}' )
4964 * [5] quantity ::= quantRange | quantMin | QuantExact
4965 * [6] quantRange ::= QuantExact ',' QuantExact
4966 * [7] quantMin ::= QuantExact ','
4967 * [8] QuantExact ::= [0-9]+
4968 */
4969 static int
xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt)4970 xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
4971 int cur;
4972
4973 cur = CUR;
4974 if ((cur == '?') || (cur == '*') || (cur == '+')) {
4975 if (ctxt->atom != NULL) {
4976 if (cur == '?')
4977 ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
4978 else if (cur == '*')
4979 ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
4980 else if (cur == '+')
4981 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
4982 }
4983 NEXT;
4984 return(1);
4985 }
4986 if (cur == '{') {
4987 int min = 0, max = 0;
4988
4989 NEXT;
4990 cur = xmlFAParseQuantExact(ctxt);
4991 if (cur >= 0)
4992 min = cur;
4993 else {
4994 ERROR("Improper quantifier");
4995 }
4996 if (CUR == ',') {
4997 NEXT;
4998 if (CUR == '}')
4999 max = INT_MAX;
5000 else {
5001 cur = xmlFAParseQuantExact(ctxt);
5002 if (cur >= 0)
5003 max = cur;
5004 else {
5005 ERROR("Improper quantifier");
5006 }
5007 }
5008 }
5009 if (CUR == '}') {
5010 NEXT;
5011 } else {
5012 ERROR("Unterminated quantifier");
5013 }
5014 if (max == 0)
5015 max = min;
5016 if (ctxt->atom != NULL) {
5017 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
5018 ctxt->atom->min = min;
5019 ctxt->atom->max = max;
5020 }
5021 return(1);
5022 }
5023 return(0);
5024 }
5025
5026 /**
5027 * xmlFAParseAtom:
5028 * @ctxt: a regexp parser context
5029 *
5030 * [9] atom ::= Char | charClass | ( '(' regExp ')' )
5031 */
5032 static int
xmlFAParseAtom(xmlRegParserCtxtPtr ctxt)5033 xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
5034 int codepoint, len;
5035
5036 codepoint = xmlFAIsChar(ctxt);
5037 if (codepoint > 0) {
5038 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
5039 if (ctxt->atom == NULL)
5040 return(-1);
5041 len = 4;
5042 codepoint = xmlGetUTF8Char(ctxt->cur, &len);
5043 if (codepoint < 0) {
5044 ERROR("Invalid UTF-8");
5045 return(-1);
5046 }
5047 ctxt->atom->codepoint = codepoint;
5048 NEXTL(len);
5049 return(1);
5050 } else if (CUR == '|') {
5051 return(0);
5052 } else if (CUR == 0) {
5053 return(0);
5054 } else if (CUR == ')') {
5055 return(0);
5056 } else if (CUR == '(') {
5057 xmlRegStatePtr start, oldend, start0;
5058
5059 NEXT;
5060 if (ctxt->depth >= 50) {
5061 ERROR("xmlFAParseAtom: maximum nesting depth exceeded");
5062 return(-1);
5063 }
5064 /*
5065 * this extra Epsilon transition is needed if we count with 0 allowed
5066 * unfortunately this can't be known at that point
5067 */
5068 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5069 start0 = ctxt->state;
5070 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5071 start = ctxt->state;
5072 oldend = ctxt->end;
5073 ctxt->end = NULL;
5074 ctxt->atom = NULL;
5075 ctxt->depth++;
5076 xmlFAParseRegExp(ctxt, 0);
5077 ctxt->depth--;
5078 if (CUR == ')') {
5079 NEXT;
5080 } else {
5081 ERROR("xmlFAParseAtom: expecting ')'");
5082 }
5083 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
5084 if (ctxt->atom == NULL)
5085 return(-1);
5086 ctxt->atom->start = start;
5087 ctxt->atom->start0 = start0;
5088 ctxt->atom->stop = ctxt->state;
5089 ctxt->end = oldend;
5090 return(1);
5091 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
5092 xmlFAParseCharClass(ctxt);
5093 return(1);
5094 }
5095 return(0);
5096 }
5097
5098 /**
5099 * xmlFAParsePiece:
5100 * @ctxt: a regexp parser context
5101 *
5102 * [3] piece ::= atom quantifier?
5103 */
5104 static int
xmlFAParsePiece(xmlRegParserCtxtPtr ctxt)5105 xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
5106 int ret;
5107
5108 ctxt->atom = NULL;
5109 ret = xmlFAParseAtom(ctxt);
5110 if (ret == 0)
5111 return(0);
5112 if (ctxt->atom == NULL) {
5113 ERROR("internal: no atom generated");
5114 }
5115 xmlFAParseQuantifier(ctxt);
5116 return(1);
5117 }
5118
5119 /**
5120 * xmlFAParseBranch:
5121 * @ctxt: a regexp parser context
5122 * @to: optional target to the end of the branch
5123 *
5124 * @to is used to optimize by removing duplicate path in automata
5125 * in expressions like (a|b)(c|d)
5126 *
5127 * [2] branch ::= piece*
5128 */
5129 static int
xmlFAParseBranch(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr to)5130 xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) {
5131 xmlRegStatePtr previous;
5132 int ret;
5133
5134 previous = ctxt->state;
5135 ret = xmlFAParsePiece(ctxt);
5136 if (ret == 0) {
5137 /* Empty branch */
5138 xmlFAGenerateEpsilonTransition(ctxt, previous, to);
5139 } else {
5140 if (xmlFAGenerateTransitions(ctxt, previous,
5141 (CUR=='|' || CUR==')' || CUR==0) ? to : NULL,
5142 ctxt->atom) < 0) {
5143 xmlRegFreeAtom(ctxt->atom);
5144 ctxt->atom = NULL;
5145 return(-1);
5146 }
5147 previous = ctxt->state;
5148 ctxt->atom = NULL;
5149 }
5150 while ((ret != 0) && (ctxt->error == 0)) {
5151 ret = xmlFAParsePiece(ctxt);
5152 if (ret != 0) {
5153 if (xmlFAGenerateTransitions(ctxt, previous,
5154 (CUR=='|' || CUR==')' || CUR==0) ? to : NULL,
5155 ctxt->atom) < 0) {
5156 xmlRegFreeAtom(ctxt->atom);
5157 ctxt->atom = NULL;
5158 return(-1);
5159 }
5160 previous = ctxt->state;
5161 ctxt->atom = NULL;
5162 }
5163 }
5164 return(0);
5165 }
5166
5167 /**
5168 * xmlFAParseRegExp:
5169 * @ctxt: a regexp parser context
5170 * @top: is this the top-level expression ?
5171 *
5172 * [1] regExp ::= branch ( '|' branch )*
5173 */
5174 static void
xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt,int top)5175 xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
5176 xmlRegStatePtr start, end;
5177
5178 /* if not top start should have been generated by an epsilon trans */
5179 start = ctxt->state;
5180 ctxt->end = NULL;
5181 xmlFAParseBranch(ctxt, NULL);
5182 if (top) {
5183 ctxt->state->type = XML_REGEXP_FINAL_STATE;
5184 }
5185 if (CUR != '|') {
5186 ctxt->end = ctxt->state;
5187 return;
5188 }
5189 end = ctxt->state;
5190 while ((CUR == '|') && (ctxt->error == 0)) {
5191 NEXT;
5192 ctxt->state = start;
5193 ctxt->end = NULL;
5194 xmlFAParseBranch(ctxt, end);
5195 }
5196 if (!top) {
5197 ctxt->state = end;
5198 ctxt->end = end;
5199 }
5200 }
5201
5202 /************************************************************************
5203 * *
5204 * The basic API *
5205 * *
5206 ************************************************************************/
5207
5208 /**
5209 * xmlRegexpPrint:
5210 * @output: the file for the output debug
5211 * @regexp: the compiled regexp
5212 *
5213 * Print the content of the compiled regular expression
5214 */
5215 void
xmlRegexpPrint(FILE * output,xmlRegexpPtr regexp)5216 xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
5217 int i;
5218
5219 if (output == NULL)
5220 return;
5221 fprintf(output, " regexp: ");
5222 if (regexp == NULL) {
5223 fprintf(output, "NULL\n");
5224 return;
5225 }
5226 fprintf(output, "'%s' ", regexp->string);
5227 fprintf(output, "\n");
5228 fprintf(output, "%d atoms:\n", regexp->nbAtoms);
5229 for (i = 0;i < regexp->nbAtoms; i++) {
5230 fprintf(output, " %02d ", i);
5231 xmlRegPrintAtom(output, regexp->atoms[i]);
5232 }
5233 fprintf(output, "%d states:", regexp->nbStates);
5234 fprintf(output, "\n");
5235 for (i = 0;i < regexp->nbStates; i++) {
5236 xmlRegPrintState(output, regexp->states[i]);
5237 }
5238 fprintf(output, "%d counters:\n", regexp->nbCounters);
5239 for (i = 0;i < regexp->nbCounters; i++) {
5240 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
5241 regexp->counters[i].max);
5242 }
5243 }
5244
5245 /**
5246 * xmlRegexpCompile:
5247 * @regexp: a regular expression string
5248 *
5249 * Parses a regular expression conforming to XML Schemas Part 2 Datatype
5250 * Appendix F and builds an automata suitable for testing strings against
5251 * that regular expression
5252 *
5253 * Returns the compiled expression or NULL in case of error
5254 */
5255 xmlRegexpPtr
xmlRegexpCompile(const xmlChar * regexp)5256 xmlRegexpCompile(const xmlChar *regexp) {
5257 xmlRegexpPtr ret = NULL;
5258 xmlRegParserCtxtPtr ctxt;
5259
5260 if (regexp == NULL)
5261 return(NULL);
5262
5263 ctxt = xmlRegNewParserCtxt(regexp);
5264 if (ctxt == NULL)
5265 return(NULL);
5266
5267 /* initialize the parser */
5268 ctxt->state = xmlRegStatePush(ctxt);
5269 if (ctxt->state == NULL)
5270 goto error;
5271 ctxt->start = ctxt->state;
5272 ctxt->end = NULL;
5273
5274 /* parse the expression building an automata */
5275 xmlFAParseRegExp(ctxt, 1);
5276 if (CUR != 0) {
5277 ERROR("xmlFAParseRegExp: extra characters");
5278 }
5279 if (ctxt->error != 0)
5280 goto error;
5281 ctxt->end = ctxt->state;
5282 ctxt->start->type = XML_REGEXP_START_STATE;
5283 ctxt->end->type = XML_REGEXP_FINAL_STATE;
5284
5285 /* remove the Epsilon except for counted transitions */
5286 xmlFAEliminateEpsilonTransitions(ctxt);
5287
5288
5289 if (ctxt->error != 0)
5290 goto error;
5291 ret = xmlRegEpxFromParse(ctxt);
5292
5293 error:
5294 xmlRegFreeParserCtxt(ctxt);
5295 return(ret);
5296 }
5297
5298 /**
5299 * xmlRegexpExec:
5300 * @comp: the compiled regular expression
5301 * @content: the value to check against the regular expression
5302 *
5303 * Check if the regular expression generates the value
5304 *
5305 * Returns 1 if it matches, 0 if not and a negative value in case of error
5306 */
5307 int
xmlRegexpExec(xmlRegexpPtr comp,const xmlChar * content)5308 xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
5309 if ((comp == NULL) || (content == NULL))
5310 return(-1);
5311 return(xmlFARegExec(comp, content));
5312 }
5313
5314 /**
5315 * xmlRegexpIsDeterminist:
5316 * @comp: the compiled regular expression
5317 *
5318 * Check if the regular expression is determinist
5319 *
5320 * Returns 1 if it yes, 0 if not and a negative value in case of error
5321 */
5322 int
xmlRegexpIsDeterminist(xmlRegexpPtr comp)5323 xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
5324 xmlAutomataPtr am;
5325 int ret;
5326
5327 if (comp == NULL)
5328 return(-1);
5329 if (comp->determinist != -1)
5330 return(comp->determinist);
5331
5332 am = xmlNewAutomata();
5333 if (am == NULL)
5334 return(-1);
5335 if (am->states != NULL) {
5336 int i;
5337
5338 for (i = 0;i < am->nbStates;i++)
5339 xmlRegFreeState(am->states[i]);
5340 xmlFree(am->states);
5341 }
5342 am->nbAtoms = comp->nbAtoms;
5343 am->atoms = comp->atoms;
5344 am->nbStates = comp->nbStates;
5345 am->states = comp->states;
5346 am->determinist = -1;
5347 am->flags = comp->flags;
5348 ret = xmlFAComputesDeterminism(am);
5349 am->atoms = NULL;
5350 am->states = NULL;
5351 xmlFreeAutomata(am);
5352 comp->determinist = ret;
5353 return(ret);
5354 }
5355
5356 /**
5357 * xmlRegFreeRegexp:
5358 * @regexp: the regexp
5359 *
5360 * Free a regexp
5361 */
5362 void
xmlRegFreeRegexp(xmlRegexpPtr regexp)5363 xmlRegFreeRegexp(xmlRegexpPtr regexp) {
5364 int i;
5365 if (regexp == NULL)
5366 return;
5367
5368 if (regexp->string != NULL)
5369 xmlFree(regexp->string);
5370 if (regexp->states != NULL) {
5371 for (i = 0;i < regexp->nbStates;i++)
5372 xmlRegFreeState(regexp->states[i]);
5373 xmlFree(regexp->states);
5374 }
5375 if (regexp->atoms != NULL) {
5376 for (i = 0;i < regexp->nbAtoms;i++)
5377 xmlRegFreeAtom(regexp->atoms[i]);
5378 xmlFree(regexp->atoms);
5379 }
5380 if (regexp->counters != NULL)
5381 xmlFree(regexp->counters);
5382 if (regexp->compact != NULL)
5383 xmlFree(regexp->compact);
5384 if (regexp->transdata != NULL)
5385 xmlFree(regexp->transdata);
5386 if (regexp->stringMap != NULL) {
5387 for (i = 0; i < regexp->nbstrings;i++)
5388 xmlFree(regexp->stringMap[i]);
5389 xmlFree(regexp->stringMap);
5390 }
5391
5392 xmlFree(regexp);
5393 }
5394
5395 /************************************************************************
5396 * *
5397 * The Automata interface *
5398 * *
5399 ************************************************************************/
5400
5401 /**
5402 * xmlNewAutomata:
5403 *
5404 * Create a new automata
5405 *
5406 * Returns the new object or NULL in case of failure
5407 */
5408 xmlAutomataPtr
xmlNewAutomata(void)5409 xmlNewAutomata(void) {
5410 xmlAutomataPtr ctxt;
5411
5412 ctxt = xmlRegNewParserCtxt(NULL);
5413 if (ctxt == NULL)
5414 return(NULL);
5415
5416 /* initialize the parser */
5417 ctxt->state = xmlRegStatePush(ctxt);
5418 if (ctxt->state == NULL) {
5419 xmlFreeAutomata(ctxt);
5420 return(NULL);
5421 }
5422 ctxt->start = ctxt->state;
5423 ctxt->end = NULL;
5424
5425 ctxt->start->type = XML_REGEXP_START_STATE;
5426 ctxt->flags = 0;
5427
5428 return(ctxt);
5429 }
5430
5431 /**
5432 * xmlFreeAutomata:
5433 * @am: an automata
5434 *
5435 * Free an automata
5436 */
5437 void
xmlFreeAutomata(xmlAutomataPtr am)5438 xmlFreeAutomata(xmlAutomataPtr am) {
5439 if (am == NULL)
5440 return;
5441 xmlRegFreeParserCtxt(am);
5442 }
5443
5444 /**
5445 * xmlAutomataSetFlags:
5446 * @am: an automata
5447 * @flags: a set of internal flags
5448 *
5449 * Set some flags on the automata
5450 */
5451 void
xmlAutomataSetFlags(xmlAutomataPtr am,int flags)5452 xmlAutomataSetFlags(xmlAutomataPtr am, int flags) {
5453 if (am == NULL)
5454 return;
5455 am->flags |= flags;
5456 }
5457
5458 /**
5459 * xmlAutomataGetInitState:
5460 * @am: an automata
5461 *
5462 * Initial state lookup
5463 *
5464 * Returns the initial state of the automata
5465 */
5466 xmlAutomataStatePtr
xmlAutomataGetInitState(xmlAutomataPtr am)5467 xmlAutomataGetInitState(xmlAutomataPtr am) {
5468 if (am == NULL)
5469 return(NULL);
5470 return(am->start);
5471 }
5472
5473 /**
5474 * xmlAutomataSetFinalState:
5475 * @am: an automata
5476 * @state: a state in this automata
5477 *
5478 * Makes that state a final state
5479 *
5480 * Returns 0 or -1 in case of error
5481 */
5482 int
xmlAutomataSetFinalState(xmlAutomataPtr am,xmlAutomataStatePtr state)5483 xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
5484 if ((am == NULL) || (state == NULL))
5485 return(-1);
5486 state->type = XML_REGEXP_FINAL_STATE;
5487 return(0);
5488 }
5489
5490 /**
5491 * xmlAutomataNewTransition:
5492 * @am: an automata
5493 * @from: the starting point of the transition
5494 * @to: the target point of the transition or NULL
5495 * @token: the input string associated to that transition
5496 * @data: data passed to the callback function if the transition is activated
5497 *
5498 * If @to is NULL, this creates first a new target state in the automata
5499 * and then adds a transition from the @from state to the target state
5500 * activated by the value of @token
5501 *
5502 * Returns the target state or NULL in case of error
5503 */
5504 xmlAutomataStatePtr
xmlAutomataNewTransition(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,void * data)5505 xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
5506 xmlAutomataStatePtr to, const xmlChar *token,
5507 void *data) {
5508 xmlRegAtomPtr atom;
5509
5510 if ((am == NULL) || (from == NULL) || (token == NULL))
5511 return(NULL);
5512 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5513 if (atom == NULL)
5514 return(NULL);
5515 atom->data = data;
5516 atom->valuep = xmlStrdup(token);
5517 if (atom->valuep == NULL) {
5518 xmlRegFreeAtom(atom);
5519 xmlRegexpErrMemory(am);
5520 return(NULL);
5521 }
5522
5523 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5524 xmlRegFreeAtom(atom);
5525 return(NULL);
5526 }
5527 if (to == NULL)
5528 return(am->state);
5529 return(to);
5530 }
5531
5532 /**
5533 * xmlAutomataNewTransition2:
5534 * @am: an automata
5535 * @from: the starting point of the transition
5536 * @to: the target point of the transition or NULL
5537 * @token: the first input string associated to that transition
5538 * @token2: the second input string associated to that transition
5539 * @data: data passed to the callback function if the transition is activated
5540 *
5541 * If @to is NULL, this creates first a new target state in the automata
5542 * and then adds a transition from the @from state to the target state
5543 * activated by the value of @token
5544 *
5545 * Returns the target state or NULL in case of error
5546 */
5547 xmlAutomataStatePtr
xmlAutomataNewTransition2(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,void * data)5548 xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5549 xmlAutomataStatePtr to, const xmlChar *token,
5550 const xmlChar *token2, void *data) {
5551 xmlRegAtomPtr atom;
5552
5553 if ((am == NULL) || (from == NULL) || (token == NULL))
5554 return(NULL);
5555 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5556 if (atom == NULL)
5557 return(NULL);
5558 atom->data = data;
5559 if ((token2 == NULL) || (*token2 == 0)) {
5560 atom->valuep = xmlStrdup(token);
5561 } else {
5562 int lenn, lenp;
5563 xmlChar *str;
5564
5565 lenn = strlen((char *) token2);
5566 lenp = strlen((char *) token);
5567
5568 str = xmlMalloc(lenn + lenp + 2);
5569 if (str == NULL) {
5570 xmlRegFreeAtom(atom);
5571 return(NULL);
5572 }
5573 memcpy(&str[0], token, lenp);
5574 str[lenp] = '|';
5575 memcpy(&str[lenp + 1], token2, lenn);
5576 str[lenn + lenp + 1] = 0;
5577
5578 atom->valuep = str;
5579 }
5580
5581 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5582 xmlRegFreeAtom(atom);
5583 return(NULL);
5584 }
5585 if (to == NULL)
5586 return(am->state);
5587 return(to);
5588 }
5589
5590 /**
5591 * xmlAutomataNewNegTrans:
5592 * @am: an automata
5593 * @from: the starting point of the transition
5594 * @to: the target point of the transition or NULL
5595 * @token: the first input string associated to that transition
5596 * @token2: the second input string associated to that transition
5597 * @data: data passed to the callback function if the transition is activated
5598 *
5599 * If @to is NULL, this creates first a new target state in the automata
5600 * and then adds a transition from the @from state to the target state
5601 * activated by any value except (@token,@token2)
5602 * Note that if @token2 is not NULL, then (X, NULL) won't match to follow
5603 # the semantic of XSD ##other
5604 *
5605 * Returns the target state or NULL in case of error
5606 */
5607 xmlAutomataStatePtr
xmlAutomataNewNegTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,void * data)5608 xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5609 xmlAutomataStatePtr to, const xmlChar *token,
5610 const xmlChar *token2, void *data) {
5611 xmlRegAtomPtr atom;
5612 xmlChar err_msg[200];
5613
5614 if ((am == NULL) || (from == NULL) || (token == NULL))
5615 return(NULL);
5616 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5617 if (atom == NULL)
5618 return(NULL);
5619 atom->data = data;
5620 atom->neg = 1;
5621 if ((token2 == NULL) || (*token2 == 0)) {
5622 atom->valuep = xmlStrdup(token);
5623 } else {
5624 int lenn, lenp;
5625 xmlChar *str;
5626
5627 lenn = strlen((char *) token2);
5628 lenp = strlen((char *) token);
5629
5630 str = xmlMalloc(lenn + lenp + 2);
5631 if (str == NULL) {
5632 xmlRegFreeAtom(atom);
5633 return(NULL);
5634 }
5635 memcpy(&str[0], token, lenp);
5636 str[lenp] = '|';
5637 memcpy(&str[lenp + 1], token2, lenn);
5638 str[lenn + lenp + 1] = 0;
5639
5640 atom->valuep = str;
5641 }
5642 snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep);
5643 err_msg[199] = 0;
5644 atom->valuep2 = xmlStrdup(err_msg);
5645
5646 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5647 xmlRegFreeAtom(atom);
5648 return(NULL);
5649 }
5650 am->negs++;
5651 if (to == NULL)
5652 return(am->state);
5653 return(to);
5654 }
5655
5656 /**
5657 * xmlAutomataNewCountTrans2:
5658 * @am: an automata
5659 * @from: the starting point of the transition
5660 * @to: the target point of the transition or NULL
5661 * @token: the input string associated to that transition
5662 * @token2: the second input string associated to that transition
5663 * @min: the minimum successive occurrences of token
5664 * @max: the maximum successive occurrences of token
5665 * @data: data associated to the transition
5666 *
5667 * If @to is NULL, this creates first a new target state in the automata
5668 * and then adds a transition from the @from state to the target state
5669 * activated by a succession of input of value @token and @token2 and
5670 * whose number is between @min and @max
5671 *
5672 * Returns the target state or NULL in case of error
5673 */
5674 xmlAutomataStatePtr
xmlAutomataNewCountTrans2(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,int min,int max,void * data)5675 xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5676 xmlAutomataStatePtr to, const xmlChar *token,
5677 const xmlChar *token2,
5678 int min, int max, void *data) {
5679 xmlRegAtomPtr atom;
5680 int counter;
5681
5682 if ((am == NULL) || (from == NULL) || (token == NULL))
5683 return(NULL);
5684 if (min < 0)
5685 return(NULL);
5686 if ((max < min) || (max < 1))
5687 return(NULL);
5688 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5689 if (atom == NULL)
5690 return(NULL);
5691 if ((token2 == NULL) || (*token2 == 0)) {
5692 atom->valuep = xmlStrdup(token);
5693 if (atom->valuep == NULL)
5694 goto error;
5695 } else {
5696 int lenn, lenp;
5697 xmlChar *str;
5698
5699 lenn = strlen((char *) token2);
5700 lenp = strlen((char *) token);
5701
5702 str = xmlMalloc(lenn + lenp + 2);
5703 if (str == NULL)
5704 goto error;
5705 memcpy(&str[0], token, lenp);
5706 str[lenp] = '|';
5707 memcpy(&str[lenp + 1], token2, lenn);
5708 str[lenn + lenp + 1] = 0;
5709
5710 atom->valuep = str;
5711 }
5712 atom->data = data;
5713 if (min == 0)
5714 atom->min = 1;
5715 else
5716 atom->min = min;
5717 atom->max = max;
5718
5719 /*
5720 * associate a counter to the transition.
5721 */
5722 counter = xmlRegGetCounter(am);
5723 if (counter < 0)
5724 goto error;
5725 am->counters[counter].min = min;
5726 am->counters[counter].max = max;
5727
5728 /* xmlFAGenerateTransitions(am, from, to, atom); */
5729 if (to == NULL) {
5730 to = xmlRegStatePush(am);
5731 if (to == NULL)
5732 goto error;
5733 }
5734 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
5735 if (xmlRegAtomPush(am, atom) < 0)
5736 goto error;
5737 am->state = to;
5738
5739 if (to == NULL)
5740 to = am->state;
5741 if (to == NULL)
5742 return(NULL);
5743 if (min == 0)
5744 xmlFAGenerateEpsilonTransition(am, from, to);
5745 return(to);
5746
5747 error:
5748 xmlRegFreeAtom(atom);
5749 return(NULL);
5750 }
5751
5752 /**
5753 * xmlAutomataNewCountTrans:
5754 * @am: an automata
5755 * @from: the starting point of the transition
5756 * @to: the target point of the transition or NULL
5757 * @token: the input string associated to that transition
5758 * @min: the minimum successive occurrences of token
5759 * @max: the maximum successive occurrences of token
5760 * @data: data associated to the transition
5761 *
5762 * If @to is NULL, this creates first a new target state in the automata
5763 * and then adds a transition from the @from state to the target state
5764 * activated by a succession of input of value @token and whose number
5765 * is between @min and @max
5766 *
5767 * Returns the target state or NULL in case of error
5768 */
5769 xmlAutomataStatePtr
xmlAutomataNewCountTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,int min,int max,void * data)5770 xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5771 xmlAutomataStatePtr to, const xmlChar *token,
5772 int min, int max, void *data) {
5773 xmlRegAtomPtr atom;
5774 int counter;
5775
5776 if ((am == NULL) || (from == NULL) || (token == NULL))
5777 return(NULL);
5778 if (min < 0)
5779 return(NULL);
5780 if ((max < min) || (max < 1))
5781 return(NULL);
5782 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5783 if (atom == NULL)
5784 return(NULL);
5785 atom->valuep = xmlStrdup(token);
5786 if (atom->valuep == NULL)
5787 goto error;
5788 atom->data = data;
5789 if (min == 0)
5790 atom->min = 1;
5791 else
5792 atom->min = min;
5793 atom->max = max;
5794
5795 /*
5796 * associate a counter to the transition.
5797 */
5798 counter = xmlRegGetCounter(am);
5799 if (counter < 0)
5800 goto error;
5801 am->counters[counter].min = min;
5802 am->counters[counter].max = max;
5803
5804 /* xmlFAGenerateTransitions(am, from, to, atom); */
5805 if (to == NULL) {
5806 to = xmlRegStatePush(am);
5807 if (to == NULL)
5808 goto error;
5809 }
5810 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
5811 if (xmlRegAtomPush(am, atom) < 0)
5812 goto error;
5813 am->state = to;
5814
5815 if (to == NULL)
5816 to = am->state;
5817 if (to == NULL)
5818 return(NULL);
5819 if (min == 0)
5820 xmlFAGenerateEpsilonTransition(am, from, to);
5821 return(to);
5822
5823 error:
5824 xmlRegFreeAtom(atom);
5825 return(NULL);
5826 }
5827
5828 /**
5829 * xmlAutomataNewOnceTrans2:
5830 * @am: an automata
5831 * @from: the starting point of the transition
5832 * @to: the target point of the transition or NULL
5833 * @token: the input string associated to that transition
5834 * @token2: the second input string associated to that transition
5835 * @min: the minimum successive occurrences of token
5836 * @max: the maximum successive occurrences of token
5837 * @data: data associated to the transition
5838 *
5839 * If @to is NULL, this creates first a new target state in the automata
5840 * and then adds a transition from the @from state to the target state
5841 * activated by a succession of input of value @token and @token2 and whose
5842 * number is between @min and @max, moreover that transition can only be
5843 * crossed once.
5844 *
5845 * Returns the target state or NULL in case of error
5846 */
5847 xmlAutomataStatePtr
xmlAutomataNewOnceTrans2(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,int min,int max,void * data)5848 xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5849 xmlAutomataStatePtr to, const xmlChar *token,
5850 const xmlChar *token2,
5851 int min, int max, void *data) {
5852 xmlRegAtomPtr atom;
5853 int counter;
5854
5855 if ((am == NULL) || (from == NULL) || (token == NULL))
5856 return(NULL);
5857 if (min < 1)
5858 return(NULL);
5859 if (max < min)
5860 return(NULL);
5861 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5862 if (atom == NULL)
5863 return(NULL);
5864 if ((token2 == NULL) || (*token2 == 0)) {
5865 atom->valuep = xmlStrdup(token);
5866 if (atom->valuep == NULL)
5867 goto error;
5868 } else {
5869 int lenn, lenp;
5870 xmlChar *str;
5871
5872 lenn = strlen((char *) token2);
5873 lenp = strlen((char *) token);
5874
5875 str = xmlMalloc(lenn + lenp + 2);
5876 if (str == NULL)
5877 goto error;
5878 memcpy(&str[0], token, lenp);
5879 str[lenp] = '|';
5880 memcpy(&str[lenp + 1], token2, lenn);
5881 str[lenn + lenp + 1] = 0;
5882
5883 atom->valuep = str;
5884 }
5885 atom->data = data;
5886 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
5887 atom->min = min;
5888 atom->max = max;
5889 /*
5890 * associate a counter to the transition.
5891 */
5892 counter = xmlRegGetCounter(am);
5893 if (counter < 0)
5894 goto error;
5895 am->counters[counter].min = 1;
5896 am->counters[counter].max = 1;
5897
5898 /* xmlFAGenerateTransitions(am, from, to, atom); */
5899 if (to == NULL) {
5900 to = xmlRegStatePush(am);
5901 if (to == NULL)
5902 goto error;
5903 }
5904 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
5905 if (xmlRegAtomPush(am, atom) < 0)
5906 goto error;
5907 am->state = to;
5908 return(to);
5909
5910 error:
5911 xmlRegFreeAtom(atom);
5912 return(NULL);
5913 }
5914
5915
5916
5917 /**
5918 * xmlAutomataNewOnceTrans:
5919 * @am: an automata
5920 * @from: the starting point of the transition
5921 * @to: the target point of the transition or NULL
5922 * @token: the input string associated to that transition
5923 * @min: the minimum successive occurrences of token
5924 * @max: the maximum successive occurrences of token
5925 * @data: data associated to the transition
5926 *
5927 * If @to is NULL, this creates first a new target state in the automata
5928 * and then adds a transition from the @from state to the target state
5929 * activated by a succession of input of value @token and whose number
5930 * is between @min and @max, moreover that transition can only be crossed
5931 * once.
5932 *
5933 * Returns the target state or NULL in case of error
5934 */
5935 xmlAutomataStatePtr
xmlAutomataNewOnceTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,int min,int max,void * data)5936 xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5937 xmlAutomataStatePtr to, const xmlChar *token,
5938 int min, int max, void *data) {
5939 xmlRegAtomPtr atom;
5940 int counter;
5941
5942 if ((am == NULL) || (from == NULL) || (token == NULL))
5943 return(NULL);
5944 if (min < 1)
5945 return(NULL);
5946 if (max < min)
5947 return(NULL);
5948 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5949 if (atom == NULL)
5950 return(NULL);
5951 atom->valuep = xmlStrdup(token);
5952 atom->data = data;
5953 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
5954 atom->min = min;
5955 atom->max = max;
5956 /*
5957 * associate a counter to the transition.
5958 */
5959 counter = xmlRegGetCounter(am);
5960 if (counter < 0)
5961 goto error;
5962 am->counters[counter].min = 1;
5963 am->counters[counter].max = 1;
5964
5965 /* xmlFAGenerateTransitions(am, from, to, atom); */
5966 if (to == NULL) {
5967 to = xmlRegStatePush(am);
5968 if (to == NULL)
5969 goto error;
5970 }
5971 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
5972 if (xmlRegAtomPush(am, atom) < 0)
5973 goto error;
5974 am->state = to;
5975 return(to);
5976
5977 error:
5978 xmlRegFreeAtom(atom);
5979 return(NULL);
5980 }
5981
5982 /**
5983 * xmlAutomataNewState:
5984 * @am: an automata
5985 *
5986 * Create a new disconnected state in the automata
5987 *
5988 * Returns the new state or NULL in case of error
5989 */
5990 xmlAutomataStatePtr
xmlAutomataNewState(xmlAutomataPtr am)5991 xmlAutomataNewState(xmlAutomataPtr am) {
5992 if (am == NULL)
5993 return(NULL);
5994 return(xmlRegStatePush(am));
5995 }
5996
5997 /**
5998 * xmlAutomataNewEpsilon:
5999 * @am: an automata
6000 * @from: the starting point of the transition
6001 * @to: the target point of the transition or NULL
6002 *
6003 * If @to is NULL, this creates first a new target state in the automata
6004 * and then adds an epsilon transition from the @from state to the
6005 * target state
6006 *
6007 * Returns the target state or NULL in case of error
6008 */
6009 xmlAutomataStatePtr
xmlAutomataNewEpsilon(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to)6010 xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from,
6011 xmlAutomataStatePtr to) {
6012 if ((am == NULL) || (from == NULL))
6013 return(NULL);
6014 xmlFAGenerateEpsilonTransition(am, from, to);
6015 if (to == NULL)
6016 return(am->state);
6017 return(to);
6018 }
6019
6020 /**
6021 * xmlAutomataNewAllTrans:
6022 * @am: an automata
6023 * @from: the starting point of the transition
6024 * @to: the target point of the transition or NULL
6025 * @lax: allow to transition if not all all transitions have been activated
6026 *
6027 * If @to is NULL, this creates first a new target state in the automata
6028 * and then adds a an ALL transition from the @from state to the
6029 * target state. That transition is an epsilon transition allowed only when
6030 * all transitions from the @from node have been activated.
6031 *
6032 * Returns the target state or NULL in case of error
6033 */
6034 xmlAutomataStatePtr
xmlAutomataNewAllTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,int lax)6035 xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6036 xmlAutomataStatePtr to, int lax) {
6037 if ((am == NULL) || (from == NULL))
6038 return(NULL);
6039 xmlFAGenerateAllTransition(am, from, to, lax);
6040 if (to == NULL)
6041 return(am->state);
6042 return(to);
6043 }
6044
6045 /**
6046 * xmlAutomataNewCounter:
6047 * @am: an automata
6048 * @min: the minimal value on the counter
6049 * @max: the maximal value on the counter
6050 *
6051 * Create a new counter
6052 *
6053 * Returns the counter number or -1 in case of error
6054 */
6055 int
xmlAutomataNewCounter(xmlAutomataPtr am,int min,int max)6056 xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) {
6057 int ret;
6058
6059 if (am == NULL)
6060 return(-1);
6061
6062 ret = xmlRegGetCounter(am);
6063 if (ret < 0)
6064 return(-1);
6065 am->counters[ret].min = min;
6066 am->counters[ret].max = max;
6067 return(ret);
6068 }
6069
6070 /**
6071 * xmlAutomataNewCountedTrans:
6072 * @am: an automata
6073 * @from: the starting point of the transition
6074 * @to: the target point of the transition or NULL
6075 * @counter: the counter associated to that transition
6076 *
6077 * If @to is NULL, this creates first a new target state in the automata
6078 * and then adds an epsilon transition from the @from state to the target state
6079 * which will increment the counter provided
6080 *
6081 * Returns the target state or NULL in case of error
6082 */
6083 xmlAutomataStatePtr
xmlAutomataNewCountedTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,int counter)6084 xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6085 xmlAutomataStatePtr to, int counter) {
6086 if ((am == NULL) || (from == NULL) || (counter < 0))
6087 return(NULL);
6088 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter);
6089 if (to == NULL)
6090 return(am->state);
6091 return(to);
6092 }
6093
6094 /**
6095 * xmlAutomataNewCounterTrans:
6096 * @am: an automata
6097 * @from: the starting point of the transition
6098 * @to: the target point of the transition or NULL
6099 * @counter: the counter associated to that transition
6100 *
6101 * If @to is NULL, this creates first a new target state in the automata
6102 * and then adds an epsilon transition from the @from state to the target state
6103 * which will be allowed only if the counter is within the right range.
6104 *
6105 * Returns the target state or NULL in case of error
6106 */
6107 xmlAutomataStatePtr
xmlAutomataNewCounterTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,int counter)6108 xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6109 xmlAutomataStatePtr to, int counter) {
6110 if ((am == NULL) || (from == NULL) || (counter < 0))
6111 return(NULL);
6112 xmlFAGenerateCountedTransition(am, from, to, counter);
6113 if (to == NULL)
6114 return(am->state);
6115 return(to);
6116 }
6117
6118 /**
6119 * xmlAutomataCompile:
6120 * @am: an automata
6121 *
6122 * Compile the automata into a Reg Exp ready for being executed.
6123 * The automata should be free after this point.
6124 *
6125 * Returns the compiled regexp or NULL in case of error
6126 */
6127 xmlRegexpPtr
xmlAutomataCompile(xmlAutomataPtr am)6128 xmlAutomataCompile(xmlAutomataPtr am) {
6129 xmlRegexpPtr ret;
6130
6131 if ((am == NULL) || (am->error != 0)) return(NULL);
6132 xmlFAEliminateEpsilonTransitions(am);
6133 if (am->error != 0)
6134 return(NULL);
6135 /* xmlFAComputesDeterminism(am); */
6136 ret = xmlRegEpxFromParse(am);
6137
6138 return(ret);
6139 }
6140
6141 /**
6142 * xmlAutomataIsDeterminist:
6143 * @am: an automata
6144 *
6145 * Checks if an automata is determinist.
6146 *
6147 * Returns 1 if true, 0 if not, and -1 in case of error
6148 */
6149 int
xmlAutomataIsDeterminist(xmlAutomataPtr am)6150 xmlAutomataIsDeterminist(xmlAutomataPtr am) {
6151 int ret;
6152
6153 if (am == NULL)
6154 return(-1);
6155
6156 ret = xmlFAComputesDeterminism(am);
6157 return(ret);
6158 }
6159
6160 #ifdef LIBXML_EXPR_ENABLED
6161 /** DOC_DISABLE */
6162 /************************************************************************
6163 * *
6164 * Formal Expression handling code *
6165 * *
6166 ************************************************************************/
6167
6168 /*
6169 * Formal regular expression handling
6170 * Its goal is to do some formal work on content models
6171 */
6172
6173 /* expressions are used within a context */
6174 typedef struct _xmlExpCtxt xmlExpCtxt;
6175 typedef xmlExpCtxt *xmlExpCtxtPtr;
6176
6177 XMLPUBFUN void
6178 xmlExpFreeCtxt (xmlExpCtxtPtr ctxt);
6179 XMLPUBFUN xmlExpCtxtPtr
6180 xmlExpNewCtxt (int maxNodes,
6181 xmlDictPtr dict);
6182
6183 XMLPUBFUN int
6184 xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt);
6185 XMLPUBFUN int
6186 xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt);
6187
6188 /* Expressions are trees but the tree is opaque */
6189 typedef struct _xmlExpNode xmlExpNode;
6190 typedef xmlExpNode *xmlExpNodePtr;
6191
6192 typedef enum {
6193 XML_EXP_EMPTY = 0,
6194 XML_EXP_FORBID = 1,
6195 XML_EXP_ATOM = 2,
6196 XML_EXP_SEQ = 3,
6197 XML_EXP_OR = 4,
6198 XML_EXP_COUNT = 5
6199 } xmlExpNodeType;
6200
6201 /*
6202 * 2 core expressions shared by all for the empty language set
6203 * and for the set with just the empty token
6204 */
6205 XMLPUBVAR xmlExpNodePtr forbiddenExp;
6206 XMLPUBVAR xmlExpNodePtr emptyExp;
6207
6208 /*
6209 * Expressions are reference counted internally
6210 */
6211 XMLPUBFUN void
6212 xmlExpFree (xmlExpCtxtPtr ctxt,
6213 xmlExpNodePtr expr);
6214 XMLPUBFUN void
6215 xmlExpRef (xmlExpNodePtr expr);
6216
6217 /*
6218 * constructors can be either manual or from a string
6219 */
6220 XMLPUBFUN xmlExpNodePtr
6221 xmlExpParse (xmlExpCtxtPtr ctxt,
6222 const char *expr);
6223 XMLPUBFUN xmlExpNodePtr
6224 xmlExpNewAtom (xmlExpCtxtPtr ctxt,
6225 const xmlChar *name,
6226 int len);
6227 XMLPUBFUN xmlExpNodePtr
6228 xmlExpNewOr (xmlExpCtxtPtr ctxt,
6229 xmlExpNodePtr left,
6230 xmlExpNodePtr right);
6231 XMLPUBFUN xmlExpNodePtr
6232 xmlExpNewSeq (xmlExpCtxtPtr ctxt,
6233 xmlExpNodePtr left,
6234 xmlExpNodePtr right);
6235 XMLPUBFUN xmlExpNodePtr
6236 xmlExpNewRange (xmlExpCtxtPtr ctxt,
6237 xmlExpNodePtr subset,
6238 int min,
6239 int max);
6240 /*
6241 * The really interesting APIs
6242 */
6243 XMLPUBFUN int
6244 xmlExpIsNillable(xmlExpNodePtr expr);
6245 XMLPUBFUN int
6246 xmlExpMaxToken (xmlExpNodePtr expr);
6247 XMLPUBFUN int
6248 xmlExpGetLanguage(xmlExpCtxtPtr ctxt,
6249 xmlExpNodePtr expr,
6250 const xmlChar**langList,
6251 int len);
6252 XMLPUBFUN int
6253 xmlExpGetStart (xmlExpCtxtPtr ctxt,
6254 xmlExpNodePtr expr,
6255 const xmlChar**tokList,
6256 int len);
6257 XMLPUBFUN xmlExpNodePtr
6258 xmlExpStringDerive(xmlExpCtxtPtr ctxt,
6259 xmlExpNodePtr expr,
6260 const xmlChar *str,
6261 int len);
6262 XMLPUBFUN xmlExpNodePtr
6263 xmlExpExpDerive (xmlExpCtxtPtr ctxt,
6264 xmlExpNodePtr expr,
6265 xmlExpNodePtr sub);
6266 XMLPUBFUN int
6267 xmlExpSubsume (xmlExpCtxtPtr ctxt,
6268 xmlExpNodePtr expr,
6269 xmlExpNodePtr sub);
6270 XMLPUBFUN void
6271 xmlExpDump (xmlBufferPtr buf,
6272 xmlExpNodePtr expr);
6273
6274 /************************************************************************
6275 * *
6276 * Expression handling context *
6277 * *
6278 ************************************************************************/
6279
6280 struct _xmlExpCtxt {
6281 xmlDictPtr dict;
6282 xmlExpNodePtr *table;
6283 int size;
6284 int nbElems;
6285 int nb_nodes;
6286 int maxNodes;
6287 const char *expr;
6288 const char *cur;
6289 int nb_cons;
6290 int tabSize;
6291 };
6292
6293 /**
6294 * xmlExpNewCtxt:
6295 * @maxNodes: the maximum number of nodes
6296 * @dict: optional dictionary to use internally
6297 *
6298 * Creates a new context for manipulating expressions
6299 *
6300 * Returns the context or NULL in case of error
6301 */
6302 xmlExpCtxtPtr
xmlExpNewCtxt(int maxNodes,xmlDictPtr dict)6303 xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) {
6304 xmlExpCtxtPtr ret;
6305 int size = 256;
6306
6307 if (maxNodes <= 4096)
6308 maxNodes = 4096;
6309
6310 ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt));
6311 if (ret == NULL)
6312 return(NULL);
6313 memset(ret, 0, sizeof(xmlExpCtxt));
6314 ret->size = size;
6315 ret->nbElems = 0;
6316 ret->maxNodes = maxNodes;
6317 ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr));
6318 if (ret->table == NULL) {
6319 xmlFree(ret);
6320 return(NULL);
6321 }
6322 memset(ret->table, 0, size * sizeof(xmlExpNodePtr));
6323 if (dict == NULL) {
6324 ret->dict = xmlDictCreate();
6325 if (ret->dict == NULL) {
6326 xmlFree(ret->table);
6327 xmlFree(ret);
6328 return(NULL);
6329 }
6330 } else {
6331 ret->dict = dict;
6332 xmlDictReference(ret->dict);
6333 }
6334 return(ret);
6335 }
6336
6337 /**
6338 * xmlExpFreeCtxt:
6339 * @ctxt: an expression context
6340 *
6341 * Free an expression context
6342 */
6343 void
xmlExpFreeCtxt(xmlExpCtxtPtr ctxt)6344 xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) {
6345 if (ctxt == NULL)
6346 return;
6347 xmlDictFree(ctxt->dict);
6348 if (ctxt->table != NULL)
6349 xmlFree(ctxt->table);
6350 xmlFree(ctxt);
6351 }
6352
6353 /************************************************************************
6354 * *
6355 * Structure associated to an expression node *
6356 * *
6357 ************************************************************************/
6358 #define MAX_NODES 10000
6359
6360 /*
6361 * TODO:
6362 * - Wildcards
6363 * - public API for creation
6364 *
6365 * Started
6366 * - regression testing
6367 *
6368 * Done
6369 * - split into module and test tool
6370 * - memleaks
6371 */
6372
6373 typedef enum {
6374 XML_EXP_NILABLE = (1 << 0)
6375 } xmlExpNodeInfo;
6376
6377 #define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE)
6378
6379 struct _xmlExpNode {
6380 unsigned char type;/* xmlExpNodeType */
6381 unsigned char info;/* OR of xmlExpNodeInfo */
6382 unsigned short key; /* the hash key */
6383 unsigned int ref; /* The number of references */
6384 int c_max; /* the maximum length it can consume */
6385 xmlExpNodePtr exp_left;
6386 xmlExpNodePtr next;/* the next node in the hash table or free list */
6387 union {
6388 struct {
6389 int f_min;
6390 int f_max;
6391 } count;
6392 struct {
6393 xmlExpNodePtr f_right;
6394 } children;
6395 const xmlChar *f_str;
6396 } field;
6397 };
6398
6399 #define exp_min field.count.f_min
6400 #define exp_max field.count.f_max
6401 /* #define exp_left field.children.f_left */
6402 #define exp_right field.children.f_right
6403 #define exp_str field.f_str
6404
6405 static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type);
6406 static xmlExpNode forbiddenExpNode = {
6407 XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6408 };
6409 xmlExpNodePtr forbiddenExp = &forbiddenExpNode;
6410 static xmlExpNode emptyExpNode = {
6411 XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6412 };
6413 xmlExpNodePtr emptyExp = &emptyExpNode;
6414
6415 /************************************************************************
6416 * *
6417 * The custom hash table for unicity and canonicalization *
6418 * of sub-expressions pointers *
6419 * *
6420 ************************************************************************/
6421 /*
6422 * xmlExpHashNameComputeKey:
6423 * Calculate the hash key for a token
6424 */
6425 static unsigned short
xmlExpHashNameComputeKey(const xmlChar * name)6426 xmlExpHashNameComputeKey(const xmlChar *name) {
6427 unsigned short value = 0L;
6428 char ch;
6429
6430 if (name != NULL) {
6431 value += 30 * (*name);
6432 while ((ch = *name++) != 0) {
6433 value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch);
6434 }
6435 }
6436 return (value);
6437 }
6438
6439 /*
6440 * xmlExpHashComputeKey:
6441 * Calculate the hash key for a compound expression
6442 */
6443 static unsigned short
xmlExpHashComputeKey(xmlExpNodeType type,xmlExpNodePtr left,xmlExpNodePtr right)6444 xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left,
6445 xmlExpNodePtr right) {
6446 unsigned long value;
6447 unsigned short ret;
6448
6449 switch (type) {
6450 case XML_EXP_SEQ:
6451 value = left->key;
6452 value += right->key;
6453 value *= 3;
6454 ret = (unsigned short) value;
6455 break;
6456 case XML_EXP_OR:
6457 value = left->key;
6458 value += right->key;
6459 value *= 7;
6460 ret = (unsigned short) value;
6461 break;
6462 case XML_EXP_COUNT:
6463 value = left->key;
6464 value += right->key;
6465 ret = (unsigned short) value;
6466 break;
6467 default:
6468 ret = 0;
6469 }
6470 return(ret);
6471 }
6472
6473
6474 static xmlExpNodePtr
xmlExpNewNode(xmlExpCtxtPtr ctxt,xmlExpNodeType type)6475 xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) {
6476 xmlExpNodePtr ret;
6477
6478 if (ctxt->nb_nodes >= MAX_NODES)
6479 return(NULL);
6480 ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode));
6481 if (ret == NULL)
6482 return(NULL);
6483 memset(ret, 0, sizeof(xmlExpNode));
6484 ret->type = type;
6485 ret->next = NULL;
6486 ctxt->nb_nodes++;
6487 ctxt->nb_cons++;
6488 return(ret);
6489 }
6490
6491 /**
6492 * xmlExpHashGetEntry:
6493 * @table: the hash table
6494 *
6495 * Get the unique entry from the hash table. The entry is created if
6496 * needed. @left and @right are consumed, i.e. their ref count will
6497 * be decremented by the operation.
6498 *
6499 * Returns the pointer or NULL in case of error
6500 */
6501 static xmlExpNodePtr
xmlExpHashGetEntry(xmlExpCtxtPtr ctxt,xmlExpNodeType type,xmlExpNodePtr left,xmlExpNodePtr right,const xmlChar * name,int min,int max)6502 xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type,
6503 xmlExpNodePtr left, xmlExpNodePtr right,
6504 const xmlChar *name, int min, int max) {
6505 unsigned short kbase, key;
6506 xmlExpNodePtr entry;
6507 xmlExpNodePtr insert;
6508
6509 if (ctxt == NULL)
6510 return(NULL);
6511
6512 /*
6513 * Check for duplicate and insertion location.
6514 */
6515 if (type == XML_EXP_ATOM) {
6516 kbase = xmlExpHashNameComputeKey(name);
6517 } else if (type == XML_EXP_COUNT) {
6518 /* COUNT reduction rule 1 */
6519 /* a{1} -> a */
6520 if (min == max) {
6521 if (min == 1) {
6522 return(left);
6523 }
6524 if (min == 0) {
6525 xmlExpFree(ctxt, left);
6526 return(emptyExp);
6527 }
6528 }
6529 if (min < 0) {
6530 xmlExpFree(ctxt, left);
6531 return(forbiddenExp);
6532 }
6533 if (max == -1)
6534 kbase = min + 79;
6535 else
6536 kbase = max - min;
6537 kbase += left->key;
6538 } else if (type == XML_EXP_OR) {
6539 /* Forbid reduction rules */
6540 if (left->type == XML_EXP_FORBID) {
6541 xmlExpFree(ctxt, left);
6542 return(right);
6543 }
6544 if (right->type == XML_EXP_FORBID) {
6545 xmlExpFree(ctxt, right);
6546 return(left);
6547 }
6548
6549 /* OR reduction rule 1 */
6550 /* a | a reduced to a */
6551 if (left == right) {
6552 xmlExpFree(ctxt, right);
6553 return(left);
6554 }
6555 /* OR canonicalization rule 1 */
6556 /* linearize (a | b) | c into a | (b | c) */
6557 if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) {
6558 xmlExpNodePtr tmp = left;
6559 left = right;
6560 right = tmp;
6561 }
6562 /* OR reduction rule 2 */
6563 /* a | (a | b) and b | (a | b) are reduced to a | b */
6564 if (right->type == XML_EXP_OR) {
6565 if ((left == right->exp_left) ||
6566 (left == right->exp_right)) {
6567 xmlExpFree(ctxt, left);
6568 return(right);
6569 }
6570 }
6571 /* OR canonicalization rule 2 */
6572 /* linearize (a | b) | c into a | (b | c) */
6573 if (left->type == XML_EXP_OR) {
6574 xmlExpNodePtr tmp;
6575
6576 /* OR canonicalization rule 2 */
6577 if ((left->exp_right->type != XML_EXP_OR) &&
6578 (left->exp_right->key < left->exp_left->key)) {
6579 tmp = left->exp_right;
6580 left->exp_right = left->exp_left;
6581 left->exp_left = tmp;
6582 }
6583 left->exp_right->ref++;
6584 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right,
6585 NULL, 0, 0);
6586 left->exp_left->ref++;
6587 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp,
6588 NULL, 0, 0);
6589
6590 xmlExpFree(ctxt, left);
6591 return(tmp);
6592 }
6593 if (right->type == XML_EXP_OR) {
6594 /* Ordering in the tree */
6595 /* C | (A | B) -> A | (B | C) */
6596 if (left->key > right->exp_right->key) {
6597 xmlExpNodePtr tmp;
6598 right->exp_right->ref++;
6599 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right,
6600 left, NULL, 0, 0);
6601 right->exp_left->ref++;
6602 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6603 tmp, NULL, 0, 0);
6604 xmlExpFree(ctxt, right);
6605 return(tmp);
6606 }
6607 /* Ordering in the tree */
6608 /* B | (A | C) -> A | (B | C) */
6609 if (left->key > right->exp_left->key) {
6610 xmlExpNodePtr tmp;
6611 right->exp_right->ref++;
6612 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left,
6613 right->exp_right, NULL, 0, 0);
6614 right->exp_left->ref++;
6615 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6616 tmp, NULL, 0, 0);
6617 xmlExpFree(ctxt, right);
6618 return(tmp);
6619 }
6620 }
6621 /* we know both types are != XML_EXP_OR here */
6622 else if (left->key > right->key) {
6623 xmlExpNodePtr tmp = left;
6624 left = right;
6625 right = tmp;
6626 }
6627 kbase = xmlExpHashComputeKey(type, left, right);
6628 } else if (type == XML_EXP_SEQ) {
6629 /* Forbid reduction rules */
6630 if (left->type == XML_EXP_FORBID) {
6631 xmlExpFree(ctxt, right);
6632 return(left);
6633 }
6634 if (right->type == XML_EXP_FORBID) {
6635 xmlExpFree(ctxt, left);
6636 return(right);
6637 }
6638 /* Empty reduction rules */
6639 if (right->type == XML_EXP_EMPTY) {
6640 return(left);
6641 }
6642 if (left->type == XML_EXP_EMPTY) {
6643 return(right);
6644 }
6645 kbase = xmlExpHashComputeKey(type, left, right);
6646 } else
6647 return(NULL);
6648
6649 key = kbase % ctxt->size;
6650 if (ctxt->table[key] != NULL) {
6651 for (insert = ctxt->table[key]; insert != NULL;
6652 insert = insert->next) {
6653 if ((insert->key == kbase) &&
6654 (insert->type == type)) {
6655 if (type == XML_EXP_ATOM) {
6656 if (name == insert->exp_str) {
6657 insert->ref++;
6658 return(insert);
6659 }
6660 } else if (type == XML_EXP_COUNT) {
6661 if ((insert->exp_min == min) && (insert->exp_max == max) &&
6662 (insert->exp_left == left)) {
6663 insert->ref++;
6664 left->ref--;
6665 return(insert);
6666 }
6667 } else if ((insert->exp_left == left) &&
6668 (insert->exp_right == right)) {
6669 insert->ref++;
6670 left->ref--;
6671 right->ref--;
6672 return(insert);
6673 }
6674 }
6675 }
6676 }
6677
6678 entry = xmlExpNewNode(ctxt, type);
6679 if (entry == NULL)
6680 return(NULL);
6681 entry->key = kbase;
6682 if (type == XML_EXP_ATOM) {
6683 entry->exp_str = name;
6684 entry->c_max = 1;
6685 } else if (type == XML_EXP_COUNT) {
6686 entry->exp_min = min;
6687 entry->exp_max = max;
6688 entry->exp_left = left;
6689 if ((min == 0) || (IS_NILLABLE(left)))
6690 entry->info |= XML_EXP_NILABLE;
6691 if (max < 0)
6692 entry->c_max = -1;
6693 else
6694 entry->c_max = max * entry->exp_left->c_max;
6695 } else {
6696 entry->exp_left = left;
6697 entry->exp_right = right;
6698 if (type == XML_EXP_OR) {
6699 if ((IS_NILLABLE(left)) || (IS_NILLABLE(right)))
6700 entry->info |= XML_EXP_NILABLE;
6701 if ((entry->exp_left->c_max == -1) ||
6702 (entry->exp_right->c_max == -1))
6703 entry->c_max = -1;
6704 else if (entry->exp_left->c_max > entry->exp_right->c_max)
6705 entry->c_max = entry->exp_left->c_max;
6706 else
6707 entry->c_max = entry->exp_right->c_max;
6708 } else {
6709 if ((IS_NILLABLE(left)) && (IS_NILLABLE(right)))
6710 entry->info |= XML_EXP_NILABLE;
6711 if ((entry->exp_left->c_max == -1) ||
6712 (entry->exp_right->c_max == -1))
6713 entry->c_max = -1;
6714 else
6715 entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max;
6716 }
6717 }
6718 entry->ref = 1;
6719 if (ctxt->table[key] != NULL)
6720 entry->next = ctxt->table[key];
6721
6722 ctxt->table[key] = entry;
6723 ctxt->nbElems++;
6724
6725 return(entry);
6726 }
6727
6728 /**
6729 * xmlExpFree:
6730 * @ctxt: the expression context
6731 * @exp: the expression
6732 *
6733 * Dereference the expression
6734 */
6735 void
xmlExpFree(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp)6736 xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) {
6737 if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp))
6738 return;
6739 exp->ref--;
6740 if (exp->ref == 0) {
6741 unsigned short key;
6742
6743 /* Unlink it first from the hash table */
6744 key = exp->key % ctxt->size;
6745 if (ctxt->table[key] == exp) {
6746 ctxt->table[key] = exp->next;
6747 } else {
6748 xmlExpNodePtr tmp;
6749
6750 tmp = ctxt->table[key];
6751 while (tmp != NULL) {
6752 if (tmp->next == exp) {
6753 tmp->next = exp->next;
6754 break;
6755 }
6756 tmp = tmp->next;
6757 }
6758 }
6759
6760 if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) {
6761 xmlExpFree(ctxt, exp->exp_left);
6762 xmlExpFree(ctxt, exp->exp_right);
6763 } else if (exp->type == XML_EXP_COUNT) {
6764 xmlExpFree(ctxt, exp->exp_left);
6765 }
6766 xmlFree(exp);
6767 ctxt->nb_nodes--;
6768 }
6769 }
6770
6771 /**
6772 * xmlExpRef:
6773 * @exp: the expression
6774 *
6775 * Increase the reference count of the expression
6776 */
6777 void
xmlExpRef(xmlExpNodePtr exp)6778 xmlExpRef(xmlExpNodePtr exp) {
6779 if (exp != NULL)
6780 exp->ref++;
6781 }
6782
6783 /**
6784 * xmlExpNewAtom:
6785 * @ctxt: the expression context
6786 * @name: the atom name
6787 * @len: the atom name length in byte (or -1);
6788 *
6789 * Get the atom associated to this name from that context
6790 *
6791 * Returns the node or NULL in case of error
6792 */
6793 xmlExpNodePtr
xmlExpNewAtom(xmlExpCtxtPtr ctxt,const xmlChar * name,int len)6794 xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) {
6795 if ((ctxt == NULL) || (name == NULL))
6796 return(NULL);
6797 name = xmlDictLookup(ctxt->dict, name, len);
6798 if (name == NULL)
6799 return(NULL);
6800 return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0));
6801 }
6802
6803 /**
6804 * xmlExpNewOr:
6805 * @ctxt: the expression context
6806 * @left: left expression
6807 * @right: right expression
6808 *
6809 * Get the atom associated to the choice @left | @right
6810 * Note that @left and @right are consumed in the operation, to keep
6811 * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
6812 * this is true even in case of failure (unless ctxt == NULL).
6813 *
6814 * Returns the node or NULL in case of error
6815 */
6816 xmlExpNodePtr
xmlExpNewOr(xmlExpCtxtPtr ctxt,xmlExpNodePtr left,xmlExpNodePtr right)6817 xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
6818 if (ctxt == NULL)
6819 return(NULL);
6820 if ((left == NULL) || (right == NULL)) {
6821 xmlExpFree(ctxt, left);
6822 xmlExpFree(ctxt, right);
6823 return(NULL);
6824 }
6825 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0));
6826 }
6827
6828 /**
6829 * xmlExpNewSeq:
6830 * @ctxt: the expression context
6831 * @left: left expression
6832 * @right: right expression
6833 *
6834 * Get the atom associated to the sequence @left , @right
6835 * Note that @left and @right are consumed in the operation, to keep
6836 * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
6837 * this is true even in case of failure (unless ctxt == NULL).
6838 *
6839 * Returns the node or NULL in case of error
6840 */
6841 xmlExpNodePtr
xmlExpNewSeq(xmlExpCtxtPtr ctxt,xmlExpNodePtr left,xmlExpNodePtr right)6842 xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
6843 if (ctxt == NULL)
6844 return(NULL);
6845 if ((left == NULL) || (right == NULL)) {
6846 xmlExpFree(ctxt, left);
6847 xmlExpFree(ctxt, right);
6848 return(NULL);
6849 }
6850 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0));
6851 }
6852
6853 /**
6854 * xmlExpNewRange:
6855 * @ctxt: the expression context
6856 * @subset: the expression to be repeated
6857 * @min: the lower bound for the repetition
6858 * @max: the upper bound for the repetition, -1 means infinite
6859 *
6860 * Get the atom associated to the range (@subset){@min, @max}
6861 * Note that @subset is consumed in the operation, to keep
6862 * an handle on it use xmlExpRef() and use xmlExpFree() to release it,
6863 * this is true even in case of failure (unless ctxt == NULL).
6864 *
6865 * Returns the node or NULL in case of error
6866 */
6867 xmlExpNodePtr
xmlExpNewRange(xmlExpCtxtPtr ctxt,xmlExpNodePtr subset,int min,int max)6868 xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) {
6869 if (ctxt == NULL)
6870 return(NULL);
6871 if ((subset == NULL) || (min < 0) || (max < -1) ||
6872 ((max >= 0) && (min > max))) {
6873 xmlExpFree(ctxt, subset);
6874 return(NULL);
6875 }
6876 return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset,
6877 NULL, NULL, min, max));
6878 }
6879
6880 /************************************************************************
6881 * *
6882 * Public API for operations on expressions *
6883 * *
6884 ************************************************************************/
6885
6886 static int
xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** list,int len,int nb)6887 xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
6888 const xmlChar**list, int len, int nb) {
6889 int tmp, tmp2;
6890 tail:
6891 switch (exp->type) {
6892 case XML_EXP_EMPTY:
6893 return(0);
6894 case XML_EXP_ATOM:
6895 for (tmp = 0;tmp < nb;tmp++)
6896 if (list[tmp] == exp->exp_str)
6897 return(0);
6898 if (nb >= len)
6899 return(-2);
6900 list[nb] = exp->exp_str;
6901 return(1);
6902 case XML_EXP_COUNT:
6903 exp = exp->exp_left;
6904 goto tail;
6905 case XML_EXP_SEQ:
6906 case XML_EXP_OR:
6907 tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb);
6908 if (tmp < 0)
6909 return(tmp);
6910 tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len,
6911 nb + tmp);
6912 if (tmp2 < 0)
6913 return(tmp2);
6914 return(tmp + tmp2);
6915 }
6916 return(-1);
6917 }
6918
6919 /**
6920 * xmlExpGetLanguage:
6921 * @ctxt: the expression context
6922 * @exp: the expression
6923 * @langList: where to store the tokens
6924 * @len: the allocated length of @list
6925 *
6926 * Find all the strings used in @exp and store them in @list
6927 *
6928 * Returns the number of unique strings found, -1 in case of errors and
6929 * -2 if there is more than @len strings
6930 */
6931 int
xmlExpGetLanguage(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** langList,int len)6932 xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
6933 const xmlChar**langList, int len) {
6934 if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0))
6935 return(-1);
6936 return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0));
6937 }
6938
6939 static int
xmlExpGetStartInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** list,int len,int nb)6940 xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
6941 const xmlChar**list, int len, int nb) {
6942 int tmp, tmp2;
6943 tail:
6944 switch (exp->type) {
6945 case XML_EXP_FORBID:
6946 return(0);
6947 case XML_EXP_EMPTY:
6948 return(0);
6949 case XML_EXP_ATOM:
6950 for (tmp = 0;tmp < nb;tmp++)
6951 if (list[tmp] == exp->exp_str)
6952 return(0);
6953 if (nb >= len)
6954 return(-2);
6955 list[nb] = exp->exp_str;
6956 return(1);
6957 case XML_EXP_COUNT:
6958 exp = exp->exp_left;
6959 goto tail;
6960 case XML_EXP_SEQ:
6961 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
6962 if (tmp < 0)
6963 return(tmp);
6964 if (IS_NILLABLE(exp->exp_left)) {
6965 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
6966 nb + tmp);
6967 if (tmp2 < 0)
6968 return(tmp2);
6969 tmp += tmp2;
6970 }
6971 return(tmp);
6972 case XML_EXP_OR:
6973 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
6974 if (tmp < 0)
6975 return(tmp);
6976 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
6977 nb + tmp);
6978 if (tmp2 < 0)
6979 return(tmp2);
6980 return(tmp + tmp2);
6981 }
6982 return(-1);
6983 }
6984
6985 /**
6986 * xmlExpGetStart:
6987 * @ctxt: the expression context
6988 * @exp: the expression
6989 * @tokList: where to store the tokens
6990 * @len: the allocated length of @list
6991 *
6992 * Find all the strings that appears at the start of the languages
6993 * accepted by @exp and store them in @list. E.g. for (a, b) | c
6994 * it will return the list [a, c]
6995 *
6996 * Returns the number of unique strings found, -1 in case of errors and
6997 * -2 if there is more than @len strings
6998 */
6999 int
xmlExpGetStart(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** tokList,int len)7000 xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7001 const xmlChar**tokList, int len) {
7002 if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0))
7003 return(-1);
7004 return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0));
7005 }
7006
7007 /**
7008 * xmlExpIsNillable:
7009 * @exp: the expression
7010 *
7011 * Finds if the expression is nillable, i.e. if it accepts the empty sequence
7012 *
7013 * Returns 1 if nillable, 0 if not and -1 in case of error
7014 */
7015 int
xmlExpIsNillable(xmlExpNodePtr exp)7016 xmlExpIsNillable(xmlExpNodePtr exp) {
7017 if (exp == NULL)
7018 return(-1);
7019 return(IS_NILLABLE(exp) != 0);
7020 }
7021
7022 static xmlExpNodePtr
xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar * str)7023 xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str)
7024 {
7025 xmlExpNodePtr ret;
7026
7027 switch (exp->type) {
7028 case XML_EXP_EMPTY:
7029 return(forbiddenExp);
7030 case XML_EXP_FORBID:
7031 return(forbiddenExp);
7032 case XML_EXP_ATOM:
7033 if (exp->exp_str == str) {
7034 ret = emptyExp;
7035 } else {
7036 /* TODO wildcards here */
7037 ret = forbiddenExp;
7038 }
7039 return(ret);
7040 case XML_EXP_OR: {
7041 xmlExpNodePtr tmp;
7042
7043 tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7044 if (tmp == NULL) {
7045 return(NULL);
7046 }
7047 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7048 if (ret == NULL) {
7049 xmlExpFree(ctxt, tmp);
7050 return(NULL);
7051 }
7052 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret,
7053 NULL, 0, 0);
7054 return(ret);
7055 }
7056 case XML_EXP_SEQ:
7057 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7058 if (ret == NULL) {
7059 return(NULL);
7060 } else if (ret == forbiddenExp) {
7061 if (IS_NILLABLE(exp->exp_left)) {
7062 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7063 }
7064 } else {
7065 exp->exp_right->ref++;
7066 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right,
7067 NULL, 0, 0);
7068 }
7069 return(ret);
7070 case XML_EXP_COUNT: {
7071 int min, max;
7072 xmlExpNodePtr tmp;
7073
7074 if (exp->exp_max == 0)
7075 return(forbiddenExp);
7076 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7077 if (ret == NULL)
7078 return(NULL);
7079 if (ret == forbiddenExp) {
7080 return(ret);
7081 }
7082 if (exp->exp_max == 1)
7083 return(ret);
7084 if (exp->exp_max < 0) /* unbounded */
7085 max = -1;
7086 else
7087 max = exp->exp_max - 1;
7088 if (exp->exp_min > 0)
7089 min = exp->exp_min - 1;
7090 else
7091 min = 0;
7092 exp->exp_left->ref++;
7093 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL,
7094 NULL, min, max);
7095 if (ret == emptyExp) {
7096 return(tmp);
7097 }
7098 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp,
7099 NULL, 0, 0));
7100 }
7101 }
7102 return(NULL);
7103 }
7104
7105 /**
7106 * xmlExpStringDerive:
7107 * @ctxt: the expression context
7108 * @exp: the expression
7109 * @str: the string
7110 * @len: the string len in bytes if available
7111 *
7112 * Do one step of Brzozowski derivation of the expression @exp with
7113 * respect to the input string
7114 *
7115 * Returns the resulting expression or NULL in case of internal error
7116 */
7117 xmlExpNodePtr
xmlExpStringDerive(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar * str,int len)7118 xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7119 const xmlChar *str, int len) {
7120 const xmlChar *input;
7121
7122 if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) {
7123 return(NULL);
7124 }
7125 /*
7126 * check the string is in the dictionary, if yes use an interned
7127 * copy, otherwise we know it's not an acceptable input
7128 */
7129 input = xmlDictExists(ctxt->dict, str, len);
7130 if (input == NULL) {
7131 return(forbiddenExp);
7132 }
7133 return(xmlExpStringDeriveInt(ctxt, exp, input));
7134 }
7135
7136 static int
xmlExpCheckCard(xmlExpNodePtr exp,xmlExpNodePtr sub)7137 xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) {
7138 int ret = 1;
7139
7140 if (sub->c_max == -1) {
7141 if (exp->c_max != -1)
7142 ret = 0;
7143 } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) {
7144 ret = 0;
7145 }
7146 return(ret);
7147 }
7148
7149 static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7150 xmlExpNodePtr sub);
7151 /**
7152 * xmlExpDivide:
7153 * @ctxt: the expressions context
7154 * @exp: the englobing expression
7155 * @sub: the subexpression
7156 * @mult: the multiple expression
7157 * @remain: the remain from the derivation of the multiple
7158 *
7159 * Check if exp is a multiple of sub, i.e. if there is a finite number n
7160 * so that sub{n} subsume exp
7161 *
7162 * Returns the multiple value if successful, 0 if it is not a multiple
7163 * and -1 in case of internal error.
7164 */
7165
7166 static int
xmlExpDivide(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub,xmlExpNodePtr * mult,xmlExpNodePtr * remain)7167 xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub,
7168 xmlExpNodePtr *mult, xmlExpNodePtr *remain) {
7169 int i;
7170 xmlExpNodePtr tmp, tmp2;
7171
7172 if (mult != NULL) *mult = NULL;
7173 if (remain != NULL) *remain = NULL;
7174 if (exp->c_max == -1) return(0);
7175 if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0);
7176
7177 for (i = 1;i <= exp->c_max;i++) {
7178 sub->ref++;
7179 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7180 sub, NULL, NULL, i, i);
7181 if (tmp == NULL) {
7182 return(-1);
7183 }
7184 if (!xmlExpCheckCard(tmp, exp)) {
7185 xmlExpFree(ctxt, tmp);
7186 continue;
7187 }
7188 tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp);
7189 if (tmp2 == NULL) {
7190 xmlExpFree(ctxt, tmp);
7191 return(-1);
7192 }
7193 if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) {
7194 if (remain != NULL)
7195 *remain = tmp2;
7196 else
7197 xmlExpFree(ctxt, tmp2);
7198 if (mult != NULL)
7199 *mult = tmp;
7200 else
7201 xmlExpFree(ctxt, tmp);
7202 return(i);
7203 }
7204 xmlExpFree(ctxt, tmp);
7205 xmlExpFree(ctxt, tmp2);
7206 }
7207 return(0);
7208 }
7209
7210 /**
7211 * xmlExpExpDeriveInt:
7212 * @ctxt: the expressions context
7213 * @exp: the englobing expression
7214 * @sub: the subexpression
7215 *
7216 * Try to do a step of Brzozowski derivation but at a higher level
7217 * the input being a subexpression.
7218 *
7219 * Returns the resulting expression or NULL in case of internal error
7220 */
7221 static xmlExpNodePtr
xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub)7222 xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7223 xmlExpNodePtr ret, tmp, tmp2, tmp3;
7224 const xmlChar **tab;
7225 int len, i;
7226
7227 /*
7228 * In case of equality and if the expression can only consume a finite
7229 * amount, then the derivation is empty
7230 */
7231 if ((exp == sub) && (exp->c_max >= 0)) {
7232 return(emptyExp);
7233 }
7234 /*
7235 * decompose sub sequence first
7236 */
7237 if (sub->type == XML_EXP_EMPTY) {
7238 exp->ref++;
7239 return(exp);
7240 }
7241 if (sub->type == XML_EXP_SEQ) {
7242 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7243 if (tmp == NULL)
7244 return(NULL);
7245 if (tmp == forbiddenExp)
7246 return(tmp);
7247 ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right);
7248 xmlExpFree(ctxt, tmp);
7249 return(ret);
7250 }
7251 if (sub->type == XML_EXP_OR) {
7252 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7253 if (tmp == forbiddenExp)
7254 return(tmp);
7255 if (tmp == NULL)
7256 return(NULL);
7257 ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right);
7258 if ((ret == NULL) || (ret == forbiddenExp)) {
7259 xmlExpFree(ctxt, tmp);
7260 return(ret);
7261 }
7262 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0));
7263 }
7264 if (!xmlExpCheckCard(exp, sub)) {
7265 return(forbiddenExp);
7266 }
7267 switch (exp->type) {
7268 case XML_EXP_EMPTY:
7269 if (sub == emptyExp)
7270 return(emptyExp);
7271 return(forbiddenExp);
7272 case XML_EXP_FORBID:
7273 return(forbiddenExp);
7274 case XML_EXP_ATOM:
7275 if (sub->type == XML_EXP_ATOM) {
7276 /* TODO: handle wildcards */
7277 if (exp->exp_str == sub->exp_str) {
7278 return(emptyExp);
7279 }
7280 return(forbiddenExp);
7281 }
7282 if ((sub->type == XML_EXP_COUNT) &&
7283 (sub->exp_max == 1) &&
7284 (sub->exp_left->type == XML_EXP_ATOM)) {
7285 /* TODO: handle wildcards */
7286 if (exp->exp_str == sub->exp_left->exp_str) {
7287 return(emptyExp);
7288 }
7289 return(forbiddenExp);
7290 }
7291 return(forbiddenExp);
7292 case XML_EXP_SEQ:
7293 /* try to get the sequence consumed only if possible */
7294 if (xmlExpCheckCard(exp->exp_left, sub)) {
7295 /* See if the sequence can be consumed directly */
7296 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7297 if ((ret != forbiddenExp) && (ret != NULL)) {
7298 /*
7299 * TODO: assumption here that we are determinist
7300 * i.e. we won't get to a nillable exp left
7301 * subset which could be matched by the right
7302 * part too.
7303 * e.g.: (a | b)+,(a | c) and 'a+,a'
7304 */
7305 exp->exp_right->ref++;
7306 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7307 exp->exp_right, NULL, 0, 0));
7308 }
7309 }
7310 /* Try instead to decompose */
7311 if (sub->type == XML_EXP_COUNT) {
7312 int min, max;
7313
7314 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7315 if (ret == NULL)
7316 return(NULL);
7317 if (ret != forbiddenExp) {
7318 if (sub->exp_max < 0)
7319 max = -1;
7320 else
7321 max = sub->exp_max -1;
7322 if (sub->exp_min > 0)
7323 min = sub->exp_min -1;
7324 else
7325 min = 0;
7326 exp->exp_right->ref++;
7327 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7328 exp->exp_right, NULL, 0, 0);
7329 if (tmp == NULL)
7330 return(NULL);
7331
7332 sub->exp_left->ref++;
7333 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7334 sub->exp_left, NULL, NULL, min, max);
7335 if (tmp2 == NULL) {
7336 xmlExpFree(ctxt, tmp);
7337 return(NULL);
7338 }
7339 ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7340 xmlExpFree(ctxt, tmp);
7341 xmlExpFree(ctxt, tmp2);
7342 return(ret);
7343 }
7344 }
7345 /* we made no progress on structured operations */
7346 break;
7347 case XML_EXP_OR:
7348 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7349 if (ret == NULL)
7350 return(NULL);
7351 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub);
7352 if (tmp == NULL) {
7353 xmlExpFree(ctxt, ret);
7354 return(NULL);
7355 }
7356 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0));
7357 case XML_EXP_COUNT: {
7358 int min, max;
7359
7360 if (sub->type == XML_EXP_COUNT) {
7361 /*
7362 * Try to see if the loop is completely subsumed
7363 */
7364 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7365 if (tmp == NULL)
7366 return(NULL);
7367 if (tmp == forbiddenExp) {
7368 int mult;
7369
7370 mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left,
7371 NULL, &tmp);
7372 if (mult <= 0) {
7373 return(forbiddenExp);
7374 }
7375 if (sub->exp_max == -1) {
7376 max = -1;
7377 if (exp->exp_max == -1) {
7378 if (exp->exp_min <= sub->exp_min * mult)
7379 min = 0;
7380 else
7381 min = exp->exp_min - sub->exp_min * mult;
7382 } else {
7383 xmlExpFree(ctxt, tmp);
7384 return(forbiddenExp);
7385 }
7386 } else {
7387 if (exp->exp_max == -1) {
7388 if (exp->exp_min > sub->exp_min * mult) {
7389 max = -1;
7390 min = exp->exp_min - sub->exp_min * mult;
7391 } else {
7392 max = -1;
7393 min = 0;
7394 }
7395 } else {
7396 if (exp->exp_max < sub->exp_max * mult) {
7397 xmlExpFree(ctxt, tmp);
7398 return(forbiddenExp);
7399 }
7400 if (sub->exp_max * mult > exp->exp_min)
7401 min = 0;
7402 else
7403 min = exp->exp_min - sub->exp_max * mult;
7404 max = exp->exp_max - sub->exp_max * mult;
7405 }
7406 }
7407 } else if (!IS_NILLABLE(tmp)) {
7408 /*
7409 * TODO: loop here to try to grow if working on finite
7410 * blocks.
7411 */
7412 xmlExpFree(ctxt, tmp);
7413 return(forbiddenExp);
7414 } else if (sub->exp_max == -1) {
7415 if (exp->exp_max == -1) {
7416 if (exp->exp_min <= sub->exp_min) {
7417 max = -1;
7418 min = 0;
7419 } else {
7420 max = -1;
7421 min = exp->exp_min - sub->exp_min;
7422 }
7423 } else if (exp->exp_min > sub->exp_min) {
7424 xmlExpFree(ctxt, tmp);
7425 return(forbiddenExp);
7426 } else {
7427 max = -1;
7428 min = 0;
7429 }
7430 } else {
7431 if (exp->exp_max == -1) {
7432 if (exp->exp_min > sub->exp_min) {
7433 max = -1;
7434 min = exp->exp_min - sub->exp_min;
7435 } else {
7436 max = -1;
7437 min = 0;
7438 }
7439 } else {
7440 if (exp->exp_max < sub->exp_max) {
7441 xmlExpFree(ctxt, tmp);
7442 return(forbiddenExp);
7443 }
7444 if (sub->exp_max > exp->exp_min)
7445 min = 0;
7446 else
7447 min = exp->exp_min - sub->exp_max;
7448 max = exp->exp_max - sub->exp_max;
7449 }
7450 }
7451 exp->exp_left->ref++;
7452 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7453 NULL, NULL, min, max);
7454 if (tmp2 == NULL) {
7455 return(NULL);
7456 }
7457 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7458 NULL, 0, 0);
7459 return(ret);
7460 }
7461 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7462 if (tmp == NULL)
7463 return(NULL);
7464 if (tmp == forbiddenExp) {
7465 return(forbiddenExp);
7466 }
7467 if (exp->exp_min > 0)
7468 min = exp->exp_min - 1;
7469 else
7470 min = 0;
7471 if (exp->exp_max < 0)
7472 max = -1;
7473 else
7474 max = exp->exp_max - 1;
7475
7476 exp->exp_left->ref++;
7477 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7478 NULL, NULL, min, max);
7479 if (tmp2 == NULL)
7480 return(NULL);
7481 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7482 NULL, 0, 0);
7483 return(ret);
7484 }
7485 }
7486
7487 if (IS_NILLABLE(sub)) {
7488 if (!(IS_NILLABLE(exp)))
7489 return(forbiddenExp);
7490 else
7491 ret = emptyExp;
7492 } else
7493 ret = NULL;
7494 /*
7495 * here the structured derivation made no progress so
7496 * we use the default token based derivation to force one more step
7497 */
7498 if (ctxt->tabSize == 0)
7499 ctxt->tabSize = 40;
7500
7501 tab = (const xmlChar **) xmlMalloc(ctxt->tabSize *
7502 sizeof(const xmlChar *));
7503 if (tab == NULL) {
7504 return(NULL);
7505 }
7506
7507 /*
7508 * collect all the strings accepted by the subexpression on input
7509 */
7510 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7511 while (len < 0) {
7512 const xmlChar **temp;
7513 int newSize;
7514
7515 newSize = xmlGrowCapacity(ctxt->tabSize, sizeof(temp[0]),
7516 40, XML_MAX_ITEMS);
7517 if (newSize < 0) {
7518 xmlFree(tab);
7519 return(NULL);
7520 }
7521 temp = xmlRealloc(tab, newSize * sizeof(temp[0]));
7522 if (temp == NULL) {
7523 xmlFree(tab);
7524 return(NULL);
7525 }
7526 tab = temp;
7527 ctxt->tabSize = newSize;
7528 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7529 }
7530 for (i = 0;i < len;i++) {
7531 tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]);
7532 if ((tmp == NULL) || (tmp == forbiddenExp)) {
7533 xmlExpFree(ctxt, ret);
7534 xmlFree((xmlChar **) tab);
7535 return(tmp);
7536 }
7537 tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]);
7538 if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) {
7539 xmlExpFree(ctxt, tmp);
7540 xmlExpFree(ctxt, ret);
7541 xmlFree((xmlChar **) tab);
7542 return(tmp);
7543 }
7544 tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7545 xmlExpFree(ctxt, tmp);
7546 xmlExpFree(ctxt, tmp2);
7547
7548 if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) {
7549 xmlExpFree(ctxt, ret);
7550 xmlFree((xmlChar **) tab);
7551 return(tmp3);
7552 }
7553
7554 if (ret == NULL)
7555 ret = tmp3;
7556 else {
7557 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0);
7558 if (ret == NULL) {
7559 xmlFree((xmlChar **) tab);
7560 return(NULL);
7561 }
7562 }
7563 }
7564 xmlFree((xmlChar **) tab);
7565 return(ret);
7566 }
7567
7568 /**
7569 * xmlExpExpDerive:
7570 * @ctxt: the expressions context
7571 * @exp: the englobing expression
7572 * @sub: the subexpression
7573 *
7574 * Evaluates the expression resulting from @exp consuming a sub expression @sub
7575 * Based on algebraic derivation and sometimes direct Brzozowski derivation
7576 * it usually takes less than linear time and can handle expressions generating
7577 * infinite languages.
7578 *
7579 * Returns the resulting expression or NULL in case of internal error, the
7580 * result must be freed
7581 */
7582 xmlExpNodePtr
xmlExpExpDerive(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub)7583 xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7584 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7585 return(NULL);
7586
7587 /*
7588 * O(1) speedups
7589 */
7590 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7591 return(forbiddenExp);
7592 }
7593 if (xmlExpCheckCard(exp, sub) == 0) {
7594 return(forbiddenExp);
7595 }
7596 return(xmlExpExpDeriveInt(ctxt, exp, sub));
7597 }
7598
7599 /**
7600 * xmlExpSubsume:
7601 * @ctxt: the expressions context
7602 * @exp: the englobing expression
7603 * @sub: the subexpression
7604 *
7605 * Check whether @exp accepts all the languages accepted by @sub
7606 * the input being a subexpression.
7607 *
7608 * Returns 1 if true 0 if false and -1 in case of failure.
7609 */
7610 int
xmlExpSubsume(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub)7611 xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7612 xmlExpNodePtr tmp;
7613
7614 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7615 return(-1);
7616
7617 /*
7618 * TODO: speedup by checking the language of sub is a subset of the
7619 * language of exp
7620 */
7621 /*
7622 * O(1) speedups
7623 */
7624 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7625 return(0);
7626 }
7627 if (xmlExpCheckCard(exp, sub) == 0) {
7628 return(0);
7629 }
7630 tmp = xmlExpExpDeriveInt(ctxt, exp, sub);
7631 if (tmp == NULL)
7632 return(-1);
7633 if (tmp == forbiddenExp)
7634 return(0);
7635 if (tmp == emptyExp)
7636 return(1);
7637 if ((tmp != NULL) && (IS_NILLABLE(tmp))) {
7638 xmlExpFree(ctxt, tmp);
7639 return(1);
7640 }
7641 xmlExpFree(ctxt, tmp);
7642 return(0);
7643 }
7644
7645 /************************************************************************
7646 * *
7647 * Parsing expression *
7648 * *
7649 ************************************************************************/
7650
7651 static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt);
7652
7653 #undef CUR
7654 #define CUR (*ctxt->cur)
7655 #undef NEXT
7656 #define NEXT ctxt->cur++;
7657 #undef IS_BLANK
7658 #define IS_BLANK(c) ((c == ' ') || (c == '\n') || (c == '\r') || (c == '\t'))
7659 #define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++;
7660
7661 static int
xmlExpParseNumber(xmlExpCtxtPtr ctxt)7662 xmlExpParseNumber(xmlExpCtxtPtr ctxt) {
7663 int ret = 0;
7664
7665 SKIP_BLANKS
7666 if (CUR == '*') {
7667 NEXT
7668 return(-1);
7669 }
7670 if ((CUR < '0') || (CUR > '9'))
7671 return(-1);
7672 while ((CUR >= '0') && (CUR <= '9')) {
7673 ret = ret * 10 + (CUR - '0');
7674 NEXT
7675 }
7676 return(ret);
7677 }
7678
7679 static xmlExpNodePtr
xmlExpParseOr(xmlExpCtxtPtr ctxt)7680 xmlExpParseOr(xmlExpCtxtPtr ctxt) {
7681 const char *base;
7682 xmlExpNodePtr ret;
7683 const xmlChar *val;
7684
7685 SKIP_BLANKS
7686 base = ctxt->cur;
7687 if (*ctxt->cur == '(') {
7688 NEXT
7689 ret = xmlExpParseExpr(ctxt);
7690 SKIP_BLANKS
7691 if (*ctxt->cur != ')') {
7692 xmlExpFree(ctxt, ret);
7693 return(NULL);
7694 }
7695 NEXT;
7696 SKIP_BLANKS
7697 goto parse_quantifier;
7698 }
7699 while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') &&
7700 (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') &&
7701 (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}'))
7702 NEXT;
7703 val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base);
7704 if (val == NULL)
7705 return(NULL);
7706 ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0);
7707 if (ret == NULL)
7708 return(NULL);
7709 SKIP_BLANKS
7710 parse_quantifier:
7711 if (CUR == '{') {
7712 int min, max;
7713
7714 NEXT
7715 min = xmlExpParseNumber(ctxt);
7716 if (min < 0) {
7717 xmlExpFree(ctxt, ret);
7718 return(NULL);
7719 }
7720 SKIP_BLANKS
7721 if (CUR == ',') {
7722 NEXT
7723 max = xmlExpParseNumber(ctxt);
7724 SKIP_BLANKS
7725 } else
7726 max = min;
7727 if (CUR != '}') {
7728 xmlExpFree(ctxt, ret);
7729 return(NULL);
7730 }
7731 NEXT
7732 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7733 min, max);
7734 SKIP_BLANKS
7735 } else if (CUR == '?') {
7736 NEXT
7737 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7738 0, 1);
7739 SKIP_BLANKS
7740 } else if (CUR == '+') {
7741 NEXT
7742 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7743 1, -1);
7744 SKIP_BLANKS
7745 } else if (CUR == '*') {
7746 NEXT
7747 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7748 0, -1);
7749 SKIP_BLANKS
7750 }
7751 return(ret);
7752 }
7753
7754
7755 static xmlExpNodePtr
xmlExpParseSeq(xmlExpCtxtPtr ctxt)7756 xmlExpParseSeq(xmlExpCtxtPtr ctxt) {
7757 xmlExpNodePtr ret, right;
7758
7759 ret = xmlExpParseOr(ctxt);
7760 SKIP_BLANKS
7761 while (CUR == '|') {
7762 NEXT
7763 right = xmlExpParseOr(ctxt);
7764 if (right == NULL) {
7765 xmlExpFree(ctxt, ret);
7766 return(NULL);
7767 }
7768 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0);
7769 if (ret == NULL)
7770 return(NULL);
7771 }
7772 return(ret);
7773 }
7774
7775 static xmlExpNodePtr
xmlExpParseExpr(xmlExpCtxtPtr ctxt)7776 xmlExpParseExpr(xmlExpCtxtPtr ctxt) {
7777 xmlExpNodePtr ret, right;
7778
7779 ret = xmlExpParseSeq(ctxt);
7780 SKIP_BLANKS
7781 while (CUR == ',') {
7782 NEXT
7783 right = xmlExpParseSeq(ctxt);
7784 if (right == NULL) {
7785 xmlExpFree(ctxt, ret);
7786 return(NULL);
7787 }
7788 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0);
7789 if (ret == NULL)
7790 return(NULL);
7791 }
7792 return(ret);
7793 }
7794
7795 /**
7796 * xmlExpParse:
7797 * @ctxt: the expressions context
7798 * @expr: the 0 terminated string
7799 *
7800 * Minimal parser for regexps, it understand the following constructs
7801 * - string terminals
7802 * - choice operator |
7803 * - sequence operator ,
7804 * - subexpressions (...)
7805 * - usual cardinality operators + * and ?
7806 * - finite sequences { min, max }
7807 * - infinite sequences { min, * }
7808 * There is minimal checkings made especially no checking on strings values
7809 *
7810 * Returns a new expression or NULL in case of failure
7811 */
7812 xmlExpNodePtr
xmlExpParse(xmlExpCtxtPtr ctxt,const char * expr)7813 xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) {
7814 xmlExpNodePtr ret;
7815
7816 ctxt->expr = expr;
7817 ctxt->cur = expr;
7818
7819 ret = xmlExpParseExpr(ctxt);
7820 SKIP_BLANKS
7821 if (*ctxt->cur != 0) {
7822 xmlExpFree(ctxt, ret);
7823 return(NULL);
7824 }
7825 return(ret);
7826 }
7827
7828 static void
xmlExpDumpInt(xmlBufferPtr buf,xmlExpNodePtr expr,int glob)7829 xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) {
7830 xmlExpNodePtr c;
7831
7832 if (expr == NULL) return;
7833 if (glob) xmlBufferWriteChar(buf, "(");
7834 switch (expr->type) {
7835 case XML_EXP_EMPTY:
7836 xmlBufferWriteChar(buf, "empty");
7837 break;
7838 case XML_EXP_FORBID:
7839 xmlBufferWriteChar(buf, "forbidden");
7840 break;
7841 case XML_EXP_ATOM:
7842 xmlBufferWriteCHAR(buf, expr->exp_str);
7843 break;
7844 case XML_EXP_SEQ:
7845 c = expr->exp_left;
7846 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7847 xmlExpDumpInt(buf, c, 1);
7848 else
7849 xmlExpDumpInt(buf, c, 0);
7850 xmlBufferWriteChar(buf, " , ");
7851 c = expr->exp_right;
7852 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7853 xmlExpDumpInt(buf, c, 1);
7854 else
7855 xmlExpDumpInt(buf, c, 0);
7856 break;
7857 case XML_EXP_OR:
7858 c = expr->exp_left;
7859 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7860 xmlExpDumpInt(buf, c, 1);
7861 else
7862 xmlExpDumpInt(buf, c, 0);
7863 xmlBufferWriteChar(buf, " | ");
7864 c = expr->exp_right;
7865 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7866 xmlExpDumpInt(buf, c, 1);
7867 else
7868 xmlExpDumpInt(buf, c, 0);
7869 break;
7870 case XML_EXP_COUNT: {
7871 char rep[40];
7872
7873 c = expr->exp_left;
7874 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7875 xmlExpDumpInt(buf, c, 1);
7876 else
7877 xmlExpDumpInt(buf, c, 0);
7878 if ((expr->exp_min == 0) && (expr->exp_max == 1)) {
7879 rep[0] = '?';
7880 rep[1] = 0;
7881 } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) {
7882 rep[0] = '*';
7883 rep[1] = 0;
7884 } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) {
7885 rep[0] = '+';
7886 rep[1] = 0;
7887 } else if (expr->exp_max == expr->exp_min) {
7888 snprintf(rep, 39, "{%d}", expr->exp_min);
7889 } else if (expr->exp_max < 0) {
7890 snprintf(rep, 39, "{%d,inf}", expr->exp_min);
7891 } else {
7892 snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max);
7893 }
7894 rep[39] = 0;
7895 xmlBufferWriteChar(buf, rep);
7896 break;
7897 }
7898 default:
7899 break;
7900 }
7901 if (glob)
7902 xmlBufferWriteChar(buf, ")");
7903 }
7904 /**
7905 * xmlExpDump:
7906 * @buf: a buffer to receive the output
7907 * @expr: the compiled expression
7908 *
7909 * Serialize the expression as compiled to the buffer
7910 */
7911 void
xmlExpDump(xmlBufferPtr buf,xmlExpNodePtr expr)7912 xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) {
7913 if ((buf == NULL) || (expr == NULL))
7914 return;
7915 xmlExpDumpInt(buf, expr, 0);
7916 }
7917
7918 /**
7919 * xmlExpMaxToken:
7920 * @expr: a compiled expression
7921 *
7922 * Indicate the maximum number of input a expression can accept
7923 *
7924 * Returns the maximum length or -1 in case of error
7925 */
7926 int
xmlExpMaxToken(xmlExpNodePtr expr)7927 xmlExpMaxToken(xmlExpNodePtr expr) {
7928 if (expr == NULL)
7929 return(-1);
7930 return(expr->c_max);
7931 }
7932
7933 /**
7934 * xmlExpCtxtNbNodes:
7935 * @ctxt: an expression context
7936 *
7937 * Debugging facility provides the number of allocated nodes at a that point
7938 *
7939 * Returns the number of nodes in use or -1 in case of error
7940 */
7941 int
xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt)7942 xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) {
7943 if (ctxt == NULL)
7944 return(-1);
7945 return(ctxt->nb_nodes);
7946 }
7947
7948 /**
7949 * xmlExpCtxtNbCons:
7950 * @ctxt: an expression context
7951 *
7952 * Debugging facility provides the number of allocated nodes over lifetime
7953 *
7954 * Returns the number of nodes ever allocated or -1 in case of error
7955 */
7956 int
xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt)7957 xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) {
7958 if (ctxt == NULL)
7959 return(-1);
7960 return(ctxt->nb_cons);
7961 }
7962
7963 /** DOC_ENABLE */
7964 #endif /* LIBXML_EXPR_ENABLED */
7965
7966 #endif /* LIBXML_REGEXP_ENABLED */
7967