1 /*
2 * regexp.c: generic and extensible Regular Expression engine
3 *
4 * Basically designed with the purpose of compiling regexps for
5 * the variety of validation/schemas mechanisms now available in
6 * XML related specifications these include:
7 * - XML-1.0 DTD validation
8 * - XML Schemas structure part 1
9 * - XML Schemas Datatypes part 2 especially Appendix F
10 * - RELAX-NG/TREX i.e. the counter proposal
11 *
12 * See Copyright for the status of this software.
13 *
14 * Daniel Veillard <veillard@redhat.com>
15 */
16
17 #define IN_LIBXML
18 #include "libxml.h"
19
20 #ifdef LIBXML_REGEXP_ENABLED
21
22 /* #define DEBUG_ERR */
23
24 #include <stdio.h>
25 #include <string.h>
26 #ifdef HAVE_LIMITS_H
27 #include <limits.h>
28 #endif
29
30 #include <libxml/tree.h>
31 #include <libxml/parserInternals.h>
32 #include <libxml/xmlregexp.h>
33 #include <libxml/xmlautomata.h>
34 #include <libxml/xmlunicode.h>
35
36 #ifndef INT_MAX
37 #define INT_MAX 123456789 /* easy to flag and big enough for our needs */
38 #endif
39
40 /* #define DEBUG_REGEXP_GRAPH */
41 /* #define DEBUG_REGEXP_EXEC */
42 /* #define DEBUG_PUSH */
43 /* #define DEBUG_COMPACTION */
44
45 #define MAX_PUSH 10000000
46
47 #ifdef ERROR
48 #undef ERROR
49 #endif
50 #define ERROR(str) \
51 ctxt->error = XML_REGEXP_COMPILE_ERROR; \
52 xmlRegexpErrCompile(ctxt, str);
53 #define NEXT ctxt->cur++
54 #define CUR (*(ctxt->cur))
55 #define NXT(index) (ctxt->cur[index])
56
57 #define CUR_SCHAR(s, l) xmlStringCurrentChar(NULL, s, &l)
58 #define NEXTL(l) ctxt->cur += l;
59 #define XML_REG_STRING_SEPARATOR '|'
60 /*
61 * Need PREV to check on a '-' within a Character Group. May only be used
62 * when it's guaranteed that cur is not at the beginning of ctxt->string!
63 */
64 #define PREV (ctxt->cur[-1])
65
66 /**
67 * TODO:
68 *
69 * macro to flag unimplemented blocks
70 */
71 #define TODO \
72 xmlGenericError(xmlGenericErrorContext, \
73 "Unimplemented block at %s:%d\n", \
74 __FILE__, __LINE__);
75
76 /************************************************************************
77 * *
78 * Datatypes and structures *
79 * *
80 ************************************************************************/
81
82 /*
83 * Note: the order of the enums below is significant, do not shuffle
84 */
85 typedef enum {
86 XML_REGEXP_EPSILON = 1,
87 XML_REGEXP_CHARVAL,
88 XML_REGEXP_RANGES,
89 XML_REGEXP_SUBREG, /* used for () sub regexps */
90 XML_REGEXP_STRING,
91 XML_REGEXP_ANYCHAR, /* . */
92 XML_REGEXP_ANYSPACE, /* \s */
93 XML_REGEXP_NOTSPACE, /* \S */
94 XML_REGEXP_INITNAME, /* \l */
95 XML_REGEXP_NOTINITNAME, /* \L */
96 XML_REGEXP_NAMECHAR, /* \c */
97 XML_REGEXP_NOTNAMECHAR, /* \C */
98 XML_REGEXP_DECIMAL, /* \d */
99 XML_REGEXP_NOTDECIMAL, /* \D */
100 XML_REGEXP_REALCHAR, /* \w */
101 XML_REGEXP_NOTREALCHAR, /* \W */
102 XML_REGEXP_LETTER = 100,
103 XML_REGEXP_LETTER_UPPERCASE,
104 XML_REGEXP_LETTER_LOWERCASE,
105 XML_REGEXP_LETTER_TITLECASE,
106 XML_REGEXP_LETTER_MODIFIER,
107 XML_REGEXP_LETTER_OTHERS,
108 XML_REGEXP_MARK,
109 XML_REGEXP_MARK_NONSPACING,
110 XML_REGEXP_MARK_SPACECOMBINING,
111 XML_REGEXP_MARK_ENCLOSING,
112 XML_REGEXP_NUMBER,
113 XML_REGEXP_NUMBER_DECIMAL,
114 XML_REGEXP_NUMBER_LETTER,
115 XML_REGEXP_NUMBER_OTHERS,
116 XML_REGEXP_PUNCT,
117 XML_REGEXP_PUNCT_CONNECTOR,
118 XML_REGEXP_PUNCT_DASH,
119 XML_REGEXP_PUNCT_OPEN,
120 XML_REGEXP_PUNCT_CLOSE,
121 XML_REGEXP_PUNCT_INITQUOTE,
122 XML_REGEXP_PUNCT_FINQUOTE,
123 XML_REGEXP_PUNCT_OTHERS,
124 XML_REGEXP_SEPAR,
125 XML_REGEXP_SEPAR_SPACE,
126 XML_REGEXP_SEPAR_LINE,
127 XML_REGEXP_SEPAR_PARA,
128 XML_REGEXP_SYMBOL,
129 XML_REGEXP_SYMBOL_MATH,
130 XML_REGEXP_SYMBOL_CURRENCY,
131 XML_REGEXP_SYMBOL_MODIFIER,
132 XML_REGEXP_SYMBOL_OTHERS,
133 XML_REGEXP_OTHER,
134 XML_REGEXP_OTHER_CONTROL,
135 XML_REGEXP_OTHER_FORMAT,
136 XML_REGEXP_OTHER_PRIVATE,
137 XML_REGEXP_OTHER_NA,
138 XML_REGEXP_BLOCK_NAME
139 } xmlRegAtomType;
140
141 typedef enum {
142 XML_REGEXP_QUANT_EPSILON = 1,
143 XML_REGEXP_QUANT_ONCE,
144 XML_REGEXP_QUANT_OPT,
145 XML_REGEXP_QUANT_MULT,
146 XML_REGEXP_QUANT_PLUS,
147 XML_REGEXP_QUANT_ONCEONLY,
148 XML_REGEXP_QUANT_ALL,
149 XML_REGEXP_QUANT_RANGE
150 } xmlRegQuantType;
151
152 typedef enum {
153 XML_REGEXP_START_STATE = 1,
154 XML_REGEXP_FINAL_STATE,
155 XML_REGEXP_TRANS_STATE,
156 XML_REGEXP_SINK_STATE,
157 XML_REGEXP_UNREACH_STATE
158 } xmlRegStateType;
159
160 typedef enum {
161 XML_REGEXP_MARK_NORMAL = 0,
162 XML_REGEXP_MARK_START,
163 XML_REGEXP_MARK_VISITED
164 } xmlRegMarkedType;
165
166 typedef struct _xmlRegRange xmlRegRange;
167 typedef xmlRegRange *xmlRegRangePtr;
168
169 struct _xmlRegRange {
170 int neg; /* 0 normal, 1 not, 2 exclude */
171 xmlRegAtomType type;
172 int start;
173 int end;
174 xmlChar *blockName;
175 };
176
177 typedef struct _xmlRegAtom xmlRegAtom;
178 typedef xmlRegAtom *xmlRegAtomPtr;
179
180 typedef struct _xmlAutomataState xmlRegState;
181 typedef xmlRegState *xmlRegStatePtr;
182
183 struct _xmlRegAtom {
184 int no;
185 xmlRegAtomType type;
186 xmlRegQuantType quant;
187 int min;
188 int max;
189
190 void *valuep;
191 void *valuep2;
192 int neg;
193 int codepoint;
194 xmlRegStatePtr start;
195 xmlRegStatePtr start0;
196 xmlRegStatePtr stop;
197 int maxRanges;
198 int nbRanges;
199 xmlRegRangePtr *ranges;
200 void *data;
201 };
202
203 typedef struct _xmlRegCounter xmlRegCounter;
204 typedef xmlRegCounter *xmlRegCounterPtr;
205
206 struct _xmlRegCounter {
207 int min;
208 int max;
209 };
210
211 typedef struct _xmlRegTrans xmlRegTrans;
212 typedef xmlRegTrans *xmlRegTransPtr;
213
214 struct _xmlRegTrans {
215 xmlRegAtomPtr atom;
216 int to;
217 int counter;
218 int count;
219 int nd;
220 };
221
222 struct _xmlAutomataState {
223 xmlRegStateType type;
224 xmlRegMarkedType mark;
225 xmlRegMarkedType markd;
226 xmlRegMarkedType reached;
227 int no;
228 int maxTrans;
229 int nbTrans;
230 xmlRegTrans *trans;
231 /* knowing states pointing to us can speed things up */
232 int maxTransTo;
233 int nbTransTo;
234 int *transTo;
235 };
236
237 typedef struct _xmlAutomata xmlRegParserCtxt;
238 typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;
239
240 #define AM_AUTOMATA_RNG 1
241
242 struct _xmlAutomata {
243 xmlChar *string;
244 xmlChar *cur;
245
246 int error;
247 int neg;
248
249 xmlRegStatePtr start;
250 xmlRegStatePtr end;
251 xmlRegStatePtr state;
252
253 xmlRegAtomPtr atom;
254
255 int maxAtoms;
256 int nbAtoms;
257 xmlRegAtomPtr *atoms;
258
259 int maxStates;
260 int nbStates;
261 xmlRegStatePtr *states;
262
263 int maxCounters;
264 int nbCounters;
265 xmlRegCounter *counters;
266
267 int determinist;
268 int negs;
269 int flags;
270 };
271
272 struct _xmlRegexp {
273 xmlChar *string;
274 int nbStates;
275 xmlRegStatePtr *states;
276 int nbAtoms;
277 xmlRegAtomPtr *atoms;
278 int nbCounters;
279 xmlRegCounter *counters;
280 int determinist;
281 int flags;
282 /*
283 * That's the compact form for determinists automatas
284 */
285 int nbstates;
286 int *compact;
287 void **transdata;
288 int nbstrings;
289 xmlChar **stringMap;
290 };
291
292 typedef struct _xmlRegExecRollback xmlRegExecRollback;
293 typedef xmlRegExecRollback *xmlRegExecRollbackPtr;
294
295 struct _xmlRegExecRollback {
296 xmlRegStatePtr state;/* the current state */
297 int index; /* the index in the input stack */
298 int nextbranch; /* the next transition to explore in that state */
299 int *counts; /* save the automata state if it has some */
300 };
301
302 typedef struct _xmlRegInputToken xmlRegInputToken;
303 typedef xmlRegInputToken *xmlRegInputTokenPtr;
304
305 struct _xmlRegInputToken {
306 xmlChar *value;
307 void *data;
308 };
309
310 struct _xmlRegExecCtxt {
311 int status; /* execution status != 0 indicate an error */
312 int determinist; /* did we find an indeterministic behaviour */
313 xmlRegexpPtr comp; /* the compiled regexp */
314 xmlRegExecCallbacks callback;
315 void *data;
316
317 xmlRegStatePtr state;/* the current state */
318 int transno; /* the current transition on that state */
319 int transcount; /* the number of chars in char counted transitions */
320
321 /*
322 * A stack of rollback states
323 */
324 int maxRollbacks;
325 int nbRollbacks;
326 xmlRegExecRollback *rollbacks;
327
328 /*
329 * The state of the automata if any
330 */
331 int *counts;
332
333 /*
334 * The input stack
335 */
336 int inputStackMax;
337 int inputStackNr;
338 int index;
339 int *charStack;
340 const xmlChar *inputString; /* when operating on characters */
341 xmlRegInputTokenPtr inputStack;/* when operating on strings */
342
343 /*
344 * error handling
345 */
346 int errStateNo; /* the error state number */
347 xmlRegStatePtr errState; /* the error state */
348 xmlChar *errString; /* the string raising the error */
349 int *errCounts; /* counters at the error state */
350 int nbPush;
351 };
352
353 #define REGEXP_ALL_COUNTER 0x123456
354 #define REGEXP_ALL_LAX_COUNTER 0x123457
355
356 static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
357 static void xmlRegFreeState(xmlRegStatePtr state);
358 static void xmlRegFreeAtom(xmlRegAtomPtr atom);
359 static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr);
360 static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint);
361 static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint,
362 int neg, int start, int end, const xmlChar *blockName);
363
364 void xmlAutomataSetFlags(xmlAutomataPtr am, int flags);
365
366 /************************************************************************
367 * *
368 * Regexp memory error handler *
369 * *
370 ************************************************************************/
371 /**
372 * xmlRegexpErrMemory:
373 * @extra: extra information
374 *
375 * Handle an out of memory condition
376 */
377 static void
xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt,const char * extra)378 xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt, const char *extra)
379 {
380 const char *regexp = NULL;
381 if (ctxt != NULL) {
382 regexp = (const char *) ctxt->string;
383 ctxt->error = XML_ERR_NO_MEMORY;
384 }
385 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
386 XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
387 regexp, NULL, 0, 0,
388 "Memory allocation failed : %s\n", extra);
389 }
390
391 /**
392 * xmlRegexpErrCompile:
393 * @extra: extra information
394 *
395 * Handle a compilation failure
396 */
397 static void
xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt,const char * extra)398 xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra)
399 {
400 const char *regexp = NULL;
401 int idx = 0;
402
403 if (ctxt != NULL) {
404 regexp = (const char *) ctxt->string;
405 idx = ctxt->cur - ctxt->string;
406 ctxt->error = XML_REGEXP_COMPILE_ERROR;
407 }
408 __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
409 XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL, NULL, 0, extra,
410 regexp, NULL, idx, 0,
411 "failed to compile: %s\n", extra);
412 }
413
414 /************************************************************************
415 * *
416 * Allocation/Deallocation *
417 * *
418 ************************************************************************/
419
420 static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
421 /**
422 * xmlRegEpxFromParse:
423 * @ctxt: the parser context used to build it
424 *
425 * Allocate a new regexp and fill it with the result from the parser
426 *
427 * Returns the new regexp or NULL in case of error
428 */
429 static xmlRegexpPtr
xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt)430 xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
431 xmlRegexpPtr ret;
432
433 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
434 if (ret == NULL) {
435 xmlRegexpErrMemory(ctxt, "compiling regexp");
436 return(NULL);
437 }
438 memset(ret, 0, sizeof(xmlRegexp));
439 ret->string = ctxt->string;
440 ret->nbStates = ctxt->nbStates;
441 ret->states = ctxt->states;
442 ret->nbAtoms = ctxt->nbAtoms;
443 ret->atoms = ctxt->atoms;
444 ret->nbCounters = ctxt->nbCounters;
445 ret->counters = ctxt->counters;
446 ret->determinist = ctxt->determinist;
447 ret->flags = ctxt->flags;
448 if (ret->determinist == -1) {
449 xmlRegexpIsDeterminist(ret);
450 }
451
452 if ((ret->determinist != 0) &&
453 (ret->nbCounters == 0) &&
454 (ctxt->negs == 0) &&
455 (ret->atoms != NULL) &&
456 (ret->atoms[0] != NULL) &&
457 (ret->atoms[0]->type == XML_REGEXP_STRING)) {
458 int i, j, nbstates = 0, nbatoms = 0;
459 int *stateRemap;
460 int *stringRemap;
461 int *transitions;
462 void **transdata;
463 xmlChar **stringMap;
464 xmlChar *value;
465
466 /*
467 * Switch to a compact representation
468 * 1/ counting the effective number of states left
469 * 2/ counting the unique number of atoms, and check that
470 * they are all of the string type
471 * 3/ build a table state x atom for the transitions
472 */
473
474 stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
475 if (stateRemap == NULL) {
476 xmlRegexpErrMemory(ctxt, "compiling regexp");
477 xmlFree(ret);
478 return(NULL);
479 }
480 for (i = 0;i < ret->nbStates;i++) {
481 if (ret->states[i] != NULL) {
482 stateRemap[i] = nbstates;
483 nbstates++;
484 } else {
485 stateRemap[i] = -1;
486 }
487 }
488 #ifdef DEBUG_COMPACTION
489 printf("Final: %d states\n", nbstates);
490 #endif
491 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
492 if (stringMap == NULL) {
493 xmlRegexpErrMemory(ctxt, "compiling regexp");
494 xmlFree(stateRemap);
495 xmlFree(ret);
496 return(NULL);
497 }
498 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
499 if (stringRemap == NULL) {
500 xmlRegexpErrMemory(ctxt, "compiling regexp");
501 xmlFree(stringMap);
502 xmlFree(stateRemap);
503 xmlFree(ret);
504 return(NULL);
505 }
506 for (i = 0;i < ret->nbAtoms;i++) {
507 if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
508 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
509 value = ret->atoms[i]->valuep;
510 for (j = 0;j < nbatoms;j++) {
511 if (xmlStrEqual(stringMap[j], value)) {
512 stringRemap[i] = j;
513 break;
514 }
515 }
516 if (j >= nbatoms) {
517 stringRemap[i] = nbatoms;
518 stringMap[nbatoms] = xmlStrdup(value);
519 if (stringMap[nbatoms] == NULL) {
520 for (i = 0;i < nbatoms;i++)
521 xmlFree(stringMap[i]);
522 xmlFree(stringRemap);
523 xmlFree(stringMap);
524 xmlFree(stateRemap);
525 xmlFree(ret);
526 return(NULL);
527 }
528 nbatoms++;
529 }
530 } else {
531 xmlFree(stateRemap);
532 xmlFree(stringRemap);
533 for (i = 0;i < nbatoms;i++)
534 xmlFree(stringMap[i]);
535 xmlFree(stringMap);
536 xmlFree(ret);
537 return(NULL);
538 }
539 }
540 #ifdef DEBUG_COMPACTION
541 printf("Final: %d atoms\n", nbatoms);
542 #endif
543 transitions = (int *) xmlMalloc((nbstates + 1) *
544 (nbatoms + 1) * sizeof(int));
545 if (transitions == NULL) {
546 xmlFree(stateRemap);
547 xmlFree(stringRemap);
548 for (i = 0;i < nbatoms;i++)
549 xmlFree(stringMap[i]);
550 xmlFree(stringMap);
551 xmlFree(ret);
552 return(NULL);
553 }
554 memset(transitions, 0, (nbstates + 1) * (nbatoms + 1) * sizeof(int));
555
556 /*
557 * Allocate the transition table. The first entry for each
558 * state corresponds to the state type.
559 */
560 transdata = NULL;
561
562 for (i = 0;i < ret->nbStates;i++) {
563 int stateno, atomno, targetno, prev;
564 xmlRegStatePtr state;
565 xmlRegTransPtr trans;
566
567 stateno = stateRemap[i];
568 if (stateno == -1)
569 continue;
570 state = ret->states[i];
571
572 transitions[stateno * (nbatoms + 1)] = state->type;
573
574 for (j = 0;j < state->nbTrans;j++) {
575 trans = &(state->trans[j]);
576 if ((trans->to == -1) || (trans->atom == NULL))
577 continue;
578 atomno = stringRemap[trans->atom->no];
579 if ((trans->atom->data != NULL) && (transdata == NULL)) {
580 transdata = (void **) xmlMalloc(nbstates * nbatoms *
581 sizeof(void *));
582 if (transdata != NULL)
583 memset(transdata, 0,
584 nbstates * nbatoms * sizeof(void *));
585 else {
586 xmlRegexpErrMemory(ctxt, "compiling regexp");
587 break;
588 }
589 }
590 targetno = stateRemap[trans->to];
591 /*
592 * if the same atom can generate transitions to 2 different
593 * states then it means the automata is not deterministic and
594 * the compact form can't be used !
595 */
596 prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
597 if (prev != 0) {
598 if (prev != targetno + 1) {
599 ret->determinist = 0;
600 #ifdef DEBUG_COMPACTION
601 printf("Indet: state %d trans %d, atom %d to %d : %d to %d\n",
602 i, j, trans->atom->no, trans->to, atomno, targetno);
603 printf(" previous to is %d\n", prev);
604 #endif
605 if (transdata != NULL)
606 xmlFree(transdata);
607 xmlFree(transitions);
608 xmlFree(stateRemap);
609 xmlFree(stringRemap);
610 for (i = 0;i < nbatoms;i++)
611 xmlFree(stringMap[i]);
612 xmlFree(stringMap);
613 goto not_determ;
614 }
615 } else {
616 #if 0
617 printf("State %d trans %d: atom %d to %d : %d to %d\n",
618 i, j, trans->atom->no, trans->to, atomno, targetno);
619 #endif
620 transitions[stateno * (nbatoms + 1) + atomno + 1] =
621 targetno + 1; /* to avoid 0 */
622 if (transdata != NULL)
623 transdata[stateno * nbatoms + atomno] =
624 trans->atom->data;
625 }
626 }
627 }
628 ret->determinist = 1;
629 #ifdef DEBUG_COMPACTION
630 /*
631 * Debug
632 */
633 for (i = 0;i < nbstates;i++) {
634 for (j = 0;j < nbatoms + 1;j++) {
635 printf("%02d ", transitions[i * (nbatoms + 1) + j]);
636 }
637 printf("\n");
638 }
639 printf("\n");
640 #endif
641 /*
642 * Cleanup of the old data
643 */
644 if (ret->states != NULL) {
645 for (i = 0;i < ret->nbStates;i++)
646 xmlRegFreeState(ret->states[i]);
647 xmlFree(ret->states);
648 }
649 ret->states = NULL;
650 ret->nbStates = 0;
651 if (ret->atoms != NULL) {
652 for (i = 0;i < ret->nbAtoms;i++)
653 xmlRegFreeAtom(ret->atoms[i]);
654 xmlFree(ret->atoms);
655 }
656 ret->atoms = NULL;
657 ret->nbAtoms = 0;
658
659 ret->compact = transitions;
660 ret->transdata = transdata;
661 ret->stringMap = stringMap;
662 ret->nbstrings = nbatoms;
663 ret->nbstates = nbstates;
664 xmlFree(stateRemap);
665 xmlFree(stringRemap);
666 }
667 not_determ:
668 ctxt->string = NULL;
669 ctxt->nbStates = 0;
670 ctxt->states = NULL;
671 ctxt->nbAtoms = 0;
672 ctxt->atoms = NULL;
673 ctxt->nbCounters = 0;
674 ctxt->counters = NULL;
675 return(ret);
676 }
677
678 /**
679 * xmlRegNewParserCtxt:
680 * @string: the string to parse
681 *
682 * Allocate a new regexp parser context
683 *
684 * Returns the new context or NULL in case of error
685 */
686 static xmlRegParserCtxtPtr
xmlRegNewParserCtxt(const xmlChar * string)687 xmlRegNewParserCtxt(const xmlChar *string) {
688 xmlRegParserCtxtPtr ret;
689
690 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
691 if (ret == NULL)
692 return(NULL);
693 memset(ret, 0, sizeof(xmlRegParserCtxt));
694 if (string != NULL)
695 ret->string = xmlStrdup(string);
696 ret->cur = ret->string;
697 ret->neg = 0;
698 ret->negs = 0;
699 ret->error = 0;
700 ret->determinist = -1;
701 return(ret);
702 }
703
704 /**
705 * xmlRegNewRange:
706 * @ctxt: the regexp parser context
707 * @neg: is that negative
708 * @type: the type of range
709 * @start: the start codepoint
710 * @end: the end codepoint
711 *
712 * Allocate a new regexp range
713 *
714 * Returns the new range or NULL in case of error
715 */
716 static xmlRegRangePtr
xmlRegNewRange(xmlRegParserCtxtPtr ctxt,int neg,xmlRegAtomType type,int start,int end)717 xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
718 int neg, xmlRegAtomType type, int start, int end) {
719 xmlRegRangePtr ret;
720
721 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
722 if (ret == NULL) {
723 xmlRegexpErrMemory(ctxt, "allocating range");
724 return(NULL);
725 }
726 ret->neg = neg;
727 ret->type = type;
728 ret->start = start;
729 ret->end = end;
730 return(ret);
731 }
732
733 /**
734 * xmlRegFreeRange:
735 * @range: the regexp range
736 *
737 * Free a regexp range
738 */
739 static void
xmlRegFreeRange(xmlRegRangePtr range)740 xmlRegFreeRange(xmlRegRangePtr range) {
741 if (range == NULL)
742 return;
743
744 if (range->blockName != NULL)
745 xmlFree(range->blockName);
746 xmlFree(range);
747 }
748
749 /**
750 * xmlRegCopyRange:
751 * @range: the regexp range
752 *
753 * Copy a regexp range
754 *
755 * Returns the new copy or NULL in case of error.
756 */
757 static xmlRegRangePtr
xmlRegCopyRange(xmlRegParserCtxtPtr ctxt,xmlRegRangePtr range)758 xmlRegCopyRange(xmlRegParserCtxtPtr ctxt, xmlRegRangePtr range) {
759 xmlRegRangePtr ret;
760
761 if (range == NULL)
762 return(NULL);
763
764 ret = xmlRegNewRange(ctxt, range->neg, range->type, range->start,
765 range->end);
766 if (ret == NULL)
767 return(NULL);
768 if (range->blockName != NULL) {
769 ret->blockName = xmlStrdup(range->blockName);
770 if (ret->blockName == NULL) {
771 xmlRegexpErrMemory(ctxt, "allocating range");
772 xmlRegFreeRange(ret);
773 return(NULL);
774 }
775 }
776 return(ret);
777 }
778
779 /**
780 * xmlRegNewAtom:
781 * @ctxt: the regexp parser context
782 * @type: the type of atom
783 *
784 * Allocate a new atom
785 *
786 * Returns the new atom or NULL in case of error
787 */
788 static xmlRegAtomPtr
xmlRegNewAtom(xmlRegParserCtxtPtr ctxt,xmlRegAtomType type)789 xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
790 xmlRegAtomPtr ret;
791
792 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
793 if (ret == NULL) {
794 xmlRegexpErrMemory(ctxt, "allocating atom");
795 return(NULL);
796 }
797 memset(ret, 0, sizeof(xmlRegAtom));
798 ret->type = type;
799 ret->quant = XML_REGEXP_QUANT_ONCE;
800 ret->min = 0;
801 ret->max = 0;
802 return(ret);
803 }
804
805 /**
806 * xmlRegFreeAtom:
807 * @atom: the regexp atom
808 *
809 * Free a regexp atom
810 */
811 static void
xmlRegFreeAtom(xmlRegAtomPtr atom)812 xmlRegFreeAtom(xmlRegAtomPtr atom) {
813 int i;
814
815 if (atom == NULL)
816 return;
817
818 for (i = 0;i < atom->nbRanges;i++)
819 xmlRegFreeRange(atom->ranges[i]);
820 if (atom->ranges != NULL)
821 xmlFree(atom->ranges);
822 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL))
823 xmlFree(atom->valuep);
824 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL))
825 xmlFree(atom->valuep2);
826 if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL))
827 xmlFree(atom->valuep);
828 xmlFree(atom);
829 }
830
831 /**
832 * xmlRegCopyAtom:
833 * @ctxt: the regexp parser context
834 * @atom: the original atom
835 *
836 * Allocate a new regexp range
837 *
838 * Returns the new atom or NULL in case of error
839 */
840 static xmlRegAtomPtr
xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt,xmlRegAtomPtr atom)841 xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
842 xmlRegAtomPtr ret;
843
844 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
845 if (ret == NULL) {
846 xmlRegexpErrMemory(ctxt, "copying atom");
847 return(NULL);
848 }
849 memset(ret, 0, sizeof(xmlRegAtom));
850 ret->type = atom->type;
851 ret->quant = atom->quant;
852 ret->min = atom->min;
853 ret->max = atom->max;
854 if (atom->nbRanges > 0) {
855 int i;
856
857 ret->ranges = (xmlRegRangePtr *) xmlMalloc(sizeof(xmlRegRangePtr) *
858 atom->nbRanges);
859 if (ret->ranges == NULL) {
860 xmlRegexpErrMemory(ctxt, "copying atom");
861 goto error;
862 }
863 for (i = 0;i < atom->nbRanges;i++) {
864 ret->ranges[i] = xmlRegCopyRange(ctxt, atom->ranges[i]);
865 if (ret->ranges[i] == NULL)
866 goto error;
867 ret->nbRanges = i + 1;
868 }
869 }
870 return(ret);
871
872 error:
873 xmlRegFreeAtom(ret);
874 return(NULL);
875 }
876
877 static xmlRegStatePtr
xmlRegNewState(xmlRegParserCtxtPtr ctxt)878 xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
879 xmlRegStatePtr ret;
880
881 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
882 if (ret == NULL) {
883 xmlRegexpErrMemory(ctxt, "allocating state");
884 return(NULL);
885 }
886 memset(ret, 0, sizeof(xmlRegState));
887 ret->type = XML_REGEXP_TRANS_STATE;
888 ret->mark = XML_REGEXP_MARK_NORMAL;
889 return(ret);
890 }
891
892 /**
893 * xmlRegFreeState:
894 * @state: the regexp state
895 *
896 * Free a regexp state
897 */
898 static void
xmlRegFreeState(xmlRegStatePtr state)899 xmlRegFreeState(xmlRegStatePtr state) {
900 if (state == NULL)
901 return;
902
903 if (state->trans != NULL)
904 xmlFree(state->trans);
905 if (state->transTo != NULL)
906 xmlFree(state->transTo);
907 xmlFree(state);
908 }
909
910 /**
911 * xmlRegFreeParserCtxt:
912 * @ctxt: the regexp parser context
913 *
914 * Free a regexp parser context
915 */
916 static void
xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt)917 xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
918 int i;
919 if (ctxt == NULL)
920 return;
921
922 if (ctxt->string != NULL)
923 xmlFree(ctxt->string);
924 if (ctxt->states != NULL) {
925 for (i = 0;i < ctxt->nbStates;i++)
926 xmlRegFreeState(ctxt->states[i]);
927 xmlFree(ctxt->states);
928 }
929 if (ctxt->atoms != NULL) {
930 for (i = 0;i < ctxt->nbAtoms;i++)
931 xmlRegFreeAtom(ctxt->atoms[i]);
932 xmlFree(ctxt->atoms);
933 }
934 if (ctxt->counters != NULL)
935 xmlFree(ctxt->counters);
936 xmlFree(ctxt);
937 }
938
939 /************************************************************************
940 * *
941 * Display of Data structures *
942 * *
943 ************************************************************************/
944
945 static void
xmlRegPrintAtomType(FILE * output,xmlRegAtomType type)946 xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
947 switch (type) {
948 case XML_REGEXP_EPSILON:
949 fprintf(output, "epsilon "); break;
950 case XML_REGEXP_CHARVAL:
951 fprintf(output, "charval "); break;
952 case XML_REGEXP_RANGES:
953 fprintf(output, "ranges "); break;
954 case XML_REGEXP_SUBREG:
955 fprintf(output, "subexpr "); break;
956 case XML_REGEXP_STRING:
957 fprintf(output, "string "); break;
958 case XML_REGEXP_ANYCHAR:
959 fprintf(output, "anychar "); break;
960 case XML_REGEXP_ANYSPACE:
961 fprintf(output, "anyspace "); break;
962 case XML_REGEXP_NOTSPACE:
963 fprintf(output, "notspace "); break;
964 case XML_REGEXP_INITNAME:
965 fprintf(output, "initname "); break;
966 case XML_REGEXP_NOTINITNAME:
967 fprintf(output, "notinitname "); break;
968 case XML_REGEXP_NAMECHAR:
969 fprintf(output, "namechar "); break;
970 case XML_REGEXP_NOTNAMECHAR:
971 fprintf(output, "notnamechar "); break;
972 case XML_REGEXP_DECIMAL:
973 fprintf(output, "decimal "); break;
974 case XML_REGEXP_NOTDECIMAL:
975 fprintf(output, "notdecimal "); break;
976 case XML_REGEXP_REALCHAR:
977 fprintf(output, "realchar "); break;
978 case XML_REGEXP_NOTREALCHAR:
979 fprintf(output, "notrealchar "); break;
980 case XML_REGEXP_LETTER:
981 fprintf(output, "LETTER "); break;
982 case XML_REGEXP_LETTER_UPPERCASE:
983 fprintf(output, "LETTER_UPPERCASE "); break;
984 case XML_REGEXP_LETTER_LOWERCASE:
985 fprintf(output, "LETTER_LOWERCASE "); break;
986 case XML_REGEXP_LETTER_TITLECASE:
987 fprintf(output, "LETTER_TITLECASE "); break;
988 case XML_REGEXP_LETTER_MODIFIER:
989 fprintf(output, "LETTER_MODIFIER "); break;
990 case XML_REGEXP_LETTER_OTHERS:
991 fprintf(output, "LETTER_OTHERS "); break;
992 case XML_REGEXP_MARK:
993 fprintf(output, "MARK "); break;
994 case XML_REGEXP_MARK_NONSPACING:
995 fprintf(output, "MARK_NONSPACING "); break;
996 case XML_REGEXP_MARK_SPACECOMBINING:
997 fprintf(output, "MARK_SPACECOMBINING "); break;
998 case XML_REGEXP_MARK_ENCLOSING:
999 fprintf(output, "MARK_ENCLOSING "); break;
1000 case XML_REGEXP_NUMBER:
1001 fprintf(output, "NUMBER "); break;
1002 case XML_REGEXP_NUMBER_DECIMAL:
1003 fprintf(output, "NUMBER_DECIMAL "); break;
1004 case XML_REGEXP_NUMBER_LETTER:
1005 fprintf(output, "NUMBER_LETTER "); break;
1006 case XML_REGEXP_NUMBER_OTHERS:
1007 fprintf(output, "NUMBER_OTHERS "); break;
1008 case XML_REGEXP_PUNCT:
1009 fprintf(output, "PUNCT "); break;
1010 case XML_REGEXP_PUNCT_CONNECTOR:
1011 fprintf(output, "PUNCT_CONNECTOR "); break;
1012 case XML_REGEXP_PUNCT_DASH:
1013 fprintf(output, "PUNCT_DASH "); break;
1014 case XML_REGEXP_PUNCT_OPEN:
1015 fprintf(output, "PUNCT_OPEN "); break;
1016 case XML_REGEXP_PUNCT_CLOSE:
1017 fprintf(output, "PUNCT_CLOSE "); break;
1018 case XML_REGEXP_PUNCT_INITQUOTE:
1019 fprintf(output, "PUNCT_INITQUOTE "); break;
1020 case XML_REGEXP_PUNCT_FINQUOTE:
1021 fprintf(output, "PUNCT_FINQUOTE "); break;
1022 case XML_REGEXP_PUNCT_OTHERS:
1023 fprintf(output, "PUNCT_OTHERS "); break;
1024 case XML_REGEXP_SEPAR:
1025 fprintf(output, "SEPAR "); break;
1026 case XML_REGEXP_SEPAR_SPACE:
1027 fprintf(output, "SEPAR_SPACE "); break;
1028 case XML_REGEXP_SEPAR_LINE:
1029 fprintf(output, "SEPAR_LINE "); break;
1030 case XML_REGEXP_SEPAR_PARA:
1031 fprintf(output, "SEPAR_PARA "); break;
1032 case XML_REGEXP_SYMBOL:
1033 fprintf(output, "SYMBOL "); break;
1034 case XML_REGEXP_SYMBOL_MATH:
1035 fprintf(output, "SYMBOL_MATH "); break;
1036 case XML_REGEXP_SYMBOL_CURRENCY:
1037 fprintf(output, "SYMBOL_CURRENCY "); break;
1038 case XML_REGEXP_SYMBOL_MODIFIER:
1039 fprintf(output, "SYMBOL_MODIFIER "); break;
1040 case XML_REGEXP_SYMBOL_OTHERS:
1041 fprintf(output, "SYMBOL_OTHERS "); break;
1042 case XML_REGEXP_OTHER:
1043 fprintf(output, "OTHER "); break;
1044 case XML_REGEXP_OTHER_CONTROL:
1045 fprintf(output, "OTHER_CONTROL "); break;
1046 case XML_REGEXP_OTHER_FORMAT:
1047 fprintf(output, "OTHER_FORMAT "); break;
1048 case XML_REGEXP_OTHER_PRIVATE:
1049 fprintf(output, "OTHER_PRIVATE "); break;
1050 case XML_REGEXP_OTHER_NA:
1051 fprintf(output, "OTHER_NA "); break;
1052 case XML_REGEXP_BLOCK_NAME:
1053 fprintf(output, "BLOCK "); break;
1054 }
1055 }
1056
1057 static void
xmlRegPrintQuantType(FILE * output,xmlRegQuantType type)1058 xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
1059 switch (type) {
1060 case XML_REGEXP_QUANT_EPSILON:
1061 fprintf(output, "epsilon "); break;
1062 case XML_REGEXP_QUANT_ONCE:
1063 fprintf(output, "once "); break;
1064 case XML_REGEXP_QUANT_OPT:
1065 fprintf(output, "? "); break;
1066 case XML_REGEXP_QUANT_MULT:
1067 fprintf(output, "* "); break;
1068 case XML_REGEXP_QUANT_PLUS:
1069 fprintf(output, "+ "); break;
1070 case XML_REGEXP_QUANT_RANGE:
1071 fprintf(output, "range "); break;
1072 case XML_REGEXP_QUANT_ONCEONLY:
1073 fprintf(output, "onceonly "); break;
1074 case XML_REGEXP_QUANT_ALL:
1075 fprintf(output, "all "); break;
1076 }
1077 }
1078 static void
xmlRegPrintRange(FILE * output,xmlRegRangePtr range)1079 xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
1080 fprintf(output, " range: ");
1081 if (range->neg)
1082 fprintf(output, "negative ");
1083 xmlRegPrintAtomType(output, range->type);
1084 fprintf(output, "%c - %c\n", range->start, range->end);
1085 }
1086
1087 static void
xmlRegPrintAtom(FILE * output,xmlRegAtomPtr atom)1088 xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
1089 fprintf(output, " atom: ");
1090 if (atom == NULL) {
1091 fprintf(output, "NULL\n");
1092 return;
1093 }
1094 if (atom->neg)
1095 fprintf(output, "not ");
1096 xmlRegPrintAtomType(output, atom->type);
1097 xmlRegPrintQuantType(output, atom->quant);
1098 if (atom->quant == XML_REGEXP_QUANT_RANGE)
1099 fprintf(output, "%d-%d ", atom->min, atom->max);
1100 if (atom->type == XML_REGEXP_STRING)
1101 fprintf(output, "'%s' ", (char *) atom->valuep);
1102 if (atom->type == XML_REGEXP_CHARVAL)
1103 fprintf(output, "char %c\n", atom->codepoint);
1104 else if (atom->type == XML_REGEXP_RANGES) {
1105 int i;
1106 fprintf(output, "%d entries\n", atom->nbRanges);
1107 for (i = 0; i < atom->nbRanges;i++)
1108 xmlRegPrintRange(output, atom->ranges[i]);
1109 } else if (atom->type == XML_REGEXP_SUBREG) {
1110 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
1111 } else {
1112 fprintf(output, "\n");
1113 }
1114 }
1115
1116 static void
xmlRegPrintTrans(FILE * output,xmlRegTransPtr trans)1117 xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
1118 fprintf(output, " trans: ");
1119 if (trans == NULL) {
1120 fprintf(output, "NULL\n");
1121 return;
1122 }
1123 if (trans->to < 0) {
1124 fprintf(output, "removed\n");
1125 return;
1126 }
1127 if (trans->nd != 0) {
1128 if (trans->nd == 2)
1129 fprintf(output, "last not determinist, ");
1130 else
1131 fprintf(output, "not determinist, ");
1132 }
1133 if (trans->counter >= 0) {
1134 fprintf(output, "counted %d, ", trans->counter);
1135 }
1136 if (trans->count == REGEXP_ALL_COUNTER) {
1137 fprintf(output, "all transition, ");
1138 } else if (trans->count >= 0) {
1139 fprintf(output, "count based %d, ", trans->count);
1140 }
1141 if (trans->atom == NULL) {
1142 fprintf(output, "epsilon to %d\n", trans->to);
1143 return;
1144 }
1145 if (trans->atom->type == XML_REGEXP_CHARVAL)
1146 fprintf(output, "char %c ", trans->atom->codepoint);
1147 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
1148 }
1149
1150 static void
xmlRegPrintState(FILE * output,xmlRegStatePtr state)1151 xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
1152 int i;
1153
1154 fprintf(output, " state: ");
1155 if (state == NULL) {
1156 fprintf(output, "NULL\n");
1157 return;
1158 }
1159 if (state->type == XML_REGEXP_START_STATE)
1160 fprintf(output, "START ");
1161 if (state->type == XML_REGEXP_FINAL_STATE)
1162 fprintf(output, "FINAL ");
1163
1164 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
1165 for (i = 0;i < state->nbTrans; i++) {
1166 xmlRegPrintTrans(output, &(state->trans[i]));
1167 }
1168 }
1169
1170 #ifdef DEBUG_REGEXP_GRAPH
1171 static void
xmlRegPrintCtxt(FILE * output,xmlRegParserCtxtPtr ctxt)1172 xmlRegPrintCtxt(FILE *output, xmlRegParserCtxtPtr ctxt) {
1173 int i;
1174
1175 fprintf(output, " ctxt: ");
1176 if (ctxt == NULL) {
1177 fprintf(output, "NULL\n");
1178 return;
1179 }
1180 fprintf(output, "'%s' ", ctxt->string);
1181 if (ctxt->error)
1182 fprintf(output, "error ");
1183 if (ctxt->neg)
1184 fprintf(output, "neg ");
1185 fprintf(output, "\n");
1186 fprintf(output, "%d atoms:\n", ctxt->nbAtoms);
1187 for (i = 0;i < ctxt->nbAtoms; i++) {
1188 fprintf(output, " %02d ", i);
1189 xmlRegPrintAtom(output, ctxt->atoms[i]);
1190 }
1191 if (ctxt->atom != NULL) {
1192 fprintf(output, "current atom:\n");
1193 xmlRegPrintAtom(output, ctxt->atom);
1194 }
1195 fprintf(output, "%d states:", ctxt->nbStates);
1196 if (ctxt->start != NULL)
1197 fprintf(output, " start: %d", ctxt->start->no);
1198 if (ctxt->end != NULL)
1199 fprintf(output, " end: %d", ctxt->end->no);
1200 fprintf(output, "\n");
1201 for (i = 0;i < ctxt->nbStates; i++) {
1202 xmlRegPrintState(output, ctxt->states[i]);
1203 }
1204 fprintf(output, "%d counters:\n", ctxt->nbCounters);
1205 for (i = 0;i < ctxt->nbCounters; i++) {
1206 fprintf(output, " %d: min %d max %d\n", i, ctxt->counters[i].min,
1207 ctxt->counters[i].max);
1208 }
1209 }
1210 #endif
1211
1212 /************************************************************************
1213 * *
1214 * Finite Automata structures manipulations *
1215 * *
1216 ************************************************************************/
1217
1218 static void
xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt,xmlRegAtomPtr atom,int neg,xmlRegAtomType type,int start,int end,xmlChar * blockName)1219 xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
1220 int neg, xmlRegAtomType type, int start, int end,
1221 xmlChar *blockName) {
1222 xmlRegRangePtr range;
1223
1224 if (atom == NULL) {
1225 ERROR("add range: atom is NULL");
1226 return;
1227 }
1228 if (atom->type != XML_REGEXP_RANGES) {
1229 ERROR("add range: atom is not ranges");
1230 return;
1231 }
1232 if (atom->maxRanges == 0) {
1233 atom->maxRanges = 4;
1234 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges *
1235 sizeof(xmlRegRangePtr));
1236 if (atom->ranges == NULL) {
1237 xmlRegexpErrMemory(ctxt, "adding ranges");
1238 atom->maxRanges = 0;
1239 return;
1240 }
1241 } else if (atom->nbRanges >= atom->maxRanges) {
1242 xmlRegRangePtr *tmp;
1243 atom->maxRanges *= 2;
1244 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges *
1245 sizeof(xmlRegRangePtr));
1246 if (tmp == NULL) {
1247 xmlRegexpErrMemory(ctxt, "adding ranges");
1248 atom->maxRanges /= 2;
1249 return;
1250 }
1251 atom->ranges = tmp;
1252 }
1253 range = xmlRegNewRange(ctxt, neg, type, start, end);
1254 if (range == NULL)
1255 return;
1256 range->blockName = blockName;
1257 atom->ranges[atom->nbRanges++] = range;
1258
1259 }
1260
1261 static int
xmlRegGetCounter(xmlRegParserCtxtPtr ctxt)1262 xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
1263 if (ctxt->maxCounters == 0) {
1264 ctxt->maxCounters = 4;
1265 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters *
1266 sizeof(xmlRegCounter));
1267 if (ctxt->counters == NULL) {
1268 xmlRegexpErrMemory(ctxt, "allocating counter");
1269 ctxt->maxCounters = 0;
1270 return(-1);
1271 }
1272 } else if (ctxt->nbCounters >= ctxt->maxCounters) {
1273 xmlRegCounter *tmp;
1274 ctxt->maxCounters *= 2;
1275 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters *
1276 sizeof(xmlRegCounter));
1277 if (tmp == NULL) {
1278 xmlRegexpErrMemory(ctxt, "allocating counter");
1279 ctxt->maxCounters /= 2;
1280 return(-1);
1281 }
1282 ctxt->counters = tmp;
1283 }
1284 ctxt->counters[ctxt->nbCounters].min = -1;
1285 ctxt->counters[ctxt->nbCounters].max = -1;
1286 return(ctxt->nbCounters++);
1287 }
1288
1289 static int
xmlRegAtomPush(xmlRegParserCtxtPtr ctxt,xmlRegAtomPtr atom)1290 xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
1291 if (atom == NULL) {
1292 ERROR("atom push: atom is NULL");
1293 return(-1);
1294 }
1295 if (ctxt->maxAtoms == 0) {
1296 ctxt->maxAtoms = 4;
1297 ctxt->atoms = (xmlRegAtomPtr *) xmlMalloc(ctxt->maxAtoms *
1298 sizeof(xmlRegAtomPtr));
1299 if (ctxt->atoms == NULL) {
1300 xmlRegexpErrMemory(ctxt, "pushing atom");
1301 ctxt->maxAtoms = 0;
1302 return(-1);
1303 }
1304 } else if (ctxt->nbAtoms >= ctxt->maxAtoms) {
1305 xmlRegAtomPtr *tmp;
1306 ctxt->maxAtoms *= 2;
1307 tmp = (xmlRegAtomPtr *) xmlRealloc(ctxt->atoms, ctxt->maxAtoms *
1308 sizeof(xmlRegAtomPtr));
1309 if (tmp == NULL) {
1310 xmlRegexpErrMemory(ctxt, "allocating counter");
1311 ctxt->maxAtoms /= 2;
1312 return(-1);
1313 }
1314 ctxt->atoms = tmp;
1315 }
1316 atom->no = ctxt->nbAtoms;
1317 ctxt->atoms[ctxt->nbAtoms++] = atom;
1318 return(0);
1319 }
1320
1321 static void
xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr target,int from)1322 xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target,
1323 int from) {
1324 if (target->maxTransTo == 0) {
1325 target->maxTransTo = 8;
1326 target->transTo = (int *) xmlMalloc(target->maxTransTo *
1327 sizeof(int));
1328 if (target->transTo == NULL) {
1329 xmlRegexpErrMemory(ctxt, "adding transition");
1330 target->maxTransTo = 0;
1331 return;
1332 }
1333 } else if (target->nbTransTo >= target->maxTransTo) {
1334 int *tmp;
1335 target->maxTransTo *= 2;
1336 tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo *
1337 sizeof(int));
1338 if (tmp == NULL) {
1339 xmlRegexpErrMemory(ctxt, "adding transition");
1340 target->maxTransTo /= 2;
1341 return;
1342 }
1343 target->transTo = tmp;
1344 }
1345 target->transTo[target->nbTransTo] = from;
1346 target->nbTransTo++;
1347 }
1348
1349 static void
xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr state,xmlRegAtomPtr atom,xmlRegStatePtr target,int counter,int count)1350 xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1351 xmlRegAtomPtr atom, xmlRegStatePtr target,
1352 int counter, int count) {
1353
1354 int nrtrans;
1355
1356 if (state == NULL) {
1357 ERROR("add state: state is NULL");
1358 return;
1359 }
1360 if (target == NULL) {
1361 ERROR("add state: target is NULL");
1362 return;
1363 }
1364 /*
1365 * Other routines follow the philosophy 'When in doubt, add a transition'
1366 * so we check here whether such a transition is already present and, if
1367 * so, silently ignore this request.
1368 */
1369
1370 for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) {
1371 xmlRegTransPtr trans = &(state->trans[nrtrans]);
1372 if ((trans->atom == atom) &&
1373 (trans->to == target->no) &&
1374 (trans->counter == counter) &&
1375 (trans->count == count)) {
1376 #ifdef DEBUG_REGEXP_GRAPH
1377 printf("Ignoring duplicate transition from %d to %d\n",
1378 state->no, target->no);
1379 #endif
1380 return;
1381 }
1382 }
1383
1384 if (state->maxTrans == 0) {
1385 state->maxTrans = 8;
1386 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
1387 sizeof(xmlRegTrans));
1388 if (state->trans == NULL) {
1389 xmlRegexpErrMemory(ctxt, "adding transition");
1390 state->maxTrans = 0;
1391 return;
1392 }
1393 } else if (state->nbTrans >= state->maxTrans) {
1394 xmlRegTrans *tmp;
1395 state->maxTrans *= 2;
1396 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans *
1397 sizeof(xmlRegTrans));
1398 if (tmp == NULL) {
1399 xmlRegexpErrMemory(ctxt, "adding transition");
1400 state->maxTrans /= 2;
1401 return;
1402 }
1403 state->trans = tmp;
1404 }
1405 #ifdef DEBUG_REGEXP_GRAPH
1406 printf("Add trans from %d to %d ", state->no, target->no);
1407 if (count == REGEXP_ALL_COUNTER)
1408 printf("all transition\n");
1409 else if (count >= 0)
1410 printf("count based %d\n", count);
1411 else if (counter >= 0)
1412 printf("counted %d\n", counter);
1413 else if (atom == NULL)
1414 printf("epsilon transition\n");
1415 else if (atom != NULL)
1416 xmlRegPrintAtom(stdout, atom);
1417 #endif
1418
1419 state->trans[state->nbTrans].atom = atom;
1420 state->trans[state->nbTrans].to = target->no;
1421 state->trans[state->nbTrans].counter = counter;
1422 state->trans[state->nbTrans].count = count;
1423 state->trans[state->nbTrans].nd = 0;
1424 state->nbTrans++;
1425 xmlRegStateAddTransTo(ctxt, target, state->no);
1426 }
1427
1428 static int
xmlRegStatePush(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr state)1429 xmlRegStatePush(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
1430 if (state == NULL) return(-1);
1431 if (ctxt->maxStates == 0) {
1432 ctxt->maxStates = 4;
1433 ctxt->states = (xmlRegStatePtr *) xmlMalloc(ctxt->maxStates *
1434 sizeof(xmlRegStatePtr));
1435 if (ctxt->states == NULL) {
1436 xmlRegexpErrMemory(ctxt, "adding state");
1437 ctxt->maxStates = 0;
1438 return(-1);
1439 }
1440 } else if (ctxt->nbStates >= ctxt->maxStates) {
1441 xmlRegStatePtr *tmp;
1442 ctxt->maxStates *= 2;
1443 tmp = (xmlRegStatePtr *) xmlRealloc(ctxt->states, ctxt->maxStates *
1444 sizeof(xmlRegStatePtr));
1445 if (tmp == NULL) {
1446 xmlRegexpErrMemory(ctxt, "adding state");
1447 ctxt->maxStates /= 2;
1448 return(-1);
1449 }
1450 ctxt->states = tmp;
1451 }
1452 state->no = ctxt->nbStates;
1453 ctxt->states[ctxt->nbStates++] = state;
1454 return(0);
1455 }
1456
1457 /**
1458 * xmlFAGenerateAllTransition:
1459 * @ctxt: a regexp parser context
1460 * @from: the from state
1461 * @to: the target state or NULL for building a new one
1462 * @lax:
1463 *
1464 */
1465 static void
xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,int lax)1466 xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
1467 xmlRegStatePtr from, xmlRegStatePtr to,
1468 int lax) {
1469 if (to == NULL) {
1470 to = xmlRegNewState(ctxt);
1471 xmlRegStatePush(ctxt, to);
1472 ctxt->state = to;
1473 }
1474 if (lax)
1475 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
1476 else
1477 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
1478 }
1479
1480 /**
1481 * xmlFAGenerateEpsilonTransition:
1482 * @ctxt: a regexp parser context
1483 * @from: the from state
1484 * @to: the target state or NULL for building a new one
1485 *
1486 */
1487 static void
xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to)1488 xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1489 xmlRegStatePtr from, xmlRegStatePtr to) {
1490 if (to == NULL) {
1491 to = xmlRegNewState(ctxt);
1492 xmlRegStatePush(ctxt, to);
1493 ctxt->state = to;
1494 }
1495 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
1496 }
1497
1498 /**
1499 * xmlFAGenerateCountedEpsilonTransition:
1500 * @ctxt: a regexp parser context
1501 * @from: the from state
1502 * @to: the target state or NULL for building a new one
1503 * counter: the counter for that transition
1504 *
1505 */
1506 static void
xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,int counter)1507 xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1508 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1509 if (to == NULL) {
1510 to = xmlRegNewState(ctxt);
1511 xmlRegStatePush(ctxt, to);
1512 ctxt->state = to;
1513 }
1514 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
1515 }
1516
1517 /**
1518 * xmlFAGenerateCountedTransition:
1519 * @ctxt: a regexp parser context
1520 * @from: the from state
1521 * @to: the target state or NULL for building a new one
1522 * counter: the counter for that transition
1523 *
1524 */
1525 static void
xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,int counter)1526 xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
1527 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1528 if (to == NULL) {
1529 to = xmlRegNewState(ctxt);
1530 xmlRegStatePush(ctxt, to);
1531 ctxt->state = to;
1532 }
1533 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
1534 }
1535
1536 /**
1537 * xmlFAGenerateTransitions:
1538 * @ctxt: a regexp parser context
1539 * @from: the from state
1540 * @to: the target state or NULL for building a new one
1541 * @atom: the atom generating the transition
1542 *
1543 * Returns 0 if success and -1 in case of error.
1544 */
1545 static int
xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr from,xmlRegStatePtr to,xmlRegAtomPtr atom)1546 xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
1547 xmlRegStatePtr to, xmlRegAtomPtr atom) {
1548 xmlRegStatePtr end;
1549 int nullable = 0;
1550
1551 if (atom == NULL) {
1552 ERROR("generate transition: atom == NULL");
1553 return(-1);
1554 }
1555 if (atom->type == XML_REGEXP_SUBREG) {
1556 /*
1557 * this is a subexpression handling one should not need to
1558 * create a new node except for XML_REGEXP_QUANT_RANGE.
1559 */
1560 if (xmlRegAtomPush(ctxt, atom) < 0) {
1561 return(-1);
1562 }
1563 if ((to != NULL) && (atom->stop != to) &&
1564 (atom->quant != XML_REGEXP_QUANT_RANGE)) {
1565 /*
1566 * Generate an epsilon transition to link to the target
1567 */
1568 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1569 #ifdef DV
1570 } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) &&
1571 (atom->quant != XML_REGEXP_QUANT_ONCE)) {
1572 to = xmlRegNewState(ctxt);
1573 xmlRegStatePush(ctxt, to);
1574 ctxt->state = to;
1575 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1576 #endif
1577 }
1578 switch (atom->quant) {
1579 case XML_REGEXP_QUANT_OPT:
1580 atom->quant = XML_REGEXP_QUANT_ONCE;
1581 /*
1582 * transition done to the state after end of atom.
1583 * 1. set transition from atom start to new state
1584 * 2. set transition from atom end to this state.
1585 */
1586 if (to == NULL) {
1587 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0);
1588 xmlFAGenerateEpsilonTransition(ctxt, atom->stop,
1589 ctxt->state);
1590 } else {
1591 xmlFAGenerateEpsilonTransition(ctxt, atom->start, to);
1592 }
1593 break;
1594 case XML_REGEXP_QUANT_MULT:
1595 atom->quant = XML_REGEXP_QUANT_ONCE;
1596 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1597 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1598 break;
1599 case XML_REGEXP_QUANT_PLUS:
1600 atom->quant = XML_REGEXP_QUANT_ONCE;
1601 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1602 break;
1603 case XML_REGEXP_QUANT_RANGE: {
1604 int counter;
1605 xmlRegStatePtr inter, newstate;
1606
1607 /*
1608 * create the final state now if needed
1609 */
1610 if (to != NULL) {
1611 newstate = to;
1612 } else {
1613 newstate = xmlRegNewState(ctxt);
1614 xmlRegStatePush(ctxt, newstate);
1615 }
1616
1617 /*
1618 * The principle here is to use counted transition
1619 * to avoid explosion in the number of states in the
1620 * graph. This is clearly more complex but should not
1621 * be exploitable at runtime.
1622 */
1623 if ((atom->min == 0) && (atom->start0 == NULL)) {
1624 xmlRegAtomPtr copy;
1625 /*
1626 * duplicate a transition based on atom to count next
1627 * occurrences after 1. We cannot loop to atom->start
1628 * directly because we need an epsilon transition to
1629 * newstate.
1630 */
1631 /* ???? For some reason it seems we never reach that
1632 case, I suppose this got optimized out before when
1633 building the automata */
1634 copy = xmlRegCopyAtom(ctxt, atom);
1635 if (copy == NULL)
1636 return(-1);
1637 copy->quant = XML_REGEXP_QUANT_ONCE;
1638 copy->min = 0;
1639 copy->max = 0;
1640
1641 if (xmlFAGenerateTransitions(ctxt, atom->start, NULL, copy)
1642 < 0)
1643 return(-1);
1644 inter = ctxt->state;
1645 counter = xmlRegGetCounter(ctxt);
1646 ctxt->counters[counter].min = atom->min - 1;
1647 ctxt->counters[counter].max = atom->max - 1;
1648 /* count the number of times we see it again */
1649 xmlFAGenerateCountedEpsilonTransition(ctxt, inter,
1650 atom->stop, counter);
1651 /* allow a way out based on the count */
1652 xmlFAGenerateCountedTransition(ctxt, inter,
1653 newstate, counter);
1654 /* and also allow a direct exit for 0 */
1655 xmlFAGenerateEpsilonTransition(ctxt, atom->start,
1656 newstate);
1657 } else {
1658 /*
1659 * either we need the atom at least once or there
1660 * is an atom->start0 allowing to easily plug the
1661 * epsilon transition.
1662 */
1663 counter = xmlRegGetCounter(ctxt);
1664 ctxt->counters[counter].min = atom->min - 1;
1665 ctxt->counters[counter].max = atom->max - 1;
1666 /* count the number of times we see it again */
1667 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
1668 atom->start, counter);
1669 /* allow a way out based on the count */
1670 xmlFAGenerateCountedTransition(ctxt, atom->stop,
1671 newstate, counter);
1672 /* and if needed allow a direct exit for 0 */
1673 if (atom->min == 0)
1674 xmlFAGenerateEpsilonTransition(ctxt, atom->start0,
1675 newstate);
1676
1677 }
1678 atom->min = 0;
1679 atom->max = 0;
1680 atom->quant = XML_REGEXP_QUANT_ONCE;
1681 ctxt->state = newstate;
1682 }
1683 default:
1684 break;
1685 }
1686 return(0);
1687 }
1688 if ((atom->min == 0) && (atom->max == 0) &&
1689 (atom->quant == XML_REGEXP_QUANT_RANGE)) {
1690 /*
1691 * we can discard the atom and generate an epsilon transition instead
1692 */
1693 if (to == NULL) {
1694 to = xmlRegNewState(ctxt);
1695 if (to != NULL)
1696 xmlRegStatePush(ctxt, to);
1697 else {
1698 return(-1);
1699 }
1700 }
1701 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1702 ctxt->state = to;
1703 xmlRegFreeAtom(atom);
1704 return(0);
1705 }
1706 if (to == NULL) {
1707 to = xmlRegNewState(ctxt);
1708 if (to != NULL)
1709 xmlRegStatePush(ctxt, to);
1710 else {
1711 return(-1);
1712 }
1713 }
1714 end = to;
1715 if ((atom->quant == XML_REGEXP_QUANT_MULT) ||
1716 (atom->quant == XML_REGEXP_QUANT_PLUS)) {
1717 /*
1718 * Do not pollute the target state by adding transitions from
1719 * it as it is likely to be the shared target of multiple branches.
1720 * So isolate with an epsilon transition.
1721 */
1722 xmlRegStatePtr tmp;
1723
1724 tmp = xmlRegNewState(ctxt);
1725 if (tmp != NULL)
1726 xmlRegStatePush(ctxt, tmp);
1727 else {
1728 return(-1);
1729 }
1730 xmlFAGenerateEpsilonTransition(ctxt, tmp, to);
1731 to = tmp;
1732 }
1733 if (xmlRegAtomPush(ctxt, atom) < 0) {
1734 return(-1);
1735 }
1736 if ((atom->quant == XML_REGEXP_QUANT_RANGE) &&
1737 (atom->min == 0) && (atom->max > 0)) {
1738 nullable = 1;
1739 atom->min = 1;
1740 if (atom->max == 1)
1741 atom->quant = XML_REGEXP_QUANT_OPT;
1742 }
1743 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
1744 ctxt->state = end;
1745 switch (atom->quant) {
1746 case XML_REGEXP_QUANT_OPT:
1747 atom->quant = XML_REGEXP_QUANT_ONCE;
1748 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1749 break;
1750 case XML_REGEXP_QUANT_MULT:
1751 atom->quant = XML_REGEXP_QUANT_ONCE;
1752 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1753 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1754 break;
1755 case XML_REGEXP_QUANT_PLUS:
1756 atom->quant = XML_REGEXP_QUANT_ONCE;
1757 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1758 break;
1759 case XML_REGEXP_QUANT_RANGE:
1760 if (nullable)
1761 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1762 break;
1763 default:
1764 break;
1765 }
1766 return(0);
1767 }
1768
1769 /**
1770 * xmlFAReduceEpsilonTransitions:
1771 * @ctxt: a regexp parser context
1772 * @fromnr: the from state
1773 * @tonr: the to state
1774 * @counter: should that transition be associated to a counted
1775 *
1776 */
1777 static void
xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt,int fromnr,int tonr,int counter)1778 xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
1779 int tonr, int counter) {
1780 int transnr;
1781 xmlRegStatePtr from;
1782 xmlRegStatePtr to;
1783
1784 #ifdef DEBUG_REGEXP_GRAPH
1785 printf("xmlFAReduceEpsilonTransitions(%d, %d)\n", fromnr, tonr);
1786 #endif
1787 from = ctxt->states[fromnr];
1788 if (from == NULL)
1789 return;
1790 to = ctxt->states[tonr];
1791 if (to == NULL)
1792 return;
1793 if ((to->mark == XML_REGEXP_MARK_START) ||
1794 (to->mark == XML_REGEXP_MARK_VISITED))
1795 return;
1796
1797 to->mark = XML_REGEXP_MARK_VISITED;
1798 if (to->type == XML_REGEXP_FINAL_STATE) {
1799 #ifdef DEBUG_REGEXP_GRAPH
1800 printf("State %d is final, so %d becomes final\n", tonr, fromnr);
1801 #endif
1802 from->type = XML_REGEXP_FINAL_STATE;
1803 }
1804 for (transnr = 0;transnr < to->nbTrans;transnr++) {
1805 if (to->trans[transnr].to < 0)
1806 continue;
1807 if (to->trans[transnr].atom == NULL) {
1808 /*
1809 * Don't remove counted transitions
1810 * Don't loop either
1811 */
1812 if (to->trans[transnr].to != fromnr) {
1813 if (to->trans[transnr].count >= 0) {
1814 int newto = to->trans[transnr].to;
1815
1816 xmlRegStateAddTrans(ctxt, from, NULL,
1817 ctxt->states[newto],
1818 -1, to->trans[transnr].count);
1819 } else {
1820 #ifdef DEBUG_REGEXP_GRAPH
1821 printf("Found epsilon trans %d from %d to %d\n",
1822 transnr, tonr, to->trans[transnr].to);
1823 #endif
1824 if (to->trans[transnr].counter >= 0) {
1825 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1826 to->trans[transnr].to,
1827 to->trans[transnr].counter);
1828 } else {
1829 xmlFAReduceEpsilonTransitions(ctxt, fromnr,
1830 to->trans[transnr].to,
1831 counter);
1832 }
1833 }
1834 }
1835 } else {
1836 int newto = to->trans[transnr].to;
1837
1838 if (to->trans[transnr].counter >= 0) {
1839 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1840 ctxt->states[newto],
1841 to->trans[transnr].counter, -1);
1842 } else {
1843 xmlRegStateAddTrans(ctxt, from, to->trans[transnr].atom,
1844 ctxt->states[newto], counter, -1);
1845 }
1846 }
1847 }
1848 to->mark = XML_REGEXP_MARK_NORMAL;
1849 }
1850
1851 /**
1852 * xmlFAEliminateSimpleEpsilonTransitions:
1853 * @ctxt: a regexp parser context
1854 *
1855 * Eliminating general epsilon transitions can get costly in the general
1856 * algorithm due to the large amount of generated new transitions and
1857 * associated comparisons. However for simple epsilon transition used just
1858 * to separate building blocks when generating the automata this can be
1859 * reduced to state elimination:
1860 * - if there exists an epsilon from X to Y
1861 * - if there is no other transition from X
1862 * then X and Y are semantically equivalent and X can be eliminated
1863 * If X is the start state then make Y the start state, else replace the
1864 * target of all transitions to X by transitions to Y.
1865 */
1866 static void
xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt)1867 xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1868 int statenr, i, j, newto;
1869 xmlRegStatePtr state, tmp;
1870
1871 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1872 state = ctxt->states[statenr];
1873 if (state == NULL)
1874 continue;
1875 if (state->nbTrans != 1)
1876 continue;
1877 if (state->type == XML_REGEXP_UNREACH_STATE)
1878 continue;
1879 /* is the only transition out a basic transition */
1880 if ((state->trans[0].atom == NULL) &&
1881 (state->trans[0].to >= 0) &&
1882 (state->trans[0].to != statenr) &&
1883 (state->trans[0].counter < 0) &&
1884 (state->trans[0].count < 0)) {
1885 newto = state->trans[0].to;
1886
1887 if (state->type == XML_REGEXP_START_STATE) {
1888 #ifdef DEBUG_REGEXP_GRAPH
1889 printf("Found simple epsilon trans from start %d to %d\n",
1890 statenr, newto);
1891 #endif
1892 } else {
1893 #ifdef DEBUG_REGEXP_GRAPH
1894 printf("Found simple epsilon trans from %d to %d\n",
1895 statenr, newto);
1896 #endif
1897 for (i = 0;i < state->nbTransTo;i++) {
1898 tmp = ctxt->states[state->transTo[i]];
1899 for (j = 0;j < tmp->nbTrans;j++) {
1900 if (tmp->trans[j].to == statenr) {
1901 #ifdef DEBUG_REGEXP_GRAPH
1902 printf("Changed transition %d on %d to go to %d\n",
1903 j, tmp->no, newto);
1904 #endif
1905 tmp->trans[j].to = -1;
1906 xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom,
1907 ctxt->states[newto],
1908 tmp->trans[j].counter,
1909 tmp->trans[j].count);
1910 }
1911 }
1912 }
1913 if (state->type == XML_REGEXP_FINAL_STATE)
1914 ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE;
1915 /* eliminate the transition completely */
1916 state->nbTrans = 0;
1917
1918 state->type = XML_REGEXP_UNREACH_STATE;
1919
1920 }
1921
1922 }
1923 }
1924 }
1925 /**
1926 * xmlFAEliminateEpsilonTransitions:
1927 * @ctxt: a regexp parser context
1928 *
1929 */
1930 static void
xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt)1931 xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1932 int statenr, transnr;
1933 xmlRegStatePtr state;
1934 int has_epsilon;
1935
1936 if (ctxt->states == NULL) return;
1937
1938 /*
1939 * Eliminate simple epsilon transition and the associated unreachable
1940 * states.
1941 */
1942 xmlFAEliminateSimpleEpsilonTransitions(ctxt);
1943 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1944 state = ctxt->states[statenr];
1945 if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) {
1946 #ifdef DEBUG_REGEXP_GRAPH
1947 printf("Removed unreachable state %d\n", statenr);
1948 #endif
1949 xmlRegFreeState(state);
1950 ctxt->states[statenr] = NULL;
1951 }
1952 }
1953
1954 has_epsilon = 0;
1955
1956 /*
1957 * Build the completed transitions bypassing the epsilons
1958 * Use a marking algorithm to avoid loops
1959 * Mark sink states too.
1960 * Process from the latest states backward to the start when
1961 * there is long cascading epsilon chains this minimize the
1962 * recursions and transition compares when adding the new ones
1963 */
1964 for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) {
1965 state = ctxt->states[statenr];
1966 if (state == NULL)
1967 continue;
1968 if ((state->nbTrans == 0) &&
1969 (state->type != XML_REGEXP_FINAL_STATE)) {
1970 state->type = XML_REGEXP_SINK_STATE;
1971 }
1972 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1973 if ((state->trans[transnr].atom == NULL) &&
1974 (state->trans[transnr].to >= 0)) {
1975 if (state->trans[transnr].to == statenr) {
1976 state->trans[transnr].to = -1;
1977 #ifdef DEBUG_REGEXP_GRAPH
1978 printf("Removed loopback epsilon trans %d on %d\n",
1979 transnr, statenr);
1980 #endif
1981 } else if (state->trans[transnr].count < 0) {
1982 int newto = state->trans[transnr].to;
1983
1984 #ifdef DEBUG_REGEXP_GRAPH
1985 printf("Found epsilon trans %d from %d to %d\n",
1986 transnr, statenr, newto);
1987 #endif
1988 has_epsilon = 1;
1989 state->trans[transnr].to = -2;
1990 state->mark = XML_REGEXP_MARK_START;
1991 xmlFAReduceEpsilonTransitions(ctxt, statenr,
1992 newto, state->trans[transnr].counter);
1993 state->mark = XML_REGEXP_MARK_NORMAL;
1994 #ifdef DEBUG_REGEXP_GRAPH
1995 } else {
1996 printf("Found counted transition %d on %d\n",
1997 transnr, statenr);
1998 #endif
1999 }
2000 }
2001 }
2002 }
2003 /*
2004 * Eliminate the epsilon transitions
2005 */
2006 if (has_epsilon) {
2007 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2008 state = ctxt->states[statenr];
2009 if (state == NULL)
2010 continue;
2011 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2012 xmlRegTransPtr trans = &(state->trans[transnr]);
2013 if ((trans->atom == NULL) &&
2014 (trans->count < 0) &&
2015 (trans->to >= 0)) {
2016 trans->to = -1;
2017 }
2018 }
2019 }
2020 }
2021
2022 /*
2023 * Use this pass to detect unreachable states too
2024 */
2025 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2026 state = ctxt->states[statenr];
2027 if (state != NULL)
2028 state->reached = XML_REGEXP_MARK_NORMAL;
2029 }
2030 state = ctxt->states[0];
2031 if (state != NULL)
2032 state->reached = XML_REGEXP_MARK_START;
2033 while (state != NULL) {
2034 xmlRegStatePtr target = NULL;
2035 state->reached = XML_REGEXP_MARK_VISITED;
2036 /*
2037 * Mark all states reachable from the current reachable state
2038 */
2039 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2040 if ((state->trans[transnr].to >= 0) &&
2041 ((state->trans[transnr].atom != NULL) ||
2042 (state->trans[transnr].count >= 0))) {
2043 int newto = state->trans[transnr].to;
2044
2045 if (ctxt->states[newto] == NULL)
2046 continue;
2047 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) {
2048 ctxt->states[newto]->reached = XML_REGEXP_MARK_START;
2049 target = ctxt->states[newto];
2050 }
2051 }
2052 }
2053
2054 /*
2055 * find the next accessible state not explored
2056 */
2057 if (target == NULL) {
2058 for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
2059 state = ctxt->states[statenr];
2060 if ((state != NULL) && (state->reached ==
2061 XML_REGEXP_MARK_START)) {
2062 target = state;
2063 break;
2064 }
2065 }
2066 }
2067 state = target;
2068 }
2069 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2070 state = ctxt->states[statenr];
2071 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) {
2072 #ifdef DEBUG_REGEXP_GRAPH
2073 printf("Removed unreachable state %d\n", statenr);
2074 #endif
2075 xmlRegFreeState(state);
2076 ctxt->states[statenr] = NULL;
2077 }
2078 }
2079
2080 }
2081
2082 static int
xmlFACompareRanges(xmlRegRangePtr range1,xmlRegRangePtr range2)2083 xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) {
2084 int ret = 0;
2085
2086 if ((range1->type == XML_REGEXP_RANGES) ||
2087 (range2->type == XML_REGEXP_RANGES) ||
2088 (range2->type == XML_REGEXP_SUBREG) ||
2089 (range1->type == XML_REGEXP_SUBREG) ||
2090 (range1->type == XML_REGEXP_STRING) ||
2091 (range2->type == XML_REGEXP_STRING))
2092 return(-1);
2093
2094 /* put them in order */
2095 if (range1->type > range2->type) {
2096 xmlRegRangePtr tmp;
2097
2098 tmp = range1;
2099 range1 = range2;
2100 range2 = tmp;
2101 }
2102 if ((range1->type == XML_REGEXP_ANYCHAR) ||
2103 (range2->type == XML_REGEXP_ANYCHAR)) {
2104 ret = 1;
2105 } else if ((range1->type == XML_REGEXP_EPSILON) ||
2106 (range2->type == XML_REGEXP_EPSILON)) {
2107 return(0);
2108 } else if (range1->type == range2->type) {
2109 if (range1->type != XML_REGEXP_CHARVAL)
2110 ret = 1;
2111 else if ((range1->end < range2->start) ||
2112 (range2->end < range1->start))
2113 ret = 0;
2114 else
2115 ret = 1;
2116 } else if (range1->type == XML_REGEXP_CHARVAL) {
2117 int codepoint;
2118 int neg = 0;
2119
2120 /*
2121 * just check all codepoints in the range for acceptance,
2122 * this is usually way cheaper since done only once at
2123 * compilation than testing over and over at runtime or
2124 * pushing too many states when evaluating.
2125 */
2126 if (((range1->neg == 0) && (range2->neg != 0)) ||
2127 ((range1->neg != 0) && (range2->neg == 0)))
2128 neg = 1;
2129
2130 for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) {
2131 ret = xmlRegCheckCharacterRange(range2->type, codepoint,
2132 0, range2->start, range2->end,
2133 range2->blockName);
2134 if (ret < 0)
2135 return(-1);
2136 if (((neg == 1) && (ret == 0)) ||
2137 ((neg == 0) && (ret == 1)))
2138 return(1);
2139 }
2140 return(0);
2141 } else if ((range1->type == XML_REGEXP_BLOCK_NAME) ||
2142 (range2->type == XML_REGEXP_BLOCK_NAME)) {
2143 if (range1->type == range2->type) {
2144 ret = xmlStrEqual(range1->blockName, range2->blockName);
2145 } else {
2146 /*
2147 * comparing a block range with anything else is way
2148 * too costly, and maintaining the table is like too much
2149 * memory too, so let's force the automata to save state
2150 * here.
2151 */
2152 return(1);
2153 }
2154 } else if ((range1->type < XML_REGEXP_LETTER) ||
2155 (range2->type < XML_REGEXP_LETTER)) {
2156 if ((range1->type == XML_REGEXP_ANYSPACE) &&
2157 (range2->type == XML_REGEXP_NOTSPACE))
2158 ret = 0;
2159 else if ((range1->type == XML_REGEXP_INITNAME) &&
2160 (range2->type == XML_REGEXP_NOTINITNAME))
2161 ret = 0;
2162 else if ((range1->type == XML_REGEXP_NAMECHAR) &&
2163 (range2->type == XML_REGEXP_NOTNAMECHAR))
2164 ret = 0;
2165 else if ((range1->type == XML_REGEXP_DECIMAL) &&
2166 (range2->type == XML_REGEXP_NOTDECIMAL))
2167 ret = 0;
2168 else if ((range1->type == XML_REGEXP_REALCHAR) &&
2169 (range2->type == XML_REGEXP_NOTREALCHAR))
2170 ret = 0;
2171 else {
2172 /* same thing to limit complexity */
2173 return(1);
2174 }
2175 } else {
2176 ret = 0;
2177 /* range1->type < range2->type here */
2178 switch (range1->type) {
2179 case XML_REGEXP_LETTER:
2180 /* all disjoint except in the subgroups */
2181 if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) ||
2182 (range2->type == XML_REGEXP_LETTER_LOWERCASE) ||
2183 (range2->type == XML_REGEXP_LETTER_TITLECASE) ||
2184 (range2->type == XML_REGEXP_LETTER_MODIFIER) ||
2185 (range2->type == XML_REGEXP_LETTER_OTHERS))
2186 ret = 1;
2187 break;
2188 case XML_REGEXP_MARK:
2189 if ((range2->type == XML_REGEXP_MARK_NONSPACING) ||
2190 (range2->type == XML_REGEXP_MARK_SPACECOMBINING) ||
2191 (range2->type == XML_REGEXP_MARK_ENCLOSING))
2192 ret = 1;
2193 break;
2194 case XML_REGEXP_NUMBER:
2195 if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) ||
2196 (range2->type == XML_REGEXP_NUMBER_LETTER) ||
2197 (range2->type == XML_REGEXP_NUMBER_OTHERS))
2198 ret = 1;
2199 break;
2200 case XML_REGEXP_PUNCT:
2201 if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) ||
2202 (range2->type == XML_REGEXP_PUNCT_DASH) ||
2203 (range2->type == XML_REGEXP_PUNCT_OPEN) ||
2204 (range2->type == XML_REGEXP_PUNCT_CLOSE) ||
2205 (range2->type == XML_REGEXP_PUNCT_INITQUOTE) ||
2206 (range2->type == XML_REGEXP_PUNCT_FINQUOTE) ||
2207 (range2->type == XML_REGEXP_PUNCT_OTHERS))
2208 ret = 1;
2209 break;
2210 case XML_REGEXP_SEPAR:
2211 if ((range2->type == XML_REGEXP_SEPAR_SPACE) ||
2212 (range2->type == XML_REGEXP_SEPAR_LINE) ||
2213 (range2->type == XML_REGEXP_SEPAR_PARA))
2214 ret = 1;
2215 break;
2216 case XML_REGEXP_SYMBOL:
2217 if ((range2->type == XML_REGEXP_SYMBOL_MATH) ||
2218 (range2->type == XML_REGEXP_SYMBOL_CURRENCY) ||
2219 (range2->type == XML_REGEXP_SYMBOL_MODIFIER) ||
2220 (range2->type == XML_REGEXP_SYMBOL_OTHERS))
2221 ret = 1;
2222 break;
2223 case XML_REGEXP_OTHER:
2224 if ((range2->type == XML_REGEXP_OTHER_CONTROL) ||
2225 (range2->type == XML_REGEXP_OTHER_FORMAT) ||
2226 (range2->type == XML_REGEXP_OTHER_PRIVATE))
2227 ret = 1;
2228 break;
2229 default:
2230 if ((range2->type >= XML_REGEXP_LETTER) &&
2231 (range2->type < XML_REGEXP_BLOCK_NAME))
2232 ret = 0;
2233 else {
2234 /* safety net ! */
2235 return(1);
2236 }
2237 }
2238 }
2239 if (((range1->neg == 0) && (range2->neg != 0)) ||
2240 ((range1->neg != 0) && (range2->neg == 0)))
2241 ret = !ret;
2242 return(ret);
2243 }
2244
2245 /**
2246 * xmlFACompareAtomTypes:
2247 * @type1: an atom type
2248 * @type2: an atom type
2249 *
2250 * Compares two atoms type to check whether they intersect in some ways,
2251 * this is used by xmlFACompareAtoms only
2252 *
2253 * Returns 1 if they may intersect and 0 otherwise
2254 */
2255 static int
xmlFACompareAtomTypes(xmlRegAtomType type1,xmlRegAtomType type2)2256 xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) {
2257 if ((type1 == XML_REGEXP_EPSILON) ||
2258 (type1 == XML_REGEXP_CHARVAL) ||
2259 (type1 == XML_REGEXP_RANGES) ||
2260 (type1 == XML_REGEXP_SUBREG) ||
2261 (type1 == XML_REGEXP_STRING) ||
2262 (type1 == XML_REGEXP_ANYCHAR))
2263 return(1);
2264 if ((type2 == XML_REGEXP_EPSILON) ||
2265 (type2 == XML_REGEXP_CHARVAL) ||
2266 (type2 == XML_REGEXP_RANGES) ||
2267 (type2 == XML_REGEXP_SUBREG) ||
2268 (type2 == XML_REGEXP_STRING) ||
2269 (type2 == XML_REGEXP_ANYCHAR))
2270 return(1);
2271
2272 if (type1 == type2) return(1);
2273
2274 /* simplify subsequent compares by making sure type1 < type2 */
2275 if (type1 > type2) {
2276 xmlRegAtomType tmp = type1;
2277 type1 = type2;
2278 type2 = tmp;
2279 }
2280 switch (type1) {
2281 case XML_REGEXP_ANYSPACE: /* \s */
2282 /* can't be a letter, number, mark, punctuation, symbol */
2283 if ((type2 == XML_REGEXP_NOTSPACE) ||
2284 ((type2 >= XML_REGEXP_LETTER) &&
2285 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2286 ((type2 >= XML_REGEXP_NUMBER) &&
2287 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2288 ((type2 >= XML_REGEXP_MARK) &&
2289 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2290 ((type2 >= XML_REGEXP_PUNCT) &&
2291 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2292 ((type2 >= XML_REGEXP_SYMBOL) &&
2293 (type2 <= XML_REGEXP_SYMBOL_OTHERS))
2294 ) return(0);
2295 break;
2296 case XML_REGEXP_NOTSPACE: /* \S */
2297 break;
2298 case XML_REGEXP_INITNAME: /* \l */
2299 /* can't be a number, mark, separator, punctuation, symbol or other */
2300 if ((type2 == XML_REGEXP_NOTINITNAME) ||
2301 ((type2 >= XML_REGEXP_NUMBER) &&
2302 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2303 ((type2 >= XML_REGEXP_MARK) &&
2304 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2305 ((type2 >= XML_REGEXP_SEPAR) &&
2306 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2307 ((type2 >= XML_REGEXP_PUNCT) &&
2308 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2309 ((type2 >= XML_REGEXP_SYMBOL) &&
2310 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2311 ((type2 >= XML_REGEXP_OTHER) &&
2312 (type2 <= XML_REGEXP_OTHER_NA))
2313 ) return(0);
2314 break;
2315 case XML_REGEXP_NOTINITNAME: /* \L */
2316 break;
2317 case XML_REGEXP_NAMECHAR: /* \c */
2318 /* can't be a mark, separator, punctuation, symbol or other */
2319 if ((type2 == XML_REGEXP_NOTNAMECHAR) ||
2320 ((type2 >= XML_REGEXP_MARK) &&
2321 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2322 ((type2 >= XML_REGEXP_PUNCT) &&
2323 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2324 ((type2 >= XML_REGEXP_SEPAR) &&
2325 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2326 ((type2 >= XML_REGEXP_SYMBOL) &&
2327 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2328 ((type2 >= XML_REGEXP_OTHER) &&
2329 (type2 <= XML_REGEXP_OTHER_NA))
2330 ) return(0);
2331 break;
2332 case XML_REGEXP_NOTNAMECHAR: /* \C */
2333 break;
2334 case XML_REGEXP_DECIMAL: /* \d */
2335 /* can't be a letter, mark, separator, punctuation, symbol or other */
2336 if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2337 (type2 == XML_REGEXP_REALCHAR) ||
2338 ((type2 >= XML_REGEXP_LETTER) &&
2339 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2340 ((type2 >= XML_REGEXP_MARK) &&
2341 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2342 ((type2 >= XML_REGEXP_PUNCT) &&
2343 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2344 ((type2 >= XML_REGEXP_SEPAR) &&
2345 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2346 ((type2 >= XML_REGEXP_SYMBOL) &&
2347 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2348 ((type2 >= XML_REGEXP_OTHER) &&
2349 (type2 <= XML_REGEXP_OTHER_NA))
2350 )return(0);
2351 break;
2352 case XML_REGEXP_NOTDECIMAL: /* \D */
2353 break;
2354 case XML_REGEXP_REALCHAR: /* \w */
2355 /* can't be a mark, separator, punctuation, symbol or other */
2356 if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2357 ((type2 >= XML_REGEXP_MARK) &&
2358 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2359 ((type2 >= XML_REGEXP_PUNCT) &&
2360 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2361 ((type2 >= XML_REGEXP_SEPAR) &&
2362 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2363 ((type2 >= XML_REGEXP_SYMBOL) &&
2364 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2365 ((type2 >= XML_REGEXP_OTHER) &&
2366 (type2 <= XML_REGEXP_OTHER_NA))
2367 )return(0);
2368 break;
2369 case XML_REGEXP_NOTREALCHAR: /* \W */
2370 break;
2371 /*
2372 * at that point we know both type 1 and type2 are from
2373 * character categories are ordered and are different,
2374 * it becomes simple because this is a partition
2375 */
2376 case XML_REGEXP_LETTER:
2377 if (type2 <= XML_REGEXP_LETTER_OTHERS)
2378 return(1);
2379 return(0);
2380 case XML_REGEXP_LETTER_UPPERCASE:
2381 case XML_REGEXP_LETTER_LOWERCASE:
2382 case XML_REGEXP_LETTER_TITLECASE:
2383 case XML_REGEXP_LETTER_MODIFIER:
2384 case XML_REGEXP_LETTER_OTHERS:
2385 return(0);
2386 case XML_REGEXP_MARK:
2387 if (type2 <= XML_REGEXP_MARK_ENCLOSING)
2388 return(1);
2389 return(0);
2390 case XML_REGEXP_MARK_NONSPACING:
2391 case XML_REGEXP_MARK_SPACECOMBINING:
2392 case XML_REGEXP_MARK_ENCLOSING:
2393 return(0);
2394 case XML_REGEXP_NUMBER:
2395 if (type2 <= XML_REGEXP_NUMBER_OTHERS)
2396 return(1);
2397 return(0);
2398 case XML_REGEXP_NUMBER_DECIMAL:
2399 case XML_REGEXP_NUMBER_LETTER:
2400 case XML_REGEXP_NUMBER_OTHERS:
2401 return(0);
2402 case XML_REGEXP_PUNCT:
2403 if (type2 <= XML_REGEXP_PUNCT_OTHERS)
2404 return(1);
2405 return(0);
2406 case XML_REGEXP_PUNCT_CONNECTOR:
2407 case XML_REGEXP_PUNCT_DASH:
2408 case XML_REGEXP_PUNCT_OPEN:
2409 case XML_REGEXP_PUNCT_CLOSE:
2410 case XML_REGEXP_PUNCT_INITQUOTE:
2411 case XML_REGEXP_PUNCT_FINQUOTE:
2412 case XML_REGEXP_PUNCT_OTHERS:
2413 return(0);
2414 case XML_REGEXP_SEPAR:
2415 if (type2 <= XML_REGEXP_SEPAR_PARA)
2416 return(1);
2417 return(0);
2418 case XML_REGEXP_SEPAR_SPACE:
2419 case XML_REGEXP_SEPAR_LINE:
2420 case XML_REGEXP_SEPAR_PARA:
2421 return(0);
2422 case XML_REGEXP_SYMBOL:
2423 if (type2 <= XML_REGEXP_SYMBOL_OTHERS)
2424 return(1);
2425 return(0);
2426 case XML_REGEXP_SYMBOL_MATH:
2427 case XML_REGEXP_SYMBOL_CURRENCY:
2428 case XML_REGEXP_SYMBOL_MODIFIER:
2429 case XML_REGEXP_SYMBOL_OTHERS:
2430 return(0);
2431 case XML_REGEXP_OTHER:
2432 if (type2 <= XML_REGEXP_OTHER_NA)
2433 return(1);
2434 return(0);
2435 case XML_REGEXP_OTHER_CONTROL:
2436 case XML_REGEXP_OTHER_FORMAT:
2437 case XML_REGEXP_OTHER_PRIVATE:
2438 case XML_REGEXP_OTHER_NA:
2439 return(0);
2440 default:
2441 break;
2442 }
2443 return(1);
2444 }
2445
2446 /**
2447 * xmlFAEqualAtoms:
2448 * @atom1: an atom
2449 * @atom2: an atom
2450 * @deep: if not set only compare string pointers
2451 *
2452 * Compares two atoms to check whether they are the same exactly
2453 * this is used to remove equivalent transitions
2454 *
2455 * Returns 1 if same and 0 otherwise
2456 */
2457 static int
xmlFAEqualAtoms(xmlRegAtomPtr atom1,xmlRegAtomPtr atom2,int deep)2458 xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
2459 int ret = 0;
2460
2461 if (atom1 == atom2)
2462 return(1);
2463 if ((atom1 == NULL) || (atom2 == NULL))
2464 return(0);
2465
2466 if (atom1->type != atom2->type)
2467 return(0);
2468 switch (atom1->type) {
2469 case XML_REGEXP_EPSILON:
2470 ret = 0;
2471 break;
2472 case XML_REGEXP_STRING:
2473 if (!deep)
2474 ret = (atom1->valuep == atom2->valuep);
2475 else
2476 ret = xmlStrEqual((xmlChar *)atom1->valuep,
2477 (xmlChar *)atom2->valuep);
2478 break;
2479 case XML_REGEXP_CHARVAL:
2480 ret = (atom1->codepoint == atom2->codepoint);
2481 break;
2482 case XML_REGEXP_RANGES:
2483 /* too hard to do in the general case */
2484 ret = 0;
2485 default:
2486 break;
2487 }
2488 return(ret);
2489 }
2490
2491 /**
2492 * xmlFACompareAtoms:
2493 * @atom1: an atom
2494 * @atom2: an atom
2495 * @deep: if not set only compare string pointers
2496 *
2497 * Compares two atoms to check whether they intersect in some ways,
2498 * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only
2499 *
2500 * Returns 1 if yes and 0 otherwise
2501 */
2502 static int
xmlFACompareAtoms(xmlRegAtomPtr atom1,xmlRegAtomPtr atom2,int deep)2503 xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
2504 int ret = 1;
2505
2506 if (atom1 == atom2)
2507 return(1);
2508 if ((atom1 == NULL) || (atom2 == NULL))
2509 return(0);
2510
2511 if ((atom1->type == XML_REGEXP_ANYCHAR) ||
2512 (atom2->type == XML_REGEXP_ANYCHAR))
2513 return(1);
2514
2515 if (atom1->type > atom2->type) {
2516 xmlRegAtomPtr tmp;
2517 tmp = atom1;
2518 atom1 = atom2;
2519 atom2 = tmp;
2520 }
2521 if (atom1->type != atom2->type) {
2522 ret = xmlFACompareAtomTypes(atom1->type, atom2->type);
2523 /* if they can't intersect at the type level break now */
2524 if (ret == 0)
2525 return(0);
2526 }
2527 switch (atom1->type) {
2528 case XML_REGEXP_STRING:
2529 if (!deep)
2530 ret = (atom1->valuep != atom2->valuep);
2531 else {
2532 xmlChar *val1 = (xmlChar *)atom1->valuep;
2533 xmlChar *val2 = (xmlChar *)atom2->valuep;
2534 int compound1 = (xmlStrchr(val1, '|') != NULL);
2535 int compound2 = (xmlStrchr(val2, '|') != NULL);
2536
2537 /* Ignore negative match flag for ##other namespaces */
2538 if (compound1 != compound2)
2539 return(0);
2540
2541 ret = xmlRegStrEqualWildcard(val1, val2);
2542 }
2543 break;
2544 case XML_REGEXP_EPSILON:
2545 goto not_determinist;
2546 case XML_REGEXP_CHARVAL:
2547 if (atom2->type == XML_REGEXP_CHARVAL) {
2548 ret = (atom1->codepoint == atom2->codepoint);
2549 } else {
2550 ret = xmlRegCheckCharacter(atom2, atom1->codepoint);
2551 if (ret < 0)
2552 ret = 1;
2553 }
2554 break;
2555 case XML_REGEXP_RANGES:
2556 if (atom2->type == XML_REGEXP_RANGES) {
2557 int i, j, res;
2558 xmlRegRangePtr r1, r2;
2559
2560 /*
2561 * need to check that none of the ranges eventually matches
2562 */
2563 for (i = 0;i < atom1->nbRanges;i++) {
2564 for (j = 0;j < atom2->nbRanges;j++) {
2565 r1 = atom1->ranges[i];
2566 r2 = atom2->ranges[j];
2567 res = xmlFACompareRanges(r1, r2);
2568 if (res == 1) {
2569 ret = 1;
2570 goto done;
2571 }
2572 }
2573 }
2574 ret = 0;
2575 }
2576 break;
2577 default:
2578 goto not_determinist;
2579 }
2580 done:
2581 if (atom1->neg != atom2->neg) {
2582 ret = !ret;
2583 }
2584 if (ret == 0)
2585 return(0);
2586 not_determinist:
2587 return(1);
2588 }
2589
2590 /**
2591 * xmlFARecurseDeterminism:
2592 * @ctxt: a regexp parser context
2593 *
2594 * Check whether the associated regexp is determinist,
2595 * should be called after xmlFAEliminateEpsilonTransitions()
2596 *
2597 */
2598 static int
xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr state,int to,xmlRegAtomPtr atom)2599 xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
2600 int to, xmlRegAtomPtr atom) {
2601 int ret = 1;
2602 int res;
2603 int transnr, nbTrans;
2604 xmlRegTransPtr t1;
2605 int deep = 1;
2606
2607 if (state == NULL)
2608 return(ret);
2609 if (state->markd == XML_REGEXP_MARK_VISITED)
2610 return(ret);
2611
2612 if (ctxt->flags & AM_AUTOMATA_RNG)
2613 deep = 0;
2614
2615 /*
2616 * don't recurse on transitions potentially added in the course of
2617 * the elimination.
2618 */
2619 nbTrans = state->nbTrans;
2620 for (transnr = 0;transnr < nbTrans;transnr++) {
2621 t1 = &(state->trans[transnr]);
2622 /*
2623 * check transitions conflicting with the one looked at
2624 */
2625 if (t1->atom == NULL) {
2626 if (t1->to < 0)
2627 continue;
2628 state->markd = XML_REGEXP_MARK_VISITED;
2629 res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2630 to, atom);
2631 state->markd = 0;
2632 if (res == 0) {
2633 ret = 0;
2634 /* t1->nd = 1; */
2635 }
2636 continue;
2637 }
2638 if (t1->to != to)
2639 continue;
2640 if (xmlFACompareAtoms(t1->atom, atom, deep)) {
2641 ret = 0;
2642 /* mark the transition as non-deterministic */
2643 t1->nd = 1;
2644 }
2645 }
2646 return(ret);
2647 }
2648
2649 /**
2650 * xmlFAComputesDeterminism:
2651 * @ctxt: a regexp parser context
2652 *
2653 * Check whether the associated regexp is determinist,
2654 * should be called after xmlFAEliminateEpsilonTransitions()
2655 *
2656 */
2657 static int
xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt)2658 xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
2659 int statenr, transnr;
2660 xmlRegStatePtr state;
2661 xmlRegTransPtr t1, t2, last;
2662 int i;
2663 int ret = 1;
2664 int deep = 1;
2665
2666 #ifdef DEBUG_REGEXP_GRAPH
2667 printf("xmlFAComputesDeterminism\n");
2668 xmlRegPrintCtxt(stdout, ctxt);
2669 #endif
2670 if (ctxt->determinist != -1)
2671 return(ctxt->determinist);
2672
2673 if (ctxt->flags & AM_AUTOMATA_RNG)
2674 deep = 0;
2675
2676 /*
2677 * First cleanup the automata removing cancelled transitions
2678 */
2679 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2680 state = ctxt->states[statenr];
2681 if (state == NULL)
2682 continue;
2683 if (state->nbTrans < 2)
2684 continue;
2685 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2686 t1 = &(state->trans[transnr]);
2687 /*
2688 * Determinism checks in case of counted or all transitions
2689 * will have to be handled separately
2690 */
2691 if (t1->atom == NULL) {
2692 /* t1->nd = 1; */
2693 continue;
2694 }
2695 if (t1->to == -1) /* eliminated */
2696 continue;
2697 for (i = 0;i < transnr;i++) {
2698 t2 = &(state->trans[i]);
2699 if (t2->to == -1) /* eliminated */
2700 continue;
2701 if (t2->atom != NULL) {
2702 if (t1->to == t2->to) {
2703 /*
2704 * Here we use deep because we want to keep the
2705 * transitions which indicate a conflict
2706 */
2707 if (xmlFAEqualAtoms(t1->atom, t2->atom, deep) &&
2708 (t1->counter == t2->counter) &&
2709 (t1->count == t2->count))
2710 t2->to = -1; /* eliminated */
2711 }
2712 }
2713 }
2714 }
2715 }
2716
2717 /*
2718 * Check for all states that there aren't 2 transitions
2719 * with the same atom and a different target.
2720 */
2721 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2722 state = ctxt->states[statenr];
2723 if (state == NULL)
2724 continue;
2725 if (state->nbTrans < 2)
2726 continue;
2727 last = NULL;
2728 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2729 t1 = &(state->trans[transnr]);
2730 /*
2731 * Determinism checks in case of counted or all transitions
2732 * will have to be handled separately
2733 */
2734 if (t1->atom == NULL) {
2735 continue;
2736 }
2737 if (t1->to == -1) /* eliminated */
2738 continue;
2739 for (i = 0;i < transnr;i++) {
2740 t2 = &(state->trans[i]);
2741 if (t2->to == -1) /* eliminated */
2742 continue;
2743 if (t2->atom != NULL) {
2744 /*
2745 * But here we don't use deep because we want to
2746 * find transitions which indicate a conflict
2747 */
2748 if (xmlFACompareAtoms(t1->atom, t2->atom, 1)) {
2749 ret = 0;
2750 /* mark the transitions as non-deterministic ones */
2751 t1->nd = 1;
2752 t2->nd = 1;
2753 last = t1;
2754 }
2755 } else if (t1->to != -1) {
2756 /*
2757 * do the closure in case of remaining specific
2758 * epsilon transitions like choices or all
2759 */
2760 ret = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2761 t2->to, t2->atom);
2762 /* don't shortcut the computation so all non deterministic
2763 transition get marked down
2764 if (ret == 0)
2765 return(0);
2766 */
2767 if (ret == 0) {
2768 t1->nd = 1;
2769 /* t2->nd = 1; */
2770 last = t1;
2771 }
2772 }
2773 }
2774 /* don't shortcut the computation so all non deterministic
2775 transition get marked down
2776 if (ret == 0)
2777 break; */
2778 }
2779
2780 /*
2781 * mark specifically the last non-deterministic transition
2782 * from a state since there is no need to set-up rollback
2783 * from it
2784 */
2785 if (last != NULL) {
2786 last->nd = 2;
2787 }
2788
2789 /* don't shortcut the computation so all non deterministic
2790 transition get marked down
2791 if (ret == 0)
2792 break; */
2793 }
2794
2795 ctxt->determinist = ret;
2796 return(ret);
2797 }
2798
2799 /************************************************************************
2800 * *
2801 * Routines to check input against transition atoms *
2802 * *
2803 ************************************************************************/
2804
2805 static int
xmlRegCheckCharacterRange(xmlRegAtomType type,int codepoint,int neg,int start,int end,const xmlChar * blockName)2806 xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
2807 int start, int end, const xmlChar *blockName) {
2808 int ret = 0;
2809
2810 switch (type) {
2811 case XML_REGEXP_STRING:
2812 case XML_REGEXP_SUBREG:
2813 case XML_REGEXP_RANGES:
2814 case XML_REGEXP_EPSILON:
2815 return(-1);
2816 case XML_REGEXP_ANYCHAR:
2817 ret = ((codepoint != '\n') && (codepoint != '\r'));
2818 break;
2819 case XML_REGEXP_CHARVAL:
2820 ret = ((codepoint >= start) && (codepoint <= end));
2821 break;
2822 case XML_REGEXP_NOTSPACE:
2823 neg = !neg;
2824 /* Falls through. */
2825 case XML_REGEXP_ANYSPACE:
2826 ret = ((codepoint == '\n') || (codepoint == '\r') ||
2827 (codepoint == '\t') || (codepoint == ' '));
2828 break;
2829 case XML_REGEXP_NOTINITNAME:
2830 neg = !neg;
2831 /* Falls through. */
2832 case XML_REGEXP_INITNAME:
2833 ret = (IS_LETTER(codepoint) ||
2834 (codepoint == '_') || (codepoint == ':'));
2835 break;
2836 case XML_REGEXP_NOTNAMECHAR:
2837 neg = !neg;
2838 /* Falls through. */
2839 case XML_REGEXP_NAMECHAR:
2840 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) ||
2841 (codepoint == '.') || (codepoint == '-') ||
2842 (codepoint == '_') || (codepoint == ':') ||
2843 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint));
2844 break;
2845 case XML_REGEXP_NOTDECIMAL:
2846 neg = !neg;
2847 /* Falls through. */
2848 case XML_REGEXP_DECIMAL:
2849 ret = xmlUCSIsCatNd(codepoint);
2850 break;
2851 case XML_REGEXP_REALCHAR:
2852 neg = !neg;
2853 /* Falls through. */
2854 case XML_REGEXP_NOTREALCHAR:
2855 ret = xmlUCSIsCatP(codepoint);
2856 if (ret == 0)
2857 ret = xmlUCSIsCatZ(codepoint);
2858 if (ret == 0)
2859 ret = xmlUCSIsCatC(codepoint);
2860 break;
2861 case XML_REGEXP_LETTER:
2862 ret = xmlUCSIsCatL(codepoint);
2863 break;
2864 case XML_REGEXP_LETTER_UPPERCASE:
2865 ret = xmlUCSIsCatLu(codepoint);
2866 break;
2867 case XML_REGEXP_LETTER_LOWERCASE:
2868 ret = xmlUCSIsCatLl(codepoint);
2869 break;
2870 case XML_REGEXP_LETTER_TITLECASE:
2871 ret = xmlUCSIsCatLt(codepoint);
2872 break;
2873 case XML_REGEXP_LETTER_MODIFIER:
2874 ret = xmlUCSIsCatLm(codepoint);
2875 break;
2876 case XML_REGEXP_LETTER_OTHERS:
2877 ret = xmlUCSIsCatLo(codepoint);
2878 break;
2879 case XML_REGEXP_MARK:
2880 ret = xmlUCSIsCatM(codepoint);
2881 break;
2882 case XML_REGEXP_MARK_NONSPACING:
2883 ret = xmlUCSIsCatMn(codepoint);
2884 break;
2885 case XML_REGEXP_MARK_SPACECOMBINING:
2886 ret = xmlUCSIsCatMc(codepoint);
2887 break;
2888 case XML_REGEXP_MARK_ENCLOSING:
2889 ret = xmlUCSIsCatMe(codepoint);
2890 break;
2891 case XML_REGEXP_NUMBER:
2892 ret = xmlUCSIsCatN(codepoint);
2893 break;
2894 case XML_REGEXP_NUMBER_DECIMAL:
2895 ret = xmlUCSIsCatNd(codepoint);
2896 break;
2897 case XML_REGEXP_NUMBER_LETTER:
2898 ret = xmlUCSIsCatNl(codepoint);
2899 break;
2900 case XML_REGEXP_NUMBER_OTHERS:
2901 ret = xmlUCSIsCatNo(codepoint);
2902 break;
2903 case XML_REGEXP_PUNCT:
2904 ret = xmlUCSIsCatP(codepoint);
2905 break;
2906 case XML_REGEXP_PUNCT_CONNECTOR:
2907 ret = xmlUCSIsCatPc(codepoint);
2908 break;
2909 case XML_REGEXP_PUNCT_DASH:
2910 ret = xmlUCSIsCatPd(codepoint);
2911 break;
2912 case XML_REGEXP_PUNCT_OPEN:
2913 ret = xmlUCSIsCatPs(codepoint);
2914 break;
2915 case XML_REGEXP_PUNCT_CLOSE:
2916 ret = xmlUCSIsCatPe(codepoint);
2917 break;
2918 case XML_REGEXP_PUNCT_INITQUOTE:
2919 ret = xmlUCSIsCatPi(codepoint);
2920 break;
2921 case XML_REGEXP_PUNCT_FINQUOTE:
2922 ret = xmlUCSIsCatPf(codepoint);
2923 break;
2924 case XML_REGEXP_PUNCT_OTHERS:
2925 ret = xmlUCSIsCatPo(codepoint);
2926 break;
2927 case XML_REGEXP_SEPAR:
2928 ret = xmlUCSIsCatZ(codepoint);
2929 break;
2930 case XML_REGEXP_SEPAR_SPACE:
2931 ret = xmlUCSIsCatZs(codepoint);
2932 break;
2933 case XML_REGEXP_SEPAR_LINE:
2934 ret = xmlUCSIsCatZl(codepoint);
2935 break;
2936 case XML_REGEXP_SEPAR_PARA:
2937 ret = xmlUCSIsCatZp(codepoint);
2938 break;
2939 case XML_REGEXP_SYMBOL:
2940 ret = xmlUCSIsCatS(codepoint);
2941 break;
2942 case XML_REGEXP_SYMBOL_MATH:
2943 ret = xmlUCSIsCatSm(codepoint);
2944 break;
2945 case XML_REGEXP_SYMBOL_CURRENCY:
2946 ret = xmlUCSIsCatSc(codepoint);
2947 break;
2948 case XML_REGEXP_SYMBOL_MODIFIER:
2949 ret = xmlUCSIsCatSk(codepoint);
2950 break;
2951 case XML_REGEXP_SYMBOL_OTHERS:
2952 ret = xmlUCSIsCatSo(codepoint);
2953 break;
2954 case XML_REGEXP_OTHER:
2955 ret = xmlUCSIsCatC(codepoint);
2956 break;
2957 case XML_REGEXP_OTHER_CONTROL:
2958 ret = xmlUCSIsCatCc(codepoint);
2959 break;
2960 case XML_REGEXP_OTHER_FORMAT:
2961 ret = xmlUCSIsCatCf(codepoint);
2962 break;
2963 case XML_REGEXP_OTHER_PRIVATE:
2964 ret = xmlUCSIsCatCo(codepoint);
2965 break;
2966 case XML_REGEXP_OTHER_NA:
2967 /* ret = xmlUCSIsCatCn(codepoint); */
2968 /* Seems it doesn't exist anymore in recent Unicode releases */
2969 ret = 0;
2970 break;
2971 case XML_REGEXP_BLOCK_NAME:
2972 ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
2973 break;
2974 }
2975 if (neg)
2976 return(!ret);
2977 return(ret);
2978 }
2979
2980 static int
xmlRegCheckCharacter(xmlRegAtomPtr atom,int codepoint)2981 xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
2982 int i, ret = 0;
2983 xmlRegRangePtr range;
2984
2985 if ((atom == NULL) || (!IS_CHAR(codepoint)))
2986 return(-1);
2987
2988 switch (atom->type) {
2989 case XML_REGEXP_SUBREG:
2990 case XML_REGEXP_EPSILON:
2991 return(-1);
2992 case XML_REGEXP_CHARVAL:
2993 return(codepoint == atom->codepoint);
2994 case XML_REGEXP_RANGES: {
2995 int accept = 0;
2996
2997 for (i = 0;i < atom->nbRanges;i++) {
2998 range = atom->ranges[i];
2999 if (range->neg == 2) {
3000 ret = xmlRegCheckCharacterRange(range->type, codepoint,
3001 0, range->start, range->end,
3002 range->blockName);
3003 if (ret != 0)
3004 return(0); /* excluded char */
3005 } else if (range->neg) {
3006 ret = xmlRegCheckCharacterRange(range->type, codepoint,
3007 0, range->start, range->end,
3008 range->blockName);
3009 if (ret == 0)
3010 accept = 1;
3011 else
3012 return(0);
3013 } else {
3014 ret = xmlRegCheckCharacterRange(range->type, codepoint,
3015 0, range->start, range->end,
3016 range->blockName);
3017 if (ret != 0)
3018 accept = 1; /* might still be excluded */
3019 }
3020 }
3021 return(accept);
3022 }
3023 case XML_REGEXP_STRING:
3024 printf("TODO: XML_REGEXP_STRING\n");
3025 return(-1);
3026 case XML_REGEXP_ANYCHAR:
3027 case XML_REGEXP_ANYSPACE:
3028 case XML_REGEXP_NOTSPACE:
3029 case XML_REGEXP_INITNAME:
3030 case XML_REGEXP_NOTINITNAME:
3031 case XML_REGEXP_NAMECHAR:
3032 case XML_REGEXP_NOTNAMECHAR:
3033 case XML_REGEXP_DECIMAL:
3034 case XML_REGEXP_NOTDECIMAL:
3035 case XML_REGEXP_REALCHAR:
3036 case XML_REGEXP_NOTREALCHAR:
3037 case XML_REGEXP_LETTER:
3038 case XML_REGEXP_LETTER_UPPERCASE:
3039 case XML_REGEXP_LETTER_LOWERCASE:
3040 case XML_REGEXP_LETTER_TITLECASE:
3041 case XML_REGEXP_LETTER_MODIFIER:
3042 case XML_REGEXP_LETTER_OTHERS:
3043 case XML_REGEXP_MARK:
3044 case XML_REGEXP_MARK_NONSPACING:
3045 case XML_REGEXP_MARK_SPACECOMBINING:
3046 case XML_REGEXP_MARK_ENCLOSING:
3047 case XML_REGEXP_NUMBER:
3048 case XML_REGEXP_NUMBER_DECIMAL:
3049 case XML_REGEXP_NUMBER_LETTER:
3050 case XML_REGEXP_NUMBER_OTHERS:
3051 case XML_REGEXP_PUNCT:
3052 case XML_REGEXP_PUNCT_CONNECTOR:
3053 case XML_REGEXP_PUNCT_DASH:
3054 case XML_REGEXP_PUNCT_OPEN:
3055 case XML_REGEXP_PUNCT_CLOSE:
3056 case XML_REGEXP_PUNCT_INITQUOTE:
3057 case XML_REGEXP_PUNCT_FINQUOTE:
3058 case XML_REGEXP_PUNCT_OTHERS:
3059 case XML_REGEXP_SEPAR:
3060 case XML_REGEXP_SEPAR_SPACE:
3061 case XML_REGEXP_SEPAR_LINE:
3062 case XML_REGEXP_SEPAR_PARA:
3063 case XML_REGEXP_SYMBOL:
3064 case XML_REGEXP_SYMBOL_MATH:
3065 case XML_REGEXP_SYMBOL_CURRENCY:
3066 case XML_REGEXP_SYMBOL_MODIFIER:
3067 case XML_REGEXP_SYMBOL_OTHERS:
3068 case XML_REGEXP_OTHER:
3069 case XML_REGEXP_OTHER_CONTROL:
3070 case XML_REGEXP_OTHER_FORMAT:
3071 case XML_REGEXP_OTHER_PRIVATE:
3072 case XML_REGEXP_OTHER_NA:
3073 case XML_REGEXP_BLOCK_NAME:
3074 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
3075 (const xmlChar *)atom->valuep);
3076 if (atom->neg)
3077 ret = !ret;
3078 break;
3079 }
3080 return(ret);
3081 }
3082
3083 /************************************************************************
3084 * *
3085 * Saving and restoring state of an execution context *
3086 * *
3087 ************************************************************************/
3088
3089 #ifdef DEBUG_REGEXP_EXEC
3090 static void
xmlFARegDebugExec(xmlRegExecCtxtPtr exec)3091 xmlFARegDebugExec(xmlRegExecCtxtPtr exec) {
3092 printf("state: %d:%d:idx %d", exec->state->no, exec->transno, exec->index);
3093 if (exec->inputStack != NULL) {
3094 int i;
3095 printf(": ");
3096 for (i = 0;(i < 3) && (i < exec->inputStackNr);i++)
3097 printf("%s ", (const char *)
3098 exec->inputStack[exec->inputStackNr - (i + 1)].value);
3099 } else {
3100 printf(": %s", &(exec->inputString[exec->index]));
3101 }
3102 printf("\n");
3103 }
3104 #endif
3105
3106 static void
xmlFARegExecSave(xmlRegExecCtxtPtr exec)3107 xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
3108 #ifdef DEBUG_REGEXP_EXEC
3109 printf("saving ");
3110 exec->transno++;
3111 xmlFARegDebugExec(exec);
3112 exec->transno--;
3113 #endif
3114 #ifdef MAX_PUSH
3115 if (exec->nbPush > MAX_PUSH) {
3116 return;
3117 }
3118 exec->nbPush++;
3119 #endif
3120
3121 if (exec->maxRollbacks == 0) {
3122 exec->maxRollbacks = 4;
3123 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
3124 sizeof(xmlRegExecRollback));
3125 if (exec->rollbacks == NULL) {
3126 xmlRegexpErrMemory(NULL, "saving regexp");
3127 exec->maxRollbacks = 0;
3128 return;
3129 }
3130 memset(exec->rollbacks, 0,
3131 exec->maxRollbacks * sizeof(xmlRegExecRollback));
3132 } else if (exec->nbRollbacks >= exec->maxRollbacks) {
3133 xmlRegExecRollback *tmp;
3134 int len = exec->maxRollbacks;
3135
3136 exec->maxRollbacks *= 2;
3137 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
3138 exec->maxRollbacks * sizeof(xmlRegExecRollback));
3139 if (tmp == NULL) {
3140 xmlRegexpErrMemory(NULL, "saving regexp");
3141 exec->maxRollbacks /= 2;
3142 return;
3143 }
3144 exec->rollbacks = tmp;
3145 tmp = &exec->rollbacks[len];
3146 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
3147 }
3148 exec->rollbacks[exec->nbRollbacks].state = exec->state;
3149 exec->rollbacks[exec->nbRollbacks].index = exec->index;
3150 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
3151 if (exec->comp->nbCounters > 0) {
3152 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3153 exec->rollbacks[exec->nbRollbacks].counts = (int *)
3154 xmlMalloc(exec->comp->nbCounters * sizeof(int));
3155 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3156 xmlRegexpErrMemory(NULL, "saving regexp");
3157 exec->status = -5;
3158 return;
3159 }
3160 }
3161 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
3162 exec->comp->nbCounters * sizeof(int));
3163 }
3164 exec->nbRollbacks++;
3165 }
3166
3167 static void
xmlFARegExecRollBack(xmlRegExecCtxtPtr exec)3168 xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
3169 if (exec->nbRollbacks <= 0) {
3170 exec->status = -1;
3171 #ifdef DEBUG_REGEXP_EXEC
3172 printf("rollback failed on empty stack\n");
3173 #endif
3174 return;
3175 }
3176 exec->nbRollbacks--;
3177 exec->state = exec->rollbacks[exec->nbRollbacks].state;
3178 exec->index = exec->rollbacks[exec->nbRollbacks].index;
3179 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
3180 if (exec->comp->nbCounters > 0) {
3181 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3182 fprintf(stderr, "exec save: allocation failed");
3183 exec->status = -6;
3184 return;
3185 }
3186 if (exec->counts) {
3187 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
3188 exec->comp->nbCounters * sizeof(int));
3189 }
3190 }
3191
3192 #ifdef DEBUG_REGEXP_EXEC
3193 printf("restored ");
3194 xmlFARegDebugExec(exec);
3195 #endif
3196 }
3197
3198 /************************************************************************
3199 * *
3200 * Verifier, running an input against a compiled regexp *
3201 * *
3202 ************************************************************************/
3203
3204 static int
xmlFARegExec(xmlRegexpPtr comp,const xmlChar * content)3205 xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
3206 xmlRegExecCtxt execval;
3207 xmlRegExecCtxtPtr exec = &execval;
3208 int ret, codepoint = 0, len, deter;
3209
3210 exec->inputString = content;
3211 exec->index = 0;
3212 exec->nbPush = 0;
3213 exec->determinist = 1;
3214 exec->maxRollbacks = 0;
3215 exec->nbRollbacks = 0;
3216 exec->rollbacks = NULL;
3217 exec->status = 0;
3218 exec->comp = comp;
3219 exec->state = comp->states[0];
3220 exec->transno = 0;
3221 exec->transcount = 0;
3222 exec->inputStack = NULL;
3223 exec->inputStackMax = 0;
3224 if (comp->nbCounters > 0) {
3225 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
3226 if (exec->counts == NULL) {
3227 xmlRegexpErrMemory(NULL, "running regexp");
3228 return(-1);
3229 }
3230 memset(exec->counts, 0, comp->nbCounters * sizeof(int));
3231 } else
3232 exec->counts = NULL;
3233 while ((exec->status == 0) && (exec->state != NULL) &&
3234 ((exec->inputString[exec->index] != 0) ||
3235 ((exec->state != NULL) &&
3236 (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3237 xmlRegTransPtr trans;
3238 xmlRegAtomPtr atom;
3239
3240 /*
3241 * If end of input on non-terminal state, rollback, however we may
3242 * still have epsilon like transition for counted transitions
3243 * on counters, in that case don't break too early. Additionally,
3244 * if we are working on a range like "AB{0,2}", where B is not present,
3245 * we don't want to break.
3246 */
3247 len = 1;
3248 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) {
3249 /*
3250 * if there is a transition, we must check if
3251 * atom allows minOccurs of 0
3252 */
3253 if (exec->transno < exec->state->nbTrans) {
3254 trans = &exec->state->trans[exec->transno];
3255 if (trans->to >=0) {
3256 atom = trans->atom;
3257 if (!((atom->min == 0) && (atom->max > 0)))
3258 goto rollback;
3259 }
3260 } else
3261 goto rollback;
3262 }
3263
3264 exec->transcount = 0;
3265 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3266 trans = &exec->state->trans[exec->transno];
3267 if (trans->to < 0)
3268 continue;
3269 atom = trans->atom;
3270 ret = 0;
3271 deter = 1;
3272 if (trans->count >= 0) {
3273 int count;
3274 xmlRegCounterPtr counter;
3275
3276 if (exec->counts == NULL) {
3277 exec->status = -1;
3278 goto error;
3279 }
3280 /*
3281 * A counted transition.
3282 */
3283
3284 count = exec->counts[trans->count];
3285 counter = &exec->comp->counters[trans->count];
3286 #ifdef DEBUG_REGEXP_EXEC
3287 printf("testing count %d: val %d, min %d, max %d\n",
3288 trans->count, count, counter->min, counter->max);
3289 #endif
3290 ret = ((count >= counter->min) && (count <= counter->max));
3291 if ((ret) && (counter->min != counter->max))
3292 deter = 0;
3293 } else if (atom == NULL) {
3294 fprintf(stderr, "epsilon transition left at runtime\n");
3295 exec->status = -2;
3296 break;
3297 } else if (exec->inputString[exec->index] != 0) {
3298 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
3299 ret = xmlRegCheckCharacter(atom, codepoint);
3300 if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) {
3301 xmlRegStatePtr to = comp->states[trans->to];
3302
3303 /*
3304 * this is a multiple input sequence
3305 * If there is a counter associated increment it now.
3306 * before potentially saving and rollback
3307 * do not increment if the counter is already over the
3308 * maximum limit in which case get to next transition
3309 */
3310 if (trans->counter >= 0) {
3311 xmlRegCounterPtr counter;
3312
3313 if ((exec->counts == NULL) ||
3314 (exec->comp == NULL) ||
3315 (exec->comp->counters == NULL)) {
3316 exec->status = -1;
3317 goto error;
3318 }
3319 counter = &exec->comp->counters[trans->counter];
3320 if (exec->counts[trans->counter] >= counter->max)
3321 continue; /* for loop on transitions */
3322
3323 #ifdef DEBUG_REGEXP_EXEC
3324 printf("Increasing count %d\n", trans->counter);
3325 #endif
3326 exec->counts[trans->counter]++;
3327 }
3328 if (exec->state->nbTrans > exec->transno + 1) {
3329 xmlFARegExecSave(exec);
3330 }
3331 exec->transcount = 1;
3332 do {
3333 /*
3334 * Try to progress as much as possible on the input
3335 */
3336 if (exec->transcount == atom->max) {
3337 break;
3338 }
3339 exec->index += len;
3340 /*
3341 * End of input: stop here
3342 */
3343 if (exec->inputString[exec->index] == 0) {
3344 exec->index -= len;
3345 break;
3346 }
3347 if (exec->transcount >= atom->min) {
3348 int transno = exec->transno;
3349 xmlRegStatePtr state = exec->state;
3350
3351 /*
3352 * The transition is acceptable save it
3353 */
3354 exec->transno = -1; /* trick */
3355 exec->state = to;
3356 xmlFARegExecSave(exec);
3357 exec->transno = transno;
3358 exec->state = state;
3359 }
3360 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
3361 len);
3362 ret = xmlRegCheckCharacter(atom, codepoint);
3363 exec->transcount++;
3364 } while (ret == 1);
3365 if (exec->transcount < atom->min)
3366 ret = 0;
3367
3368 /*
3369 * If the last check failed but one transition was found
3370 * possible, rollback
3371 */
3372 if (ret < 0)
3373 ret = 0;
3374 if (ret == 0) {
3375 goto rollback;
3376 }
3377 if (trans->counter >= 0) {
3378 if (exec->counts == NULL) {
3379 exec->status = -1;
3380 goto error;
3381 }
3382 #ifdef DEBUG_REGEXP_EXEC
3383 printf("Decreasing count %d\n", trans->counter);
3384 #endif
3385 exec->counts[trans->counter]--;
3386 }
3387 } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) {
3388 /*
3389 * we don't match on the codepoint, but minOccurs of 0
3390 * says that's ok. Setting len to 0 inhibits stepping
3391 * over the codepoint.
3392 */
3393 exec->transcount = 1;
3394 len = 0;
3395 ret = 1;
3396 }
3397 } else if ((atom->min == 0) && (atom->max > 0)) {
3398 /* another spot to match when minOccurs is 0 */
3399 exec->transcount = 1;
3400 len = 0;
3401 ret = 1;
3402 }
3403 if (ret == 1) {
3404 if ((trans->nd == 1) ||
3405 ((trans->count >= 0) && (deter == 0) &&
3406 (exec->state->nbTrans > exec->transno + 1))) {
3407 #ifdef DEBUG_REGEXP_EXEC
3408 if (trans->nd == 1)
3409 printf("Saving on nd transition atom %d for %c at %d\n",
3410 trans->atom->no, codepoint, exec->index);
3411 else
3412 printf("Saving on counted transition count %d for %c at %d\n",
3413 trans->count, codepoint, exec->index);
3414 #endif
3415 xmlFARegExecSave(exec);
3416 }
3417 if (trans->counter >= 0) {
3418 xmlRegCounterPtr counter;
3419
3420 /* make sure we don't go over the counter maximum value */
3421 if ((exec->counts == NULL) ||
3422 (exec->comp == NULL) ||
3423 (exec->comp->counters == NULL)) {
3424 exec->status = -1;
3425 goto error;
3426 }
3427 counter = &exec->comp->counters[trans->counter];
3428 if (exec->counts[trans->counter] >= counter->max)
3429 continue; /* for loop on transitions */
3430 #ifdef DEBUG_REGEXP_EXEC
3431 printf("Increasing count %d\n", trans->counter);
3432 #endif
3433 exec->counts[trans->counter]++;
3434 }
3435 if ((trans->count >= 0) &&
3436 (trans->count < REGEXP_ALL_COUNTER)) {
3437 if (exec->counts == NULL) {
3438 exec->status = -1;
3439 goto error;
3440 }
3441 #ifdef DEBUG_REGEXP_EXEC
3442 printf("resetting count %d on transition\n",
3443 trans->count);
3444 #endif
3445 exec->counts[trans->count] = 0;
3446 }
3447 #ifdef DEBUG_REGEXP_EXEC
3448 printf("entering state %d\n", trans->to);
3449 #endif
3450 exec->state = comp->states[trans->to];
3451 exec->transno = 0;
3452 if (trans->atom != NULL) {
3453 exec->index += len;
3454 }
3455 goto progress;
3456 } else if (ret < 0) {
3457 exec->status = -4;
3458 break;
3459 }
3460 }
3461 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3462 rollback:
3463 /*
3464 * Failed to find a way out
3465 */
3466 exec->determinist = 0;
3467 #ifdef DEBUG_REGEXP_EXEC
3468 printf("rollback from state %d on %d:%c\n", exec->state->no,
3469 codepoint,codepoint);
3470 #endif
3471 xmlFARegExecRollBack(exec);
3472 }
3473 progress:
3474 continue;
3475 }
3476 error:
3477 if (exec->rollbacks != NULL) {
3478 if (exec->counts != NULL) {
3479 int i;
3480
3481 for (i = 0;i < exec->maxRollbacks;i++)
3482 if (exec->rollbacks[i].counts != NULL)
3483 xmlFree(exec->rollbacks[i].counts);
3484 }
3485 xmlFree(exec->rollbacks);
3486 }
3487 if (exec->state == NULL)
3488 return(-1);
3489 if (exec->counts != NULL)
3490 xmlFree(exec->counts);
3491 if (exec->status == 0)
3492 return(1);
3493 if (exec->status == -1) {
3494 if (exec->nbPush > MAX_PUSH)
3495 return(-1);
3496 return(0);
3497 }
3498 return(exec->status);
3499 }
3500
3501 /************************************************************************
3502 * *
3503 * Progressive interface to the verifier one atom at a time *
3504 * *
3505 ************************************************************************/
3506 #ifdef DEBUG_ERR
3507 static void testerr(xmlRegExecCtxtPtr exec);
3508 #endif
3509
3510 /**
3511 * xmlRegNewExecCtxt:
3512 * @comp: a precompiled regular expression
3513 * @callback: a callback function used for handling progresses in the
3514 * automata matching phase
3515 * @data: the context data associated to the callback in this context
3516 *
3517 * Build a context used for progressive evaluation of a regexp.
3518 *
3519 * Returns the new context
3520 */
3521 xmlRegExecCtxtPtr
xmlRegNewExecCtxt(xmlRegexpPtr comp,xmlRegExecCallbacks callback,void * data)3522 xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
3523 xmlRegExecCtxtPtr exec;
3524
3525 if (comp == NULL)
3526 return(NULL);
3527 if ((comp->compact == NULL) && (comp->states == NULL))
3528 return(NULL);
3529 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
3530 if (exec == NULL) {
3531 xmlRegexpErrMemory(NULL, "creating execution context");
3532 return(NULL);
3533 }
3534 memset(exec, 0, sizeof(xmlRegExecCtxt));
3535 exec->inputString = NULL;
3536 exec->index = 0;
3537 exec->determinist = 1;
3538 exec->maxRollbacks = 0;
3539 exec->nbRollbacks = 0;
3540 exec->rollbacks = NULL;
3541 exec->status = 0;
3542 exec->comp = comp;
3543 if (comp->compact == NULL)
3544 exec->state = comp->states[0];
3545 exec->transno = 0;
3546 exec->transcount = 0;
3547 exec->callback = callback;
3548 exec->data = data;
3549 if (comp->nbCounters > 0) {
3550 /*
3551 * For error handling, exec->counts is allocated twice the size
3552 * the second half is used to store the data in case of rollback
3553 */
3554 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)
3555 * 2);
3556 if (exec->counts == NULL) {
3557 xmlRegexpErrMemory(NULL, "creating execution context");
3558 xmlFree(exec);
3559 return(NULL);
3560 }
3561 memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2);
3562 exec->errCounts = &exec->counts[comp->nbCounters];
3563 } else {
3564 exec->counts = NULL;
3565 exec->errCounts = NULL;
3566 }
3567 exec->inputStackMax = 0;
3568 exec->inputStackNr = 0;
3569 exec->inputStack = NULL;
3570 exec->errStateNo = -1;
3571 exec->errString = NULL;
3572 exec->nbPush = 0;
3573 return(exec);
3574 }
3575
3576 /**
3577 * xmlRegFreeExecCtxt:
3578 * @exec: a regular expression evaluation context
3579 *
3580 * Free the structures associated to a regular expression evaluation context.
3581 */
3582 void
xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec)3583 xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
3584 if (exec == NULL)
3585 return;
3586
3587 if (exec->rollbacks != NULL) {
3588 if (exec->counts != NULL) {
3589 int i;
3590
3591 for (i = 0;i < exec->maxRollbacks;i++)
3592 if (exec->rollbacks[i].counts != NULL)
3593 xmlFree(exec->rollbacks[i].counts);
3594 }
3595 xmlFree(exec->rollbacks);
3596 }
3597 if (exec->counts != NULL)
3598 xmlFree(exec->counts);
3599 if (exec->inputStack != NULL) {
3600 int i;
3601
3602 for (i = 0;i < exec->inputStackNr;i++) {
3603 if (exec->inputStack[i].value != NULL)
3604 xmlFree(exec->inputStack[i].value);
3605 }
3606 xmlFree(exec->inputStack);
3607 }
3608 if (exec->errString != NULL)
3609 xmlFree(exec->errString);
3610 xmlFree(exec);
3611 }
3612
3613 static void
xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec,const xmlChar * value,void * data)3614 xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
3615 void *data) {
3616 #ifdef DEBUG_PUSH
3617 printf("saving value: %d:%s\n", exec->inputStackNr, value);
3618 #endif
3619 if (exec->inputStackMax == 0) {
3620 exec->inputStackMax = 4;
3621 exec->inputStack = (xmlRegInputTokenPtr)
3622 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken));
3623 if (exec->inputStack == NULL) {
3624 xmlRegexpErrMemory(NULL, "pushing input string");
3625 exec->inputStackMax = 0;
3626 return;
3627 }
3628 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) {
3629 xmlRegInputTokenPtr tmp;
3630
3631 exec->inputStackMax *= 2;
3632 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack,
3633 exec->inputStackMax * sizeof(xmlRegInputToken));
3634 if (tmp == NULL) {
3635 xmlRegexpErrMemory(NULL, "pushing input string");
3636 exec->inputStackMax /= 2;
3637 return;
3638 }
3639 exec->inputStack = tmp;
3640 }
3641 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
3642 exec->inputStack[exec->inputStackNr].data = data;
3643 exec->inputStackNr++;
3644 exec->inputStack[exec->inputStackNr].value = NULL;
3645 exec->inputStack[exec->inputStackNr].data = NULL;
3646 }
3647
3648 /**
3649 * xmlRegStrEqualWildcard:
3650 * @expStr: the string to be evaluated
3651 * @valStr: the validation string
3652 *
3653 * Checks if both strings are equal or have the same content. "*"
3654 * can be used as a wildcard in @valStr; "|" is used as a separator of
3655 * substrings in both @expStr and @valStr.
3656 *
3657 * Returns 1 if the comparison is satisfied and the number of substrings
3658 * is equal, 0 otherwise.
3659 */
3660
3661 static int
xmlRegStrEqualWildcard(const xmlChar * expStr,const xmlChar * valStr)3662 xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) {
3663 if (expStr == valStr) return(1);
3664 if (expStr == NULL) return(0);
3665 if (valStr == NULL) return(0);
3666 do {
3667 /*
3668 * Eval if we have a wildcard for the current item.
3669 */
3670 if (*expStr != *valStr) {
3671 /* if one of them starts with a wildcard make valStr be it */
3672 if (*valStr == '*') {
3673 const xmlChar *tmp;
3674
3675 tmp = valStr;
3676 valStr = expStr;
3677 expStr = tmp;
3678 }
3679 if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) {
3680 do {
3681 if (*valStr == XML_REG_STRING_SEPARATOR)
3682 break;
3683 valStr++;
3684 } while (*valStr != 0);
3685 continue;
3686 } else
3687 return(0);
3688 }
3689 expStr++;
3690 valStr++;
3691 } while (*valStr != 0);
3692 if (*expStr != 0)
3693 return (0);
3694 else
3695 return (1);
3696 }
3697
3698 /**
3699 * xmlRegCompactPushString:
3700 * @exec: a regexp execution context
3701 * @comp: the precompiled exec with a compact table
3702 * @value: a string token input
3703 * @data: data associated to the token to reuse in callbacks
3704 *
3705 * Push one input token in the execution context
3706 *
3707 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3708 * a negative value in case of error.
3709 */
3710 static int
xmlRegCompactPushString(xmlRegExecCtxtPtr exec,xmlRegexpPtr comp,const xmlChar * value,void * data)3711 xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
3712 xmlRegexpPtr comp,
3713 const xmlChar *value,
3714 void *data) {
3715 int state = exec->index;
3716 int i, target;
3717
3718 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
3719 return(-1);
3720
3721 if (value == NULL) {
3722 /*
3723 * are we at a final state ?
3724 */
3725 if (comp->compact[state * (comp->nbstrings + 1)] ==
3726 XML_REGEXP_FINAL_STATE)
3727 return(1);
3728 return(0);
3729 }
3730
3731 #ifdef DEBUG_PUSH
3732 printf("value pushed: %s\n", value);
3733 #endif
3734
3735 /*
3736 * Examine all outside transitions from current state
3737 */
3738 for (i = 0;i < comp->nbstrings;i++) {
3739 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
3740 if ((target > 0) && (target <= comp->nbstates)) {
3741 target--; /* to avoid 0 */
3742 if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) {
3743 exec->index = target;
3744 if ((exec->callback != NULL) && (comp->transdata != NULL)) {
3745 exec->callback(exec->data, value,
3746 comp->transdata[state * comp->nbstrings + i], data);
3747 }
3748 #ifdef DEBUG_PUSH
3749 printf("entering state %d\n", target);
3750 #endif
3751 if (comp->compact[target * (comp->nbstrings + 1)] ==
3752 XML_REGEXP_SINK_STATE)
3753 goto error;
3754
3755 if (comp->compact[target * (comp->nbstrings + 1)] ==
3756 XML_REGEXP_FINAL_STATE)
3757 return(1);
3758 return(0);
3759 }
3760 }
3761 }
3762 /*
3763 * Failed to find an exit transition out from current state for the
3764 * current token
3765 */
3766 #ifdef DEBUG_PUSH
3767 printf("failed to find a transition for %s on state %d\n", value, state);
3768 #endif
3769 error:
3770 if (exec->errString != NULL)
3771 xmlFree(exec->errString);
3772 exec->errString = xmlStrdup(value);
3773 exec->errStateNo = state;
3774 exec->status = -1;
3775 #ifdef DEBUG_ERR
3776 testerr(exec);
3777 #endif
3778 return(-1);
3779 }
3780
3781 /**
3782 * xmlRegExecPushStringInternal:
3783 * @exec: a regexp execution context or NULL to indicate the end
3784 * @value: a string token input
3785 * @data: data associated to the token to reuse in callbacks
3786 * @compound: value was assembled from 2 strings
3787 *
3788 * Push one input token in the execution context
3789 *
3790 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3791 * a negative value in case of error.
3792 */
3793 static int
xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec,const xmlChar * value,void * data,int compound)3794 xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value,
3795 void *data, int compound) {
3796 xmlRegTransPtr trans;
3797 xmlRegAtomPtr atom;
3798 int ret;
3799 int final = 0;
3800 int progress = 1;
3801
3802 if (exec == NULL)
3803 return(-1);
3804 if (exec->comp == NULL)
3805 return(-1);
3806 if (exec->status != 0)
3807 return(exec->status);
3808
3809 if (exec->comp->compact != NULL)
3810 return(xmlRegCompactPushString(exec, exec->comp, value, data));
3811
3812 if (value == NULL) {
3813 if (exec->state->type == XML_REGEXP_FINAL_STATE)
3814 return(1);
3815 final = 1;
3816 }
3817
3818 #ifdef DEBUG_PUSH
3819 printf("value pushed: %s\n", value);
3820 #endif
3821 /*
3822 * If we have an active rollback stack push the new value there
3823 * and get back to where we were left
3824 */
3825 if ((value != NULL) && (exec->inputStackNr > 0)) {
3826 xmlFARegExecSaveInputString(exec, value, data);
3827 value = exec->inputStack[exec->index].value;
3828 data = exec->inputStack[exec->index].data;
3829 #ifdef DEBUG_PUSH
3830 printf("value loaded: %s\n", value);
3831 #endif
3832 }
3833
3834 while ((exec->status == 0) &&
3835 ((value != NULL) ||
3836 ((final == 1) &&
3837 (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3838
3839 /*
3840 * End of input on non-terminal state, rollback, however we may
3841 * still have epsilon like transition for counted transitions
3842 * on counters, in that case don't break too early.
3843 */
3844 if ((value == NULL) && (exec->counts == NULL))
3845 goto rollback;
3846
3847 exec->transcount = 0;
3848 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3849 trans = &exec->state->trans[exec->transno];
3850 if (trans->to < 0)
3851 continue;
3852 atom = trans->atom;
3853 ret = 0;
3854 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
3855 int i;
3856 int count;
3857 xmlRegTransPtr t;
3858 xmlRegCounterPtr counter;
3859
3860 ret = 0;
3861
3862 #ifdef DEBUG_PUSH
3863 printf("testing all lax %d\n", trans->count);
3864 #endif
3865 /*
3866 * Check all counted transitions from the current state
3867 */
3868 if ((value == NULL) && (final)) {
3869 ret = 1;
3870 } else if (value != NULL) {
3871 for (i = 0;i < exec->state->nbTrans;i++) {
3872 t = &exec->state->trans[i];
3873 if ((t->counter < 0) || (t == trans))
3874 continue;
3875 counter = &exec->comp->counters[t->counter];
3876 count = exec->counts[t->counter];
3877 if ((count < counter->max) &&
3878 (t->atom != NULL) &&
3879 (xmlStrEqual(value, t->atom->valuep))) {
3880 ret = 0;
3881 break;
3882 }
3883 if ((count >= counter->min) &&
3884 (count < counter->max) &&
3885 (t->atom != NULL) &&
3886 (xmlStrEqual(value, t->atom->valuep))) {
3887 ret = 1;
3888 break;
3889 }
3890 }
3891 }
3892 } else if (trans->count == REGEXP_ALL_COUNTER) {
3893 int i;
3894 int count;
3895 xmlRegTransPtr t;
3896 xmlRegCounterPtr counter;
3897
3898 ret = 1;
3899
3900 #ifdef DEBUG_PUSH
3901 printf("testing all %d\n", trans->count);
3902 #endif
3903 /*
3904 * Check all counted transitions from the current state
3905 */
3906 for (i = 0;i < exec->state->nbTrans;i++) {
3907 t = &exec->state->trans[i];
3908 if ((t->counter < 0) || (t == trans))
3909 continue;
3910 counter = &exec->comp->counters[t->counter];
3911 count = exec->counts[t->counter];
3912 if ((count < counter->min) || (count > counter->max)) {
3913 ret = 0;
3914 break;
3915 }
3916 }
3917 } else if (trans->count >= 0) {
3918 int count;
3919 xmlRegCounterPtr counter;
3920
3921 /*
3922 * A counted transition.
3923 */
3924
3925 count = exec->counts[trans->count];
3926 counter = &exec->comp->counters[trans->count];
3927 #ifdef DEBUG_PUSH
3928 printf("testing count %d: val %d, min %d, max %d\n",
3929 trans->count, count, counter->min, counter->max);
3930 #endif
3931 ret = ((count >= counter->min) && (count <= counter->max));
3932 } else if (atom == NULL) {
3933 fprintf(stderr, "epsilon transition left at runtime\n");
3934 exec->status = -2;
3935 break;
3936 } else if (value != NULL) {
3937 ret = xmlRegStrEqualWildcard(atom->valuep, value);
3938 if (atom->neg) {
3939 ret = !ret;
3940 if (!compound)
3941 ret = 0;
3942 }
3943 if ((ret == 1) && (trans->counter >= 0)) {
3944 xmlRegCounterPtr counter;
3945 int count;
3946
3947 count = exec->counts[trans->counter];
3948 counter = &exec->comp->counters[trans->counter];
3949 if (count >= counter->max)
3950 ret = 0;
3951 }
3952
3953 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
3954 xmlRegStatePtr to = exec->comp->states[trans->to];
3955
3956 /*
3957 * this is a multiple input sequence
3958 */
3959 if (exec->state->nbTrans > exec->transno + 1) {
3960 if (exec->inputStackNr <= 0) {
3961 xmlFARegExecSaveInputString(exec, value, data);
3962 }
3963 xmlFARegExecSave(exec);
3964 }
3965 exec->transcount = 1;
3966 do {
3967 /*
3968 * Try to progress as much as possible on the input
3969 */
3970 if (exec->transcount == atom->max) {
3971 break;
3972 }
3973 exec->index++;
3974 value = exec->inputStack[exec->index].value;
3975 data = exec->inputStack[exec->index].data;
3976 #ifdef DEBUG_PUSH
3977 printf("value loaded: %s\n", value);
3978 #endif
3979
3980 /*
3981 * End of input: stop here
3982 */
3983 if (value == NULL) {
3984 exec->index --;
3985 break;
3986 }
3987 if (exec->transcount >= atom->min) {
3988 int transno = exec->transno;
3989 xmlRegStatePtr state = exec->state;
3990
3991 /*
3992 * The transition is acceptable save it
3993 */
3994 exec->transno = -1; /* trick */
3995 exec->state = to;
3996 if (exec->inputStackNr <= 0) {
3997 xmlFARegExecSaveInputString(exec, value, data);
3998 }
3999 xmlFARegExecSave(exec);
4000 exec->transno = transno;
4001 exec->state = state;
4002 }
4003 ret = xmlStrEqual(value, atom->valuep);
4004 exec->transcount++;
4005 } while (ret == 1);
4006 if (exec->transcount < atom->min)
4007 ret = 0;
4008
4009 /*
4010 * If the last check failed but one transition was found
4011 * possible, rollback
4012 */
4013 if (ret < 0)
4014 ret = 0;
4015 if (ret == 0) {
4016 goto rollback;
4017 }
4018 }
4019 }
4020 if (ret == 1) {
4021 if ((exec->callback != NULL) && (atom != NULL) &&
4022 (data != NULL)) {
4023 exec->callback(exec->data, atom->valuep,
4024 atom->data, data);
4025 }
4026 if (exec->state->nbTrans > exec->transno + 1) {
4027 if (exec->inputStackNr <= 0) {
4028 xmlFARegExecSaveInputString(exec, value, data);
4029 }
4030 xmlFARegExecSave(exec);
4031 }
4032 if (trans->counter >= 0) {
4033 #ifdef DEBUG_PUSH
4034 printf("Increasing count %d\n", trans->counter);
4035 #endif
4036 exec->counts[trans->counter]++;
4037 }
4038 if ((trans->count >= 0) &&
4039 (trans->count < REGEXP_ALL_COUNTER)) {
4040 #ifdef DEBUG_REGEXP_EXEC
4041 printf("resetting count %d on transition\n",
4042 trans->count);
4043 #endif
4044 exec->counts[trans->count] = 0;
4045 }
4046 #ifdef DEBUG_PUSH
4047 printf("entering state %d\n", trans->to);
4048 #endif
4049 if ((exec->comp->states[trans->to] != NULL) &&
4050 (exec->comp->states[trans->to]->type ==
4051 XML_REGEXP_SINK_STATE)) {
4052 /*
4053 * entering a sink state, save the current state as error
4054 * state.
4055 */
4056 if (exec->errString != NULL)
4057 xmlFree(exec->errString);
4058 exec->errString = xmlStrdup(value);
4059 exec->errState = exec->state;
4060 memcpy(exec->errCounts, exec->counts,
4061 exec->comp->nbCounters * sizeof(int));
4062 }
4063 exec->state = exec->comp->states[trans->to];
4064 exec->transno = 0;
4065 if (trans->atom != NULL) {
4066 if (exec->inputStack != NULL) {
4067 exec->index++;
4068 if (exec->index < exec->inputStackNr) {
4069 value = exec->inputStack[exec->index].value;
4070 data = exec->inputStack[exec->index].data;
4071 #ifdef DEBUG_PUSH
4072 printf("value loaded: %s\n", value);
4073 #endif
4074 } else {
4075 value = NULL;
4076 data = NULL;
4077 #ifdef DEBUG_PUSH
4078 printf("end of input\n");
4079 #endif
4080 }
4081 } else {
4082 value = NULL;
4083 data = NULL;
4084 #ifdef DEBUG_PUSH
4085 printf("end of input\n");
4086 #endif
4087 }
4088 }
4089 goto progress;
4090 } else if (ret < 0) {
4091 exec->status = -4;
4092 break;
4093 }
4094 }
4095 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
4096 rollback:
4097 /*
4098 * if we didn't yet rollback on the current input
4099 * store the current state as the error state.
4100 */
4101 if ((progress) && (exec->state != NULL) &&
4102 (exec->state->type != XML_REGEXP_SINK_STATE)) {
4103 progress = 0;
4104 if (exec->errString != NULL)
4105 xmlFree(exec->errString);
4106 exec->errString = xmlStrdup(value);
4107 exec->errState = exec->state;
4108 if (exec->comp->nbCounters)
4109 memcpy(exec->errCounts, exec->counts,
4110 exec->comp->nbCounters * sizeof(int));
4111 }
4112
4113 /*
4114 * Failed to find a way out
4115 */
4116 exec->determinist = 0;
4117 xmlFARegExecRollBack(exec);
4118 if ((exec->inputStack != NULL ) && (exec->status == 0)) {
4119 value = exec->inputStack[exec->index].value;
4120 data = exec->inputStack[exec->index].data;
4121 #ifdef DEBUG_PUSH
4122 printf("value loaded: %s\n", value);
4123 #endif
4124 }
4125 }
4126 continue;
4127 progress:
4128 progress = 1;
4129 continue;
4130 }
4131 if (exec->status == 0) {
4132 return(exec->state->type == XML_REGEXP_FINAL_STATE);
4133 }
4134 #ifdef DEBUG_ERR
4135 if (exec->status < 0) {
4136 testerr(exec);
4137 }
4138 #endif
4139 return(exec->status);
4140 }
4141
4142 /**
4143 * xmlRegExecPushString:
4144 * @exec: a regexp execution context or NULL to indicate the end
4145 * @value: a string token input
4146 * @data: data associated to the token to reuse in callbacks
4147 *
4148 * Push one input token in the execution context
4149 *
4150 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4151 * a negative value in case of error.
4152 */
4153 int
xmlRegExecPushString(xmlRegExecCtxtPtr exec,const xmlChar * value,void * data)4154 xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
4155 void *data) {
4156 return(xmlRegExecPushStringInternal(exec, value, data, 0));
4157 }
4158
4159 /**
4160 * xmlRegExecPushString2:
4161 * @exec: a regexp execution context or NULL to indicate the end
4162 * @value: the first string token input
4163 * @value2: the second string token input
4164 * @data: data associated to the token to reuse in callbacks
4165 *
4166 * Push one input token in the execution context
4167 *
4168 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4169 * a negative value in case of error.
4170 */
4171 int
xmlRegExecPushString2(xmlRegExecCtxtPtr exec,const xmlChar * value,const xmlChar * value2,void * data)4172 xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
4173 const xmlChar *value2, void *data) {
4174 xmlChar buf[150];
4175 int lenn, lenp, ret;
4176 xmlChar *str;
4177
4178 if (exec == NULL)
4179 return(-1);
4180 if (exec->comp == NULL)
4181 return(-1);
4182 if (exec->status != 0)
4183 return(exec->status);
4184
4185 if (value2 == NULL)
4186 return(xmlRegExecPushString(exec, value, data));
4187
4188 lenn = strlen((char *) value2);
4189 lenp = strlen((char *) value);
4190
4191 if (150 < lenn + lenp + 2) {
4192 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
4193 if (str == NULL) {
4194 exec->status = -1;
4195 return(-1);
4196 }
4197 } else {
4198 str = buf;
4199 }
4200 memcpy(&str[0], value, lenp);
4201 str[lenp] = XML_REG_STRING_SEPARATOR;
4202 memcpy(&str[lenp + 1], value2, lenn);
4203 str[lenn + lenp + 1] = 0;
4204
4205 if (exec->comp->compact != NULL)
4206 ret = xmlRegCompactPushString(exec, exec->comp, str, data);
4207 else
4208 ret = xmlRegExecPushStringInternal(exec, str, data, 1);
4209
4210 if (str != buf)
4211 xmlFree(str);
4212 return(ret);
4213 }
4214
4215 /**
4216 * xmlRegExecGetValues:
4217 * @exec: a regexp execution context
4218 * @err: error extraction or normal one
4219 * @nbval: pointer to the number of accepted values IN/OUT
4220 * @nbneg: return number of negative transitions
4221 * @values: pointer to the array of acceptable values
4222 * @terminal: return value if this was a terminal state
4223 *
4224 * Extract informations from the regexp execution, internal routine to
4225 * implement xmlRegExecNextValues() and xmlRegExecErrInfo()
4226 *
4227 * Returns: 0 in case of success or -1 in case of error.
4228 */
4229 static int
xmlRegExecGetValues(xmlRegExecCtxtPtr exec,int err,int * nbval,int * nbneg,xmlChar ** values,int * terminal)4230 xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err,
4231 int *nbval, int *nbneg,
4232 xmlChar **values, int *terminal) {
4233 int maxval;
4234 int nb = 0;
4235
4236 if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) ||
4237 (values == NULL) || (*nbval <= 0))
4238 return(-1);
4239
4240 maxval = *nbval;
4241 *nbval = 0;
4242 *nbneg = 0;
4243 if ((exec->comp != NULL) && (exec->comp->compact != NULL)) {
4244 xmlRegexpPtr comp;
4245 int target, i, state;
4246
4247 comp = exec->comp;
4248
4249 if (err) {
4250 if (exec->errStateNo == -1) return(-1);
4251 state = exec->errStateNo;
4252 } else {
4253 state = exec->index;
4254 }
4255 if (terminal != NULL) {
4256 if (comp->compact[state * (comp->nbstrings + 1)] ==
4257 XML_REGEXP_FINAL_STATE)
4258 *terminal = 1;
4259 else
4260 *terminal = 0;
4261 }
4262 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4263 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4264 if ((target > 0) && (target <= comp->nbstates) &&
4265 (comp->compact[(target - 1) * (comp->nbstrings + 1)] !=
4266 XML_REGEXP_SINK_STATE)) {
4267 values[nb++] = comp->stringMap[i];
4268 (*nbval)++;
4269 }
4270 }
4271 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4272 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4273 if ((target > 0) && (target <= comp->nbstates) &&
4274 (comp->compact[(target - 1) * (comp->nbstrings + 1)] ==
4275 XML_REGEXP_SINK_STATE)) {
4276 values[nb++] = comp->stringMap[i];
4277 (*nbneg)++;
4278 }
4279 }
4280 } else {
4281 int transno;
4282 xmlRegTransPtr trans;
4283 xmlRegAtomPtr atom;
4284 xmlRegStatePtr state;
4285
4286 if (terminal != NULL) {
4287 if (exec->state->type == XML_REGEXP_FINAL_STATE)
4288 *terminal = 1;
4289 else
4290 *terminal = 0;
4291 }
4292
4293 if (err) {
4294 if (exec->errState == NULL) return(-1);
4295 state = exec->errState;
4296 } else {
4297 if (exec->state == NULL) return(-1);
4298 state = exec->state;
4299 }
4300 for (transno = 0;
4301 (transno < state->nbTrans) && (nb < maxval);
4302 transno++) {
4303 trans = &state->trans[transno];
4304 if (trans->to < 0)
4305 continue;
4306 atom = trans->atom;
4307 if ((atom == NULL) || (atom->valuep == NULL))
4308 continue;
4309 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4310 /* this should not be reached but ... */
4311 TODO;
4312 } else if (trans->count == REGEXP_ALL_COUNTER) {
4313 /* this should not be reached but ... */
4314 TODO;
4315 } else if (trans->counter >= 0) {
4316 xmlRegCounterPtr counter = NULL;
4317 int count;
4318
4319 if (err)
4320 count = exec->errCounts[trans->counter];
4321 else
4322 count = exec->counts[trans->counter];
4323 if (exec->comp != NULL)
4324 counter = &exec->comp->counters[trans->counter];
4325 if ((counter == NULL) || (count < counter->max)) {
4326 if (atom->neg)
4327 values[nb++] = (xmlChar *) atom->valuep2;
4328 else
4329 values[nb++] = (xmlChar *) atom->valuep;
4330 (*nbval)++;
4331 }
4332 } else {
4333 if ((exec->comp != NULL) && (exec->comp->states[trans->to] != NULL) &&
4334 (exec->comp->states[trans->to]->type !=
4335 XML_REGEXP_SINK_STATE)) {
4336 if (atom->neg)
4337 values[nb++] = (xmlChar *) atom->valuep2;
4338 else
4339 values[nb++] = (xmlChar *) atom->valuep;
4340 (*nbval)++;
4341 }
4342 }
4343 }
4344 for (transno = 0;
4345 (transno < state->nbTrans) && (nb < maxval);
4346 transno++) {
4347 trans = &state->trans[transno];
4348 if (trans->to < 0)
4349 continue;
4350 atom = trans->atom;
4351 if ((atom == NULL) || (atom->valuep == NULL))
4352 continue;
4353 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4354 continue;
4355 } else if (trans->count == REGEXP_ALL_COUNTER) {
4356 continue;
4357 } else if (trans->counter >= 0) {
4358 continue;
4359 } else {
4360 if ((exec->comp->states[trans->to] != NULL) &&
4361 (exec->comp->states[trans->to]->type ==
4362 XML_REGEXP_SINK_STATE)) {
4363 if (atom->neg)
4364 values[nb++] = (xmlChar *) atom->valuep2;
4365 else
4366 values[nb++] = (xmlChar *) atom->valuep;
4367 (*nbneg)++;
4368 }
4369 }
4370 }
4371 }
4372 return(0);
4373 }
4374
4375 /**
4376 * xmlRegExecNextValues:
4377 * @exec: a regexp execution context
4378 * @nbval: pointer to the number of accepted values IN/OUT
4379 * @nbneg: return number of negative transitions
4380 * @values: pointer to the array of acceptable values
4381 * @terminal: return value if this was a terminal state
4382 *
4383 * Extract informations from the regexp execution,
4384 * the parameter @values must point to an array of @nbval string pointers
4385 * on return nbval will contain the number of possible strings in that
4386 * state and the @values array will be updated with them. The string values
4387 * returned will be freed with the @exec context and don't need to be
4388 * deallocated.
4389 *
4390 * Returns: 0 in case of success or -1 in case of error.
4391 */
4392 int
xmlRegExecNextValues(xmlRegExecCtxtPtr exec,int * nbval,int * nbneg,xmlChar ** values,int * terminal)4393 xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg,
4394 xmlChar **values, int *terminal) {
4395 return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal));
4396 }
4397
4398 /**
4399 * xmlRegExecErrInfo:
4400 * @exec: a regexp execution context generating an error
4401 * @string: return value for the error string
4402 * @nbval: pointer to the number of accepted values IN/OUT
4403 * @nbneg: return number of negative transitions
4404 * @values: pointer to the array of acceptable values
4405 * @terminal: return value if this was a terminal state
4406 *
4407 * Extract error informations from the regexp execution, the parameter
4408 * @string will be updated with the value pushed and not accepted,
4409 * the parameter @values must point to an array of @nbval string pointers
4410 * on return nbval will contain the number of possible strings in that
4411 * state and the @values array will be updated with them. The string values
4412 * returned will be freed with the @exec context and don't need to be
4413 * deallocated.
4414 *
4415 * Returns: 0 in case of success or -1 in case of error.
4416 */
4417 int
xmlRegExecErrInfo(xmlRegExecCtxtPtr exec,const xmlChar ** string,int * nbval,int * nbneg,xmlChar ** values,int * terminal)4418 xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string,
4419 int *nbval, int *nbneg, xmlChar **values, int *terminal) {
4420 if (exec == NULL)
4421 return(-1);
4422 if (string != NULL) {
4423 if (exec->status != 0)
4424 *string = exec->errString;
4425 else
4426 *string = NULL;
4427 }
4428 return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal));
4429 }
4430
4431 #ifdef DEBUG_ERR
testerr(xmlRegExecCtxtPtr exec)4432 static void testerr(xmlRegExecCtxtPtr exec) {
4433 const xmlChar *string;
4434 xmlChar *values[5];
4435 int nb = 5;
4436 int nbneg;
4437 int terminal;
4438 xmlRegExecErrInfo(exec, &string, &nb, &nbneg, &values[0], &terminal);
4439 }
4440 #endif
4441
4442 #if 0
4443 static int
4444 xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
4445 xmlRegTransPtr trans;
4446 xmlRegAtomPtr atom;
4447 int ret;
4448 int codepoint, len;
4449
4450 if (exec == NULL)
4451 return(-1);
4452 if (exec->status != 0)
4453 return(exec->status);
4454
4455 while ((exec->status == 0) &&
4456 ((exec->inputString[exec->index] != 0) ||
4457 (exec->state->type != XML_REGEXP_FINAL_STATE))) {
4458
4459 /*
4460 * End of input on non-terminal state, rollback, however we may
4461 * still have epsilon like transition for counted transitions
4462 * on counters, in that case don't break too early.
4463 */
4464 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
4465 goto rollback;
4466
4467 exec->transcount = 0;
4468 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
4469 trans = &exec->state->trans[exec->transno];
4470 if (trans->to < 0)
4471 continue;
4472 atom = trans->atom;
4473 ret = 0;
4474 if (trans->count >= 0) {
4475 int count;
4476 xmlRegCounterPtr counter;
4477
4478 /*
4479 * A counted transition.
4480 */
4481
4482 count = exec->counts[trans->count];
4483 counter = &exec->comp->counters[trans->count];
4484 #ifdef DEBUG_REGEXP_EXEC
4485 printf("testing count %d: val %d, min %d, max %d\n",
4486 trans->count, count, counter->min, counter->max);
4487 #endif
4488 ret = ((count >= counter->min) && (count <= counter->max));
4489 } else if (atom == NULL) {
4490 fprintf(stderr, "epsilon transition left at runtime\n");
4491 exec->status = -2;
4492 break;
4493 } else if (exec->inputString[exec->index] != 0) {
4494 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
4495 ret = xmlRegCheckCharacter(atom, codepoint);
4496 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
4497 xmlRegStatePtr to = exec->comp->states[trans->to];
4498
4499 /*
4500 * this is a multiple input sequence
4501 */
4502 if (exec->state->nbTrans > exec->transno + 1) {
4503 xmlFARegExecSave(exec);
4504 }
4505 exec->transcount = 1;
4506 do {
4507 /*
4508 * Try to progress as much as possible on the input
4509 */
4510 if (exec->transcount == atom->max) {
4511 break;
4512 }
4513 exec->index += len;
4514 /*
4515 * End of input: stop here
4516 */
4517 if (exec->inputString[exec->index] == 0) {
4518 exec->index -= len;
4519 break;
4520 }
4521 if (exec->transcount >= atom->min) {
4522 int transno = exec->transno;
4523 xmlRegStatePtr state = exec->state;
4524
4525 /*
4526 * The transition is acceptable save it
4527 */
4528 exec->transno = -1; /* trick */
4529 exec->state = to;
4530 xmlFARegExecSave(exec);
4531 exec->transno = transno;
4532 exec->state = state;
4533 }
4534 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
4535 len);
4536 ret = xmlRegCheckCharacter(atom, codepoint);
4537 exec->transcount++;
4538 } while (ret == 1);
4539 if (exec->transcount < atom->min)
4540 ret = 0;
4541
4542 /*
4543 * If the last check failed but one transition was found
4544 * possible, rollback
4545 */
4546 if (ret < 0)
4547 ret = 0;
4548 if (ret == 0) {
4549 goto rollback;
4550 }
4551 }
4552 }
4553 if (ret == 1) {
4554 if (exec->state->nbTrans > exec->transno + 1) {
4555 xmlFARegExecSave(exec);
4556 }
4557 /*
4558 * restart count for expressions like this ((abc){2})*
4559 */
4560 if (trans->count >= 0) {
4561 #ifdef DEBUG_REGEXP_EXEC
4562 printf("Reset count %d\n", trans->count);
4563 #endif
4564 exec->counts[trans->count] = 0;
4565 }
4566 if (trans->counter >= 0) {
4567 #ifdef DEBUG_REGEXP_EXEC
4568 printf("Increasing count %d\n", trans->counter);
4569 #endif
4570 exec->counts[trans->counter]++;
4571 }
4572 #ifdef DEBUG_REGEXP_EXEC
4573 printf("entering state %d\n", trans->to);
4574 #endif
4575 exec->state = exec->comp->states[trans->to];
4576 exec->transno = 0;
4577 if (trans->atom != NULL) {
4578 exec->index += len;
4579 }
4580 goto progress;
4581 } else if (ret < 0) {
4582 exec->status = -4;
4583 break;
4584 }
4585 }
4586 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
4587 rollback:
4588 /*
4589 * Failed to find a way out
4590 */
4591 exec->determinist = 0;
4592 xmlFARegExecRollBack(exec);
4593 }
4594 progress:
4595 continue;
4596 }
4597 }
4598 #endif
4599 /************************************************************************
4600 * *
4601 * Parser for the Schemas Datatype Regular Expressions *
4602 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs *
4603 * *
4604 ************************************************************************/
4605
4606 /**
4607 * xmlFAIsChar:
4608 * @ctxt: a regexp parser context
4609 *
4610 * [10] Char ::= [^.\?*+()|#x5B#x5D]
4611 */
4612 static int
xmlFAIsChar(xmlRegParserCtxtPtr ctxt)4613 xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
4614 int cur;
4615 int len;
4616
4617 cur = CUR_SCHAR(ctxt->cur, len);
4618 if ((cur == '.') || (cur == '\\') || (cur == '?') ||
4619 (cur == '*') || (cur == '+') || (cur == '(') ||
4620 (cur == ')') || (cur == '|') || (cur == 0x5B) ||
4621 (cur == 0x5D) || (cur == 0))
4622 return(-1);
4623 return(cur);
4624 }
4625
4626 /**
4627 * xmlFAParseCharProp:
4628 * @ctxt: a regexp parser context
4629 *
4630 * [27] charProp ::= IsCategory | IsBlock
4631 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation |
4632 * Separators | Symbols | Others
4633 * [29] Letters ::= 'L' [ultmo]?
4634 * [30] Marks ::= 'M' [nce]?
4635 * [31] Numbers ::= 'N' [dlo]?
4636 * [32] Punctuation ::= 'P' [cdseifo]?
4637 * [33] Separators ::= 'Z' [slp]?
4638 * [34] Symbols ::= 'S' [mcko]?
4639 * [35] Others ::= 'C' [cfon]?
4640 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+
4641 */
4642 static void
xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt)4643 xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
4644 int cur;
4645 xmlRegAtomType type = (xmlRegAtomType) 0;
4646 xmlChar *blockName = NULL;
4647
4648 cur = CUR;
4649 if (cur == 'L') {
4650 NEXT;
4651 cur = CUR;
4652 if (cur == 'u') {
4653 NEXT;
4654 type = XML_REGEXP_LETTER_UPPERCASE;
4655 } else if (cur == 'l') {
4656 NEXT;
4657 type = XML_REGEXP_LETTER_LOWERCASE;
4658 } else if (cur == 't') {
4659 NEXT;
4660 type = XML_REGEXP_LETTER_TITLECASE;
4661 } else if (cur == 'm') {
4662 NEXT;
4663 type = XML_REGEXP_LETTER_MODIFIER;
4664 } else if (cur == 'o') {
4665 NEXT;
4666 type = XML_REGEXP_LETTER_OTHERS;
4667 } else {
4668 type = XML_REGEXP_LETTER;
4669 }
4670 } else if (cur == 'M') {
4671 NEXT;
4672 cur = CUR;
4673 if (cur == 'n') {
4674 NEXT;
4675 /* nonspacing */
4676 type = XML_REGEXP_MARK_NONSPACING;
4677 } else if (cur == 'c') {
4678 NEXT;
4679 /* spacing combining */
4680 type = XML_REGEXP_MARK_SPACECOMBINING;
4681 } else if (cur == 'e') {
4682 NEXT;
4683 /* enclosing */
4684 type = XML_REGEXP_MARK_ENCLOSING;
4685 } else {
4686 /* all marks */
4687 type = XML_REGEXP_MARK;
4688 }
4689 } else if (cur == 'N') {
4690 NEXT;
4691 cur = CUR;
4692 if (cur == 'd') {
4693 NEXT;
4694 /* digital */
4695 type = XML_REGEXP_NUMBER_DECIMAL;
4696 } else if (cur == 'l') {
4697 NEXT;
4698 /* letter */
4699 type = XML_REGEXP_NUMBER_LETTER;
4700 } else if (cur == 'o') {
4701 NEXT;
4702 /* other */
4703 type = XML_REGEXP_NUMBER_OTHERS;
4704 } else {
4705 /* all numbers */
4706 type = XML_REGEXP_NUMBER;
4707 }
4708 } else if (cur == 'P') {
4709 NEXT;
4710 cur = CUR;
4711 if (cur == 'c') {
4712 NEXT;
4713 /* connector */
4714 type = XML_REGEXP_PUNCT_CONNECTOR;
4715 } else if (cur == 'd') {
4716 NEXT;
4717 /* dash */
4718 type = XML_REGEXP_PUNCT_DASH;
4719 } else if (cur == 's') {
4720 NEXT;
4721 /* open */
4722 type = XML_REGEXP_PUNCT_OPEN;
4723 } else if (cur == 'e') {
4724 NEXT;
4725 /* close */
4726 type = XML_REGEXP_PUNCT_CLOSE;
4727 } else if (cur == 'i') {
4728 NEXT;
4729 /* initial quote */
4730 type = XML_REGEXP_PUNCT_INITQUOTE;
4731 } else if (cur == 'f') {
4732 NEXT;
4733 /* final quote */
4734 type = XML_REGEXP_PUNCT_FINQUOTE;
4735 } else if (cur == 'o') {
4736 NEXT;
4737 /* other */
4738 type = XML_REGEXP_PUNCT_OTHERS;
4739 } else {
4740 /* all punctuation */
4741 type = XML_REGEXP_PUNCT;
4742 }
4743 } else if (cur == 'Z') {
4744 NEXT;
4745 cur = CUR;
4746 if (cur == 's') {
4747 NEXT;
4748 /* space */
4749 type = XML_REGEXP_SEPAR_SPACE;
4750 } else if (cur == 'l') {
4751 NEXT;
4752 /* line */
4753 type = XML_REGEXP_SEPAR_LINE;
4754 } else if (cur == 'p') {
4755 NEXT;
4756 /* paragraph */
4757 type = XML_REGEXP_SEPAR_PARA;
4758 } else {
4759 /* all separators */
4760 type = XML_REGEXP_SEPAR;
4761 }
4762 } else if (cur == 'S') {
4763 NEXT;
4764 cur = CUR;
4765 if (cur == 'm') {
4766 NEXT;
4767 type = XML_REGEXP_SYMBOL_MATH;
4768 /* math */
4769 } else if (cur == 'c') {
4770 NEXT;
4771 type = XML_REGEXP_SYMBOL_CURRENCY;
4772 /* currency */
4773 } else if (cur == 'k') {
4774 NEXT;
4775 type = XML_REGEXP_SYMBOL_MODIFIER;
4776 /* modifiers */
4777 } else if (cur == 'o') {
4778 NEXT;
4779 type = XML_REGEXP_SYMBOL_OTHERS;
4780 /* other */
4781 } else {
4782 /* all symbols */
4783 type = XML_REGEXP_SYMBOL;
4784 }
4785 } else if (cur == 'C') {
4786 NEXT;
4787 cur = CUR;
4788 if (cur == 'c') {
4789 NEXT;
4790 /* control */
4791 type = XML_REGEXP_OTHER_CONTROL;
4792 } else if (cur == 'f') {
4793 NEXT;
4794 /* format */
4795 type = XML_REGEXP_OTHER_FORMAT;
4796 } else if (cur == 'o') {
4797 NEXT;
4798 /* private use */
4799 type = XML_REGEXP_OTHER_PRIVATE;
4800 } else if (cur == 'n') {
4801 NEXT;
4802 /* not assigned */
4803 type = XML_REGEXP_OTHER_NA;
4804 } else {
4805 /* all others */
4806 type = XML_REGEXP_OTHER;
4807 }
4808 } else if (cur == 'I') {
4809 const xmlChar *start;
4810 NEXT;
4811 cur = CUR;
4812 if (cur != 's') {
4813 ERROR("IsXXXX expected");
4814 return;
4815 }
4816 NEXT;
4817 start = ctxt->cur;
4818 cur = CUR;
4819 if (((cur >= 'a') && (cur <= 'z')) ||
4820 ((cur >= 'A') && (cur <= 'Z')) ||
4821 ((cur >= '0') && (cur <= '9')) ||
4822 (cur == 0x2D)) {
4823 NEXT;
4824 cur = CUR;
4825 while (((cur >= 'a') && (cur <= 'z')) ||
4826 ((cur >= 'A') && (cur <= 'Z')) ||
4827 ((cur >= '0') && (cur <= '9')) ||
4828 (cur == 0x2D)) {
4829 NEXT;
4830 cur = CUR;
4831 }
4832 }
4833 type = XML_REGEXP_BLOCK_NAME;
4834 blockName = xmlStrndup(start, ctxt->cur - start);
4835 } else {
4836 ERROR("Unknown char property");
4837 return;
4838 }
4839 if (ctxt->atom == NULL) {
4840 ctxt->atom = xmlRegNewAtom(ctxt, type);
4841 if (ctxt->atom != NULL)
4842 ctxt->atom->valuep = blockName;
4843 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4844 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4845 type, 0, 0, blockName);
4846 }
4847 }
4848
4849 /**
4850 * xmlFAParseCharClassEsc:
4851 * @ctxt: a regexp parser context
4852 *
4853 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
4854 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
4855 * [25] catEsc ::= '\p{' charProp '}'
4856 * [26] complEsc ::= '\P{' charProp '}'
4857 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
4858 */
4859 static void
xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt)4860 xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
4861 int cur;
4862
4863 if (CUR == '.') {
4864 if (ctxt->atom == NULL) {
4865 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
4866 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4867 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4868 XML_REGEXP_ANYCHAR, 0, 0, NULL);
4869 }
4870 NEXT;
4871 return;
4872 }
4873 if (CUR != '\\') {
4874 ERROR("Escaped sequence: expecting \\");
4875 return;
4876 }
4877 NEXT;
4878 cur = CUR;
4879 if (cur == 'p') {
4880 NEXT;
4881 if (CUR != '{') {
4882 ERROR("Expecting '{'");
4883 return;
4884 }
4885 NEXT;
4886 xmlFAParseCharProp(ctxt);
4887 if (CUR != '}') {
4888 ERROR("Expecting '}'");
4889 return;
4890 }
4891 NEXT;
4892 } else if (cur == 'P') {
4893 NEXT;
4894 if (CUR != '{') {
4895 ERROR("Expecting '{'");
4896 return;
4897 }
4898 NEXT;
4899 xmlFAParseCharProp(ctxt);
4900 if (ctxt->atom != NULL)
4901 ctxt->atom->neg = 1;
4902 if (CUR != '}') {
4903 ERROR("Expecting '}'");
4904 return;
4905 }
4906 NEXT;
4907 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
4908 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
4909 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
4910 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
4911 (cur == 0x5E)) {
4912 if (ctxt->atom == NULL) {
4913 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
4914 if (ctxt->atom != NULL) {
4915 switch (cur) {
4916 case 'n':
4917 ctxt->atom->codepoint = '\n';
4918 break;
4919 case 'r':
4920 ctxt->atom->codepoint = '\r';
4921 break;
4922 case 't':
4923 ctxt->atom->codepoint = '\t';
4924 break;
4925 default:
4926 ctxt->atom->codepoint = cur;
4927 }
4928 }
4929 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4930 switch (cur) {
4931 case 'n':
4932 cur = '\n';
4933 break;
4934 case 'r':
4935 cur = '\r';
4936 break;
4937 case 't':
4938 cur = '\t';
4939 break;
4940 }
4941 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4942 XML_REGEXP_CHARVAL, cur, cur, NULL);
4943 }
4944 NEXT;
4945 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
4946 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
4947 (cur == 'w') || (cur == 'W')) {
4948 xmlRegAtomType type = XML_REGEXP_ANYSPACE;
4949
4950 switch (cur) {
4951 case 's':
4952 type = XML_REGEXP_ANYSPACE;
4953 break;
4954 case 'S':
4955 type = XML_REGEXP_NOTSPACE;
4956 break;
4957 case 'i':
4958 type = XML_REGEXP_INITNAME;
4959 break;
4960 case 'I':
4961 type = XML_REGEXP_NOTINITNAME;
4962 break;
4963 case 'c':
4964 type = XML_REGEXP_NAMECHAR;
4965 break;
4966 case 'C':
4967 type = XML_REGEXP_NOTNAMECHAR;
4968 break;
4969 case 'd':
4970 type = XML_REGEXP_DECIMAL;
4971 break;
4972 case 'D':
4973 type = XML_REGEXP_NOTDECIMAL;
4974 break;
4975 case 'w':
4976 type = XML_REGEXP_REALCHAR;
4977 break;
4978 case 'W':
4979 type = XML_REGEXP_NOTREALCHAR;
4980 break;
4981 }
4982 NEXT;
4983 if (ctxt->atom == NULL) {
4984 ctxt->atom = xmlRegNewAtom(ctxt, type);
4985 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4986 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4987 type, 0, 0, NULL);
4988 }
4989 } else {
4990 ERROR("Wrong escape sequence, misuse of character '\\'");
4991 }
4992 }
4993
4994 /**
4995 * xmlFAParseCharRange:
4996 * @ctxt: a regexp parser context
4997 *
4998 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash
4999 * [18] seRange ::= charOrEsc '-' charOrEsc
5000 * [20] charOrEsc ::= XmlChar | SingleCharEsc
5001 * [21] XmlChar ::= [^\#x2D#x5B#x5D]
5002 * [22] XmlCharIncDash ::= [^\#x5B#x5D]
5003 */
5004 static void
xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt)5005 xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
5006 int cur, len;
5007 int start = -1;
5008 int end = -1;
5009
5010 if (CUR == '\0') {
5011 ERROR("Expecting ']'");
5012 return;
5013 }
5014
5015 cur = CUR;
5016 if (cur == '\\') {
5017 NEXT;
5018 cur = CUR;
5019 switch (cur) {
5020 case 'n': start = 0xA; break;
5021 case 'r': start = 0xD; break;
5022 case 't': start = 0x9; break;
5023 case '\\': case '|': case '.': case '-': case '^': case '?':
5024 case '*': case '+': case '{': case '}': case '(': case ')':
5025 case '[': case ']':
5026 start = cur; break;
5027 default:
5028 ERROR("Invalid escape value");
5029 return;
5030 }
5031 end = start;
5032 len = 1;
5033 } else if ((cur != 0x5B) && (cur != 0x5D)) {
5034 end = start = CUR_SCHAR(ctxt->cur, len);
5035 } else {
5036 ERROR("Expecting a char range");
5037 return;
5038 }
5039 /*
5040 * Since we are "inside" a range, we can assume ctxt->cur is past
5041 * the start of ctxt->string, and PREV should be safe
5042 */
5043 if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) {
5044 NEXTL(len);
5045 return;
5046 }
5047 NEXTL(len);
5048 cur = CUR;
5049 if ((cur != '-') || (NXT(1) == ']')) {
5050 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5051 XML_REGEXP_CHARVAL, start, end, NULL);
5052 return;
5053 }
5054 NEXT;
5055 cur = CUR;
5056 if (cur == '\\') {
5057 NEXT;
5058 cur = CUR;
5059 switch (cur) {
5060 case 'n': end = 0xA; break;
5061 case 'r': end = 0xD; break;
5062 case 't': end = 0x9; break;
5063 case '\\': case '|': case '.': case '-': case '^': case '?':
5064 case '*': case '+': case '{': case '}': case '(': case ')':
5065 case '[': case ']':
5066 end = cur; break;
5067 default:
5068 ERROR("Invalid escape value");
5069 return;
5070 }
5071 len = 1;
5072 } else if ((cur != '\0') && (cur != 0x5B) && (cur != 0x5D)) {
5073 end = CUR_SCHAR(ctxt->cur, len);
5074 } else {
5075 ERROR("Expecting the end of a char range");
5076 return;
5077 }
5078
5079 /* TODO check that the values are acceptable character ranges for XML */
5080 if (end < start) {
5081 ERROR("End of range is before start of range");
5082 } else {
5083 NEXTL(len);
5084 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
5085 XML_REGEXP_CHARVAL, start, end, NULL);
5086 }
5087 return;
5088 }
5089
5090 /**
5091 * xmlFAParsePosCharGroup:
5092 * @ctxt: a regexp parser context
5093 *
5094 * [14] posCharGroup ::= ( charRange | charClassEsc )+
5095 */
5096 static void
xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt)5097 xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
5098 do {
5099 if (CUR == '\\') {
5100 xmlFAParseCharClassEsc(ctxt);
5101 } else {
5102 xmlFAParseCharRange(ctxt);
5103 }
5104 } while ((CUR != ']') && (CUR != '^') && (CUR != '-') &&
5105 (CUR != 0) && (ctxt->error == 0));
5106 }
5107
5108 /**
5109 * xmlFAParseCharGroup:
5110 * @ctxt: a regexp parser context
5111 *
5112 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub
5113 * [15] negCharGroup ::= '^' posCharGroup
5114 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
5115 * [12] charClassExpr ::= '[' charGroup ']'
5116 */
5117 static void
xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt)5118 xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
5119 int n = ctxt->neg;
5120 while ((CUR != ']') && (ctxt->error == 0)) {
5121 if (CUR == '^') {
5122 int neg = ctxt->neg;
5123
5124 NEXT;
5125 ctxt->neg = !ctxt->neg;
5126 xmlFAParsePosCharGroup(ctxt);
5127 ctxt->neg = neg;
5128 } else if ((CUR == '-') && (NXT(1) == '[')) {
5129 int neg = ctxt->neg;
5130 ctxt->neg = 2;
5131 NEXT; /* eat the '-' */
5132 NEXT; /* eat the '[' */
5133 xmlFAParseCharGroup(ctxt);
5134 if (CUR == ']') {
5135 NEXT;
5136 } else {
5137 ERROR("charClassExpr: ']' expected");
5138 break;
5139 }
5140 ctxt->neg = neg;
5141 break;
5142 } else if (CUR != ']') {
5143 xmlFAParsePosCharGroup(ctxt);
5144 }
5145 }
5146 ctxt->neg = n;
5147 }
5148
5149 /**
5150 * xmlFAParseCharClass:
5151 * @ctxt: a regexp parser context
5152 *
5153 * [11] charClass ::= charClassEsc | charClassExpr
5154 * [12] charClassExpr ::= '[' charGroup ']'
5155 */
5156 static void
xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt)5157 xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
5158 if (CUR == '[') {
5159 NEXT;
5160 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
5161 if (ctxt->atom == NULL)
5162 return;
5163 xmlFAParseCharGroup(ctxt);
5164 if (CUR == ']') {
5165 NEXT;
5166 } else {
5167 ERROR("xmlFAParseCharClass: ']' expected");
5168 }
5169 } else {
5170 xmlFAParseCharClassEsc(ctxt);
5171 }
5172 }
5173
5174 /**
5175 * xmlFAParseQuantExact:
5176 * @ctxt: a regexp parser context
5177 *
5178 * [8] QuantExact ::= [0-9]+
5179 *
5180 * Returns 0 if success or -1 in case of error
5181 */
5182 static int
xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt)5183 xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
5184 int ret = 0;
5185 int ok = 0;
5186
5187 while ((CUR >= '0') && (CUR <= '9')) {
5188 ret = ret * 10 + (CUR - '0');
5189 ok = 1;
5190 NEXT;
5191 }
5192 if (ok != 1) {
5193 return(-1);
5194 }
5195 return(ret);
5196 }
5197
5198 /**
5199 * xmlFAParseQuantifier:
5200 * @ctxt: a regexp parser context
5201 *
5202 * [4] quantifier ::= [?*+] | ( '{' quantity '}' )
5203 * [5] quantity ::= quantRange | quantMin | QuantExact
5204 * [6] quantRange ::= QuantExact ',' QuantExact
5205 * [7] quantMin ::= QuantExact ','
5206 * [8] QuantExact ::= [0-9]+
5207 */
5208 static int
xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt)5209 xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
5210 int cur;
5211
5212 cur = CUR;
5213 if ((cur == '?') || (cur == '*') || (cur == '+')) {
5214 if (ctxt->atom != NULL) {
5215 if (cur == '?')
5216 ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
5217 else if (cur == '*')
5218 ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
5219 else if (cur == '+')
5220 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
5221 }
5222 NEXT;
5223 return(1);
5224 }
5225 if (cur == '{') {
5226 int min = 0, max = 0;
5227
5228 NEXT;
5229 cur = xmlFAParseQuantExact(ctxt);
5230 if (cur >= 0)
5231 min = cur;
5232 if (CUR == ',') {
5233 NEXT;
5234 if (CUR == '}')
5235 max = INT_MAX;
5236 else {
5237 cur = xmlFAParseQuantExact(ctxt);
5238 if (cur >= 0)
5239 max = cur;
5240 else {
5241 ERROR("Improper quantifier");
5242 }
5243 }
5244 }
5245 if (CUR == '}') {
5246 NEXT;
5247 } else {
5248 ERROR("Unterminated quantifier");
5249 }
5250 if (max == 0)
5251 max = min;
5252 if (ctxt->atom != NULL) {
5253 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
5254 ctxt->atom->min = min;
5255 ctxt->atom->max = max;
5256 }
5257 return(1);
5258 }
5259 return(0);
5260 }
5261
5262 /**
5263 * xmlFAParseAtom:
5264 * @ctxt: a regexp parser context
5265 *
5266 * [9] atom ::= Char | charClass | ( '(' regExp ')' )
5267 */
5268 static int
xmlFAParseAtom(xmlRegParserCtxtPtr ctxt)5269 xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
5270 int codepoint, len;
5271
5272 codepoint = xmlFAIsChar(ctxt);
5273 if (codepoint > 0) {
5274 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
5275 if (ctxt->atom == NULL)
5276 return(-1);
5277 codepoint = CUR_SCHAR(ctxt->cur, len);
5278 ctxt->atom->codepoint = codepoint;
5279 NEXTL(len);
5280 return(1);
5281 } else if (CUR == '|') {
5282 return(0);
5283 } else if (CUR == 0) {
5284 return(0);
5285 } else if (CUR == ')') {
5286 return(0);
5287 } else if (CUR == '(') {
5288 xmlRegStatePtr start, oldend, start0;
5289
5290 NEXT;
5291 /*
5292 * this extra Epsilon transition is needed if we count with 0 allowed
5293 * unfortunately this can't be known at that point
5294 */
5295 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5296 start0 = ctxt->state;
5297 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5298 start = ctxt->state;
5299 oldend = ctxt->end;
5300 ctxt->end = NULL;
5301 ctxt->atom = NULL;
5302 xmlFAParseRegExp(ctxt, 0);
5303 if (CUR == ')') {
5304 NEXT;
5305 } else {
5306 ERROR("xmlFAParseAtom: expecting ')'");
5307 }
5308 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
5309 if (ctxt->atom == NULL)
5310 return(-1);
5311 ctxt->atom->start = start;
5312 ctxt->atom->start0 = start0;
5313 ctxt->atom->stop = ctxt->state;
5314 ctxt->end = oldend;
5315 return(1);
5316 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
5317 xmlFAParseCharClass(ctxt);
5318 return(1);
5319 }
5320 return(0);
5321 }
5322
5323 /**
5324 * xmlFAParsePiece:
5325 * @ctxt: a regexp parser context
5326 *
5327 * [3] piece ::= atom quantifier?
5328 */
5329 static int
xmlFAParsePiece(xmlRegParserCtxtPtr ctxt)5330 xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
5331 int ret;
5332
5333 ctxt->atom = NULL;
5334 ret = xmlFAParseAtom(ctxt);
5335 if (ret == 0)
5336 return(0);
5337 if (ctxt->atom == NULL) {
5338 ERROR("internal: no atom generated");
5339 }
5340 xmlFAParseQuantifier(ctxt);
5341 return(1);
5342 }
5343
5344 /**
5345 * xmlFAParseBranch:
5346 * @ctxt: a regexp parser context
5347 * @to: optional target to the end of the branch
5348 *
5349 * @to is used to optimize by removing duplicate path in automata
5350 * in expressions like (a|b)(c|d)
5351 *
5352 * [2] branch ::= piece*
5353 */
5354 static int
xmlFAParseBranch(xmlRegParserCtxtPtr ctxt,xmlRegStatePtr to)5355 xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) {
5356 xmlRegStatePtr previous;
5357 int ret;
5358
5359 previous = ctxt->state;
5360 ret = xmlFAParsePiece(ctxt);
5361 if (ret == 0) {
5362 /* Empty branch */
5363 xmlFAGenerateEpsilonTransition(ctxt, previous, to);
5364 } else {
5365 if (xmlFAGenerateTransitions(ctxt, previous,
5366 (CUR=='|' || CUR==')' || CUR==0) ? to : NULL, ctxt->atom) < 0)
5367 return(-1);
5368 previous = ctxt->state;
5369 ctxt->atom = NULL;
5370 }
5371 while ((ret != 0) && (ctxt->error == 0)) {
5372 ret = xmlFAParsePiece(ctxt);
5373 if (ret != 0) {
5374 if (xmlFAGenerateTransitions(ctxt, previous,
5375 (CUR=='|' || CUR==')' || CUR==0) ? to : NULL,
5376 ctxt->atom) < 0)
5377 return(-1);
5378 previous = ctxt->state;
5379 ctxt->atom = NULL;
5380 }
5381 }
5382 return(0);
5383 }
5384
5385 /**
5386 * xmlFAParseRegExp:
5387 * @ctxt: a regexp parser context
5388 * @top: is this the top-level expression ?
5389 *
5390 * [1] regExp ::= branch ( '|' branch )*
5391 */
5392 static void
xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt,int top)5393 xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
5394 xmlRegStatePtr start, end;
5395
5396 /* if not top start should have been generated by an epsilon trans */
5397 start = ctxt->state;
5398 ctxt->end = NULL;
5399 xmlFAParseBranch(ctxt, NULL);
5400 if (top) {
5401 #ifdef DEBUG_REGEXP_GRAPH
5402 printf("State %d is final\n", ctxt->state->no);
5403 #endif
5404 ctxt->state->type = XML_REGEXP_FINAL_STATE;
5405 }
5406 if (CUR != '|') {
5407 ctxt->end = ctxt->state;
5408 return;
5409 }
5410 end = ctxt->state;
5411 while ((CUR == '|') && (ctxt->error == 0)) {
5412 NEXT;
5413 ctxt->state = start;
5414 ctxt->end = NULL;
5415 xmlFAParseBranch(ctxt, end);
5416 }
5417 if (!top) {
5418 ctxt->state = end;
5419 ctxt->end = end;
5420 }
5421 }
5422
5423 /************************************************************************
5424 * *
5425 * The basic API *
5426 * *
5427 ************************************************************************/
5428
5429 /**
5430 * xmlRegexpPrint:
5431 * @output: the file for the output debug
5432 * @regexp: the compiled regexp
5433 *
5434 * Print the content of the compiled regular expression
5435 */
5436 void
xmlRegexpPrint(FILE * output,xmlRegexpPtr regexp)5437 xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
5438 int i;
5439
5440 if (output == NULL)
5441 return;
5442 fprintf(output, " regexp: ");
5443 if (regexp == NULL) {
5444 fprintf(output, "NULL\n");
5445 return;
5446 }
5447 fprintf(output, "'%s' ", regexp->string);
5448 fprintf(output, "\n");
5449 fprintf(output, "%d atoms:\n", regexp->nbAtoms);
5450 for (i = 0;i < regexp->nbAtoms; i++) {
5451 fprintf(output, " %02d ", i);
5452 xmlRegPrintAtom(output, regexp->atoms[i]);
5453 }
5454 fprintf(output, "%d states:", regexp->nbStates);
5455 fprintf(output, "\n");
5456 for (i = 0;i < regexp->nbStates; i++) {
5457 xmlRegPrintState(output, regexp->states[i]);
5458 }
5459 fprintf(output, "%d counters:\n", regexp->nbCounters);
5460 for (i = 0;i < regexp->nbCounters; i++) {
5461 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
5462 regexp->counters[i].max);
5463 }
5464 }
5465
5466 /**
5467 * xmlRegexpCompile:
5468 * @regexp: a regular expression string
5469 *
5470 * Parses a regular expression conforming to XML Schemas Part 2 Datatype
5471 * Appendix F and builds an automata suitable for testing strings against
5472 * that regular expression
5473 *
5474 * Returns the compiled expression or NULL in case of error
5475 */
5476 xmlRegexpPtr
xmlRegexpCompile(const xmlChar * regexp)5477 xmlRegexpCompile(const xmlChar *regexp) {
5478 xmlRegexpPtr ret;
5479 xmlRegParserCtxtPtr ctxt;
5480
5481 ctxt = xmlRegNewParserCtxt(regexp);
5482 if (ctxt == NULL)
5483 return(NULL);
5484
5485 /* initialize the parser */
5486 ctxt->end = NULL;
5487 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
5488 xmlRegStatePush(ctxt, ctxt->start);
5489
5490 /* parse the expression building an automata */
5491 xmlFAParseRegExp(ctxt, 1);
5492 if (CUR != 0) {
5493 ERROR("xmlFAParseRegExp: extra characters");
5494 }
5495 if (ctxt->error != 0) {
5496 xmlRegFreeParserCtxt(ctxt);
5497 return(NULL);
5498 }
5499 ctxt->end = ctxt->state;
5500 ctxt->start->type = XML_REGEXP_START_STATE;
5501 ctxt->end->type = XML_REGEXP_FINAL_STATE;
5502
5503 /* remove the Epsilon except for counted transitions */
5504 xmlFAEliminateEpsilonTransitions(ctxt);
5505
5506
5507 if (ctxt->error != 0) {
5508 xmlRegFreeParserCtxt(ctxt);
5509 return(NULL);
5510 }
5511 ret = xmlRegEpxFromParse(ctxt);
5512 xmlRegFreeParserCtxt(ctxt);
5513 return(ret);
5514 }
5515
5516 /**
5517 * xmlRegexpExec:
5518 * @comp: the compiled regular expression
5519 * @content: the value to check against the regular expression
5520 *
5521 * Check if the regular expression generates the value
5522 *
5523 * Returns 1 if it matches, 0 if not and a negative value in case of error
5524 */
5525 int
xmlRegexpExec(xmlRegexpPtr comp,const xmlChar * content)5526 xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
5527 if ((comp == NULL) || (content == NULL))
5528 return(-1);
5529 return(xmlFARegExec(comp, content));
5530 }
5531
5532 /**
5533 * xmlRegexpIsDeterminist:
5534 * @comp: the compiled regular expression
5535 *
5536 * Check if the regular expression is determinist
5537 *
5538 * Returns 1 if it yes, 0 if not and a negative value in case of error
5539 */
5540 int
xmlRegexpIsDeterminist(xmlRegexpPtr comp)5541 xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
5542 xmlAutomataPtr am;
5543 int ret;
5544
5545 if (comp == NULL)
5546 return(-1);
5547 if (comp->determinist != -1)
5548 return(comp->determinist);
5549
5550 am = xmlNewAutomata();
5551 if (am == NULL)
5552 return(-1);
5553 if (am->states != NULL) {
5554 int i;
5555
5556 for (i = 0;i < am->nbStates;i++)
5557 xmlRegFreeState(am->states[i]);
5558 xmlFree(am->states);
5559 }
5560 am->nbAtoms = comp->nbAtoms;
5561 am->atoms = comp->atoms;
5562 am->nbStates = comp->nbStates;
5563 am->states = comp->states;
5564 am->determinist = -1;
5565 am->flags = comp->flags;
5566 ret = xmlFAComputesDeterminism(am);
5567 am->atoms = NULL;
5568 am->states = NULL;
5569 xmlFreeAutomata(am);
5570 comp->determinist = ret;
5571 return(ret);
5572 }
5573
5574 /**
5575 * xmlRegFreeRegexp:
5576 * @regexp: the regexp
5577 *
5578 * Free a regexp
5579 */
5580 void
xmlRegFreeRegexp(xmlRegexpPtr regexp)5581 xmlRegFreeRegexp(xmlRegexpPtr regexp) {
5582 int i;
5583 if (regexp == NULL)
5584 return;
5585
5586 if (regexp->string != NULL)
5587 xmlFree(regexp->string);
5588 if (regexp->states != NULL) {
5589 for (i = 0;i < regexp->nbStates;i++)
5590 xmlRegFreeState(regexp->states[i]);
5591 xmlFree(regexp->states);
5592 }
5593 if (regexp->atoms != NULL) {
5594 for (i = 0;i < regexp->nbAtoms;i++)
5595 xmlRegFreeAtom(regexp->atoms[i]);
5596 xmlFree(regexp->atoms);
5597 }
5598 if (regexp->counters != NULL)
5599 xmlFree(regexp->counters);
5600 if (regexp->compact != NULL)
5601 xmlFree(regexp->compact);
5602 if (regexp->transdata != NULL)
5603 xmlFree(regexp->transdata);
5604 if (regexp->stringMap != NULL) {
5605 for (i = 0; i < regexp->nbstrings;i++)
5606 xmlFree(regexp->stringMap[i]);
5607 xmlFree(regexp->stringMap);
5608 }
5609
5610 xmlFree(regexp);
5611 }
5612
5613 #ifdef LIBXML_AUTOMATA_ENABLED
5614 /************************************************************************
5615 * *
5616 * The Automata interface *
5617 * *
5618 ************************************************************************/
5619
5620 /**
5621 * xmlNewAutomata:
5622 *
5623 * Create a new automata
5624 *
5625 * Returns the new object or NULL in case of failure
5626 */
5627 xmlAutomataPtr
xmlNewAutomata(void)5628 xmlNewAutomata(void) {
5629 xmlAutomataPtr ctxt;
5630
5631 ctxt = xmlRegNewParserCtxt(NULL);
5632 if (ctxt == NULL)
5633 return(NULL);
5634
5635 /* initialize the parser */
5636 ctxt->end = NULL;
5637 ctxt->start = ctxt->state = xmlRegNewState(ctxt);
5638 if (ctxt->start == NULL) {
5639 xmlFreeAutomata(ctxt);
5640 return(NULL);
5641 }
5642 ctxt->start->type = XML_REGEXP_START_STATE;
5643 if (xmlRegStatePush(ctxt, ctxt->start) < 0) {
5644 xmlRegFreeState(ctxt->start);
5645 xmlFreeAutomata(ctxt);
5646 return(NULL);
5647 }
5648 ctxt->flags = 0;
5649
5650 return(ctxt);
5651 }
5652
5653 /**
5654 * xmlFreeAutomata:
5655 * @am: an automata
5656 *
5657 * Free an automata
5658 */
5659 void
xmlFreeAutomata(xmlAutomataPtr am)5660 xmlFreeAutomata(xmlAutomataPtr am) {
5661 if (am == NULL)
5662 return;
5663 xmlRegFreeParserCtxt(am);
5664 }
5665
5666 /**
5667 * xmlAutomataSetFlags:
5668 * @am: an automata
5669 * @flags: a set of internal flags
5670 *
5671 * Set some flags on the automata
5672 */
5673 void
xmlAutomataSetFlags(xmlAutomataPtr am,int flags)5674 xmlAutomataSetFlags(xmlAutomataPtr am, int flags) {
5675 if (am == NULL)
5676 return;
5677 am->flags |= flags;
5678 }
5679
5680 /**
5681 * xmlAutomataGetInitState:
5682 * @am: an automata
5683 *
5684 * Initial state lookup
5685 *
5686 * Returns the initial state of the automata
5687 */
5688 xmlAutomataStatePtr
xmlAutomataGetInitState(xmlAutomataPtr am)5689 xmlAutomataGetInitState(xmlAutomataPtr am) {
5690 if (am == NULL)
5691 return(NULL);
5692 return(am->start);
5693 }
5694
5695 /**
5696 * xmlAutomataSetFinalState:
5697 * @am: an automata
5698 * @state: a state in this automata
5699 *
5700 * Makes that state a final state
5701 *
5702 * Returns 0 or -1 in case of error
5703 */
5704 int
xmlAutomataSetFinalState(xmlAutomataPtr am,xmlAutomataStatePtr state)5705 xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
5706 if ((am == NULL) || (state == NULL))
5707 return(-1);
5708 state->type = XML_REGEXP_FINAL_STATE;
5709 return(0);
5710 }
5711
5712 /**
5713 * xmlAutomataNewTransition:
5714 * @am: an automata
5715 * @from: the starting point of the transition
5716 * @to: the target point of the transition or NULL
5717 * @token: the input string associated to that transition
5718 * @data: data passed to the callback function if the transition is activated
5719 *
5720 * If @to is NULL, this creates first a new target state in the automata
5721 * and then adds a transition from the @from state to the target state
5722 * activated by the value of @token
5723 *
5724 * Returns the target state or NULL in case of error
5725 */
5726 xmlAutomataStatePtr
xmlAutomataNewTransition(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,void * data)5727 xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
5728 xmlAutomataStatePtr to, const xmlChar *token,
5729 void *data) {
5730 xmlRegAtomPtr atom;
5731
5732 if ((am == NULL) || (from == NULL) || (token == NULL))
5733 return(NULL);
5734 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5735 if (atom == NULL)
5736 return(NULL);
5737 atom->data = data;
5738 atom->valuep = xmlStrdup(token);
5739
5740 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5741 xmlRegFreeAtom(atom);
5742 return(NULL);
5743 }
5744 if (to == NULL)
5745 return(am->state);
5746 return(to);
5747 }
5748
5749 /**
5750 * xmlAutomataNewTransition2:
5751 * @am: an automata
5752 * @from: the starting point of the transition
5753 * @to: the target point of the transition or NULL
5754 * @token: the first input string associated to that transition
5755 * @token2: the second input string associated to that transition
5756 * @data: data passed to the callback function if the transition is activated
5757 *
5758 * If @to is NULL, this creates first a new target state in the automata
5759 * and then adds a transition from the @from state to the target state
5760 * activated by the value of @token
5761 *
5762 * Returns the target state or NULL in case of error
5763 */
5764 xmlAutomataStatePtr
xmlAutomataNewTransition2(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,void * data)5765 xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5766 xmlAutomataStatePtr to, const xmlChar *token,
5767 const xmlChar *token2, void *data) {
5768 xmlRegAtomPtr atom;
5769
5770 if ((am == NULL) || (from == NULL) || (token == NULL))
5771 return(NULL);
5772 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5773 if (atom == NULL)
5774 return(NULL);
5775 atom->data = data;
5776 if ((token2 == NULL) || (*token2 == 0)) {
5777 atom->valuep = xmlStrdup(token);
5778 } else {
5779 int lenn, lenp;
5780 xmlChar *str;
5781
5782 lenn = strlen((char *) token2);
5783 lenp = strlen((char *) token);
5784
5785 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5786 if (str == NULL) {
5787 xmlRegFreeAtom(atom);
5788 return(NULL);
5789 }
5790 memcpy(&str[0], token, lenp);
5791 str[lenp] = '|';
5792 memcpy(&str[lenp + 1], token2, lenn);
5793 str[lenn + lenp + 1] = 0;
5794
5795 atom->valuep = str;
5796 }
5797
5798 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5799 xmlRegFreeAtom(atom);
5800 return(NULL);
5801 }
5802 if (to == NULL)
5803 return(am->state);
5804 return(to);
5805 }
5806
5807 /**
5808 * xmlAutomataNewNegTrans:
5809 * @am: an automata
5810 * @from: the starting point of the transition
5811 * @to: the target point of the transition or NULL
5812 * @token: the first input string associated to that transition
5813 * @token2: the second input string associated to that transition
5814 * @data: data passed to the callback function if the transition is activated
5815 *
5816 * If @to is NULL, this creates first a new target state in the automata
5817 * and then adds a transition from the @from state to the target state
5818 * activated by any value except (@token,@token2)
5819 * Note that if @token2 is not NULL, then (X, NULL) won't match to follow
5820 # the semantic of XSD ##other
5821 *
5822 * Returns the target state or NULL in case of error
5823 */
5824 xmlAutomataStatePtr
xmlAutomataNewNegTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,void * data)5825 xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5826 xmlAutomataStatePtr to, const xmlChar *token,
5827 const xmlChar *token2, void *data) {
5828 xmlRegAtomPtr atom;
5829 xmlChar err_msg[200];
5830
5831 if ((am == NULL) || (from == NULL) || (token == NULL))
5832 return(NULL);
5833 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5834 if (atom == NULL)
5835 return(NULL);
5836 atom->data = data;
5837 atom->neg = 1;
5838 if ((token2 == NULL) || (*token2 == 0)) {
5839 atom->valuep = xmlStrdup(token);
5840 } else {
5841 int lenn, lenp;
5842 xmlChar *str;
5843
5844 lenn = strlen((char *) token2);
5845 lenp = strlen((char *) token);
5846
5847 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5848 if (str == NULL) {
5849 xmlRegFreeAtom(atom);
5850 return(NULL);
5851 }
5852 memcpy(&str[0], token, lenp);
5853 str[lenp] = '|';
5854 memcpy(&str[lenp + 1], token2, lenn);
5855 str[lenn + lenp + 1] = 0;
5856
5857 atom->valuep = str;
5858 }
5859 snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep);
5860 err_msg[199] = 0;
5861 atom->valuep2 = xmlStrdup(err_msg);
5862
5863 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5864 xmlRegFreeAtom(atom);
5865 return(NULL);
5866 }
5867 am->negs++;
5868 if (to == NULL)
5869 return(am->state);
5870 return(to);
5871 }
5872
5873 /**
5874 * xmlAutomataNewCountTrans2:
5875 * @am: an automata
5876 * @from: the starting point of the transition
5877 * @to: the target point of the transition or NULL
5878 * @token: the input string associated to that transition
5879 * @token2: the second input string associated to that transition
5880 * @min: the minimum successive occurrences of token
5881 * @max: the maximum successive occurrences of token
5882 * @data: data associated to the transition
5883 *
5884 * If @to is NULL, this creates first a new target state in the automata
5885 * and then adds a transition from the @from state to the target state
5886 * activated by a succession of input of value @token and @token2 and
5887 * whose number is between @min and @max
5888 *
5889 * Returns the target state or NULL in case of error
5890 */
5891 xmlAutomataStatePtr
xmlAutomataNewCountTrans2(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,int min,int max,void * data)5892 xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5893 xmlAutomataStatePtr to, const xmlChar *token,
5894 const xmlChar *token2,
5895 int min, int max, void *data) {
5896 xmlRegAtomPtr atom;
5897 int counter;
5898
5899 if ((am == NULL) || (from == NULL) || (token == NULL))
5900 return(NULL);
5901 if (min < 0)
5902 return(NULL);
5903 if ((max < min) || (max < 1))
5904 return(NULL);
5905 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5906 if (atom == NULL)
5907 return(NULL);
5908 if ((token2 == NULL) || (*token2 == 0)) {
5909 atom->valuep = xmlStrdup(token);
5910 } else {
5911 int lenn, lenp;
5912 xmlChar *str;
5913
5914 lenn = strlen((char *) token2);
5915 lenp = strlen((char *) token);
5916
5917 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5918 if (str == NULL) {
5919 xmlRegFreeAtom(atom);
5920 return(NULL);
5921 }
5922 memcpy(&str[0], token, lenp);
5923 str[lenp] = '|';
5924 memcpy(&str[lenp + 1], token2, lenn);
5925 str[lenn + lenp + 1] = 0;
5926
5927 atom->valuep = str;
5928 }
5929 atom->data = data;
5930 if (min == 0)
5931 atom->min = 1;
5932 else
5933 atom->min = min;
5934 atom->max = max;
5935
5936 /*
5937 * associate a counter to the transition.
5938 */
5939 counter = xmlRegGetCounter(am);
5940 am->counters[counter].min = min;
5941 am->counters[counter].max = max;
5942
5943 /* xmlFAGenerateTransitions(am, from, to, atom); */
5944 if (to == NULL) {
5945 to = xmlRegNewState(am);
5946 xmlRegStatePush(am, to);
5947 }
5948 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
5949 xmlRegAtomPush(am, atom);
5950 am->state = to;
5951
5952 if (to == NULL)
5953 to = am->state;
5954 if (to == NULL)
5955 return(NULL);
5956 if (min == 0)
5957 xmlFAGenerateEpsilonTransition(am, from, to);
5958 return(to);
5959 }
5960
5961 /**
5962 * xmlAutomataNewCountTrans:
5963 * @am: an automata
5964 * @from: the starting point of the transition
5965 * @to: the target point of the transition or NULL
5966 * @token: the input string associated to that transition
5967 * @min: the minimum successive occurrences of token
5968 * @max: the maximum successive occurrences of token
5969 * @data: data associated to the transition
5970 *
5971 * If @to is NULL, this creates first a new target state in the automata
5972 * and then adds a transition from the @from state to the target state
5973 * activated by a succession of input of value @token and whose number
5974 * is between @min and @max
5975 *
5976 * Returns the target state or NULL in case of error
5977 */
5978 xmlAutomataStatePtr
xmlAutomataNewCountTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,int min,int max,void * data)5979 xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5980 xmlAutomataStatePtr to, const xmlChar *token,
5981 int min, int max, void *data) {
5982 xmlRegAtomPtr atom;
5983 int counter;
5984
5985 if ((am == NULL) || (from == NULL) || (token == NULL))
5986 return(NULL);
5987 if (min < 0)
5988 return(NULL);
5989 if ((max < min) || (max < 1))
5990 return(NULL);
5991 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5992 if (atom == NULL)
5993 return(NULL);
5994 atom->valuep = xmlStrdup(token);
5995 atom->data = data;
5996 if (min == 0)
5997 atom->min = 1;
5998 else
5999 atom->min = min;
6000 atom->max = max;
6001
6002 /*
6003 * associate a counter to the transition.
6004 */
6005 counter = xmlRegGetCounter(am);
6006 am->counters[counter].min = min;
6007 am->counters[counter].max = max;
6008
6009 /* xmlFAGenerateTransitions(am, from, to, atom); */
6010 if (to == NULL) {
6011 to = xmlRegNewState(am);
6012 xmlRegStatePush(am, to);
6013 }
6014 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6015 xmlRegAtomPush(am, atom);
6016 am->state = to;
6017
6018 if (to == NULL)
6019 to = am->state;
6020 if (to == NULL)
6021 return(NULL);
6022 if (min == 0)
6023 xmlFAGenerateEpsilonTransition(am, from, to);
6024 return(to);
6025 }
6026
6027 /**
6028 * xmlAutomataNewOnceTrans2:
6029 * @am: an automata
6030 * @from: the starting point of the transition
6031 * @to: the target point of the transition or NULL
6032 * @token: the input string associated to that transition
6033 * @token2: the second input string associated to that transition
6034 * @min: the minimum successive occurrences of token
6035 * @max: the maximum successive occurrences of token
6036 * @data: data associated to the transition
6037 *
6038 * If @to is NULL, this creates first a new target state in the automata
6039 * and then adds a transition from the @from state to the target state
6040 * activated by a succession of input of value @token and @token2 and whose
6041 * number is between @min and @max, moreover that transition can only be
6042 * crossed once.
6043 *
6044 * Returns the target state or NULL in case of error
6045 */
6046 xmlAutomataStatePtr
xmlAutomataNewOnceTrans2(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,const xmlChar * token2,int min,int max,void * data)6047 xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
6048 xmlAutomataStatePtr to, const xmlChar *token,
6049 const xmlChar *token2,
6050 int min, int max, void *data) {
6051 xmlRegAtomPtr atom;
6052 int counter;
6053
6054 if ((am == NULL) || (from == NULL) || (token == NULL))
6055 return(NULL);
6056 if (min < 1)
6057 return(NULL);
6058 if ((max < min) || (max < 1))
6059 return(NULL);
6060 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6061 if (atom == NULL)
6062 return(NULL);
6063 if ((token2 == NULL) || (*token2 == 0)) {
6064 atom->valuep = xmlStrdup(token);
6065 } else {
6066 int lenn, lenp;
6067 xmlChar *str;
6068
6069 lenn = strlen((char *) token2);
6070 lenp = strlen((char *) token);
6071
6072 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
6073 if (str == NULL) {
6074 xmlRegFreeAtom(atom);
6075 return(NULL);
6076 }
6077 memcpy(&str[0], token, lenp);
6078 str[lenp] = '|';
6079 memcpy(&str[lenp + 1], token2, lenn);
6080 str[lenn + lenp + 1] = 0;
6081
6082 atom->valuep = str;
6083 }
6084 atom->data = data;
6085 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
6086 atom->min = min;
6087 atom->max = max;
6088 /*
6089 * associate a counter to the transition.
6090 */
6091 counter = xmlRegGetCounter(am);
6092 am->counters[counter].min = 1;
6093 am->counters[counter].max = 1;
6094
6095 /* xmlFAGenerateTransitions(am, from, to, atom); */
6096 if (to == NULL) {
6097 to = xmlRegNewState(am);
6098 xmlRegStatePush(am, to);
6099 }
6100 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6101 xmlRegAtomPush(am, atom);
6102 am->state = to;
6103 return(to);
6104 }
6105
6106
6107
6108 /**
6109 * xmlAutomataNewOnceTrans:
6110 * @am: an automata
6111 * @from: the starting point of the transition
6112 * @to: the target point of the transition or NULL
6113 * @token: the input string associated to that transition
6114 * @min: the minimum successive occurrences of token
6115 * @max: the maximum successive occurrences of token
6116 * @data: data associated to the transition
6117 *
6118 * If @to is NULL, this creates first a new target state in the automata
6119 * and then adds a transition from the @from state to the target state
6120 * activated by a succession of input of value @token and whose number
6121 * is between @min and @max, moreover that transition can only be crossed
6122 * once.
6123 *
6124 * Returns the target state or NULL in case of error
6125 */
6126 xmlAutomataStatePtr
xmlAutomataNewOnceTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,const xmlChar * token,int min,int max,void * data)6127 xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6128 xmlAutomataStatePtr to, const xmlChar *token,
6129 int min, int max, void *data) {
6130 xmlRegAtomPtr atom;
6131 int counter;
6132
6133 if ((am == NULL) || (from == NULL) || (token == NULL))
6134 return(NULL);
6135 if (min < 1)
6136 return(NULL);
6137 if ((max < min) || (max < 1))
6138 return(NULL);
6139 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6140 if (atom == NULL)
6141 return(NULL);
6142 atom->valuep = xmlStrdup(token);
6143 atom->data = data;
6144 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
6145 atom->min = min;
6146 atom->max = max;
6147 /*
6148 * associate a counter to the transition.
6149 */
6150 counter = xmlRegGetCounter(am);
6151 am->counters[counter].min = 1;
6152 am->counters[counter].max = 1;
6153
6154 /* xmlFAGenerateTransitions(am, from, to, atom); */
6155 if (to == NULL) {
6156 to = xmlRegNewState(am);
6157 xmlRegStatePush(am, to);
6158 }
6159 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6160 xmlRegAtomPush(am, atom);
6161 am->state = to;
6162 return(to);
6163 }
6164
6165 /**
6166 * xmlAutomataNewState:
6167 * @am: an automata
6168 *
6169 * Create a new disconnected state in the automata
6170 *
6171 * Returns the new state or NULL in case of error
6172 */
6173 xmlAutomataStatePtr
xmlAutomataNewState(xmlAutomataPtr am)6174 xmlAutomataNewState(xmlAutomataPtr am) {
6175 xmlAutomataStatePtr to;
6176
6177 if (am == NULL)
6178 return(NULL);
6179 to = xmlRegNewState(am);
6180 xmlRegStatePush(am, to);
6181 return(to);
6182 }
6183
6184 /**
6185 * xmlAutomataNewEpsilon:
6186 * @am: an automata
6187 * @from: the starting point of the transition
6188 * @to: the target point of the transition or NULL
6189 *
6190 * If @to is NULL, this creates first a new target state in the automata
6191 * and then adds an epsilon transition from the @from state to the
6192 * target state
6193 *
6194 * Returns the target state or NULL in case of error
6195 */
6196 xmlAutomataStatePtr
xmlAutomataNewEpsilon(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to)6197 xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from,
6198 xmlAutomataStatePtr to) {
6199 if ((am == NULL) || (from == NULL))
6200 return(NULL);
6201 xmlFAGenerateEpsilonTransition(am, from, to);
6202 if (to == NULL)
6203 return(am->state);
6204 return(to);
6205 }
6206
6207 /**
6208 * xmlAutomataNewAllTrans:
6209 * @am: an automata
6210 * @from: the starting point of the transition
6211 * @to: the target point of the transition or NULL
6212 * @lax: allow to transition if not all all transitions have been activated
6213 *
6214 * If @to is NULL, this creates first a new target state in the automata
6215 * and then adds a an ALL transition from the @from state to the
6216 * target state. That transition is an epsilon transition allowed only when
6217 * all transitions from the @from node have been activated.
6218 *
6219 * Returns the target state or NULL in case of error
6220 */
6221 xmlAutomataStatePtr
xmlAutomataNewAllTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,int lax)6222 xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6223 xmlAutomataStatePtr to, int lax) {
6224 if ((am == NULL) || (from == NULL))
6225 return(NULL);
6226 xmlFAGenerateAllTransition(am, from, to, lax);
6227 if (to == NULL)
6228 return(am->state);
6229 return(to);
6230 }
6231
6232 /**
6233 * xmlAutomataNewCounter:
6234 * @am: an automata
6235 * @min: the minimal value on the counter
6236 * @max: the maximal value on the counter
6237 *
6238 * Create a new counter
6239 *
6240 * Returns the counter number or -1 in case of error
6241 */
6242 int
xmlAutomataNewCounter(xmlAutomataPtr am,int min,int max)6243 xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) {
6244 int ret;
6245
6246 if (am == NULL)
6247 return(-1);
6248
6249 ret = xmlRegGetCounter(am);
6250 if (ret < 0)
6251 return(-1);
6252 am->counters[ret].min = min;
6253 am->counters[ret].max = max;
6254 return(ret);
6255 }
6256
6257 /**
6258 * xmlAutomataNewCountedTrans:
6259 * @am: an automata
6260 * @from: the starting point of the transition
6261 * @to: the target point of the transition or NULL
6262 * @counter: the counter associated to that transition
6263 *
6264 * If @to is NULL, this creates first a new target state in the automata
6265 * and then adds an epsilon transition from the @from state to the target state
6266 * which will increment the counter provided
6267 *
6268 * Returns the target state or NULL in case of error
6269 */
6270 xmlAutomataStatePtr
xmlAutomataNewCountedTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,int counter)6271 xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6272 xmlAutomataStatePtr to, int counter) {
6273 if ((am == NULL) || (from == NULL) || (counter < 0))
6274 return(NULL);
6275 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter);
6276 if (to == NULL)
6277 return(am->state);
6278 return(to);
6279 }
6280
6281 /**
6282 * xmlAutomataNewCounterTrans:
6283 * @am: an automata
6284 * @from: the starting point of the transition
6285 * @to: the target point of the transition or NULL
6286 * @counter: the counter associated to that transition
6287 *
6288 * If @to is NULL, this creates first a new target state in the automata
6289 * and then adds an epsilon transition from the @from state to the target state
6290 * which will be allowed only if the counter is within the right range.
6291 *
6292 * Returns the target state or NULL in case of error
6293 */
6294 xmlAutomataStatePtr
xmlAutomataNewCounterTrans(xmlAutomataPtr am,xmlAutomataStatePtr from,xmlAutomataStatePtr to,int counter)6295 xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6296 xmlAutomataStatePtr to, int counter) {
6297 if ((am == NULL) || (from == NULL) || (counter < 0))
6298 return(NULL);
6299 xmlFAGenerateCountedTransition(am, from, to, counter);
6300 if (to == NULL)
6301 return(am->state);
6302 return(to);
6303 }
6304
6305 /**
6306 * xmlAutomataCompile:
6307 * @am: an automata
6308 *
6309 * Compile the automata into a Reg Exp ready for being executed.
6310 * The automata should be free after this point.
6311 *
6312 * Returns the compiled regexp or NULL in case of error
6313 */
6314 xmlRegexpPtr
xmlAutomataCompile(xmlAutomataPtr am)6315 xmlAutomataCompile(xmlAutomataPtr am) {
6316 xmlRegexpPtr ret;
6317
6318 if ((am == NULL) || (am->error != 0)) return(NULL);
6319 xmlFAEliminateEpsilonTransitions(am);
6320 /* xmlFAComputesDeterminism(am); */
6321 ret = xmlRegEpxFromParse(am);
6322
6323 return(ret);
6324 }
6325
6326 /**
6327 * xmlAutomataIsDeterminist:
6328 * @am: an automata
6329 *
6330 * Checks if an automata is determinist.
6331 *
6332 * Returns 1 if true, 0 if not, and -1 in case of error
6333 */
6334 int
xmlAutomataIsDeterminist(xmlAutomataPtr am)6335 xmlAutomataIsDeterminist(xmlAutomataPtr am) {
6336 int ret;
6337
6338 if (am == NULL)
6339 return(-1);
6340
6341 ret = xmlFAComputesDeterminism(am);
6342 return(ret);
6343 }
6344 #endif /* LIBXML_AUTOMATA_ENABLED */
6345
6346 #ifdef LIBXML_EXPR_ENABLED
6347 /************************************************************************
6348 * *
6349 * Formal Expression handling code *
6350 * *
6351 ************************************************************************/
6352 /************************************************************************
6353 * *
6354 * Expression handling context *
6355 * *
6356 ************************************************************************/
6357
6358 struct _xmlExpCtxt {
6359 xmlDictPtr dict;
6360 xmlExpNodePtr *table;
6361 int size;
6362 int nbElems;
6363 int nb_nodes;
6364 int maxNodes;
6365 const char *expr;
6366 const char *cur;
6367 int nb_cons;
6368 int tabSize;
6369 };
6370
6371 /**
6372 * xmlExpNewCtxt:
6373 * @maxNodes: the maximum number of nodes
6374 * @dict: optional dictionary to use internally
6375 *
6376 * Creates a new context for manipulating expressions
6377 *
6378 * Returns the context or NULL in case of error
6379 */
6380 xmlExpCtxtPtr
xmlExpNewCtxt(int maxNodes,xmlDictPtr dict)6381 xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) {
6382 xmlExpCtxtPtr ret;
6383 int size = 256;
6384
6385 if (maxNodes <= 4096)
6386 maxNodes = 4096;
6387
6388 ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt));
6389 if (ret == NULL)
6390 return(NULL);
6391 memset(ret, 0, sizeof(xmlExpCtxt));
6392 ret->size = size;
6393 ret->nbElems = 0;
6394 ret->maxNodes = maxNodes;
6395 ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr));
6396 if (ret->table == NULL) {
6397 xmlFree(ret);
6398 return(NULL);
6399 }
6400 memset(ret->table, 0, size * sizeof(xmlExpNodePtr));
6401 if (dict == NULL) {
6402 ret->dict = xmlDictCreate();
6403 if (ret->dict == NULL) {
6404 xmlFree(ret->table);
6405 xmlFree(ret);
6406 return(NULL);
6407 }
6408 } else {
6409 ret->dict = dict;
6410 xmlDictReference(ret->dict);
6411 }
6412 return(ret);
6413 }
6414
6415 /**
6416 * xmlExpFreeCtxt:
6417 * @ctxt: an expression context
6418 *
6419 * Free an expression context
6420 */
6421 void
xmlExpFreeCtxt(xmlExpCtxtPtr ctxt)6422 xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) {
6423 if (ctxt == NULL)
6424 return;
6425 xmlDictFree(ctxt->dict);
6426 if (ctxt->table != NULL)
6427 xmlFree(ctxt->table);
6428 xmlFree(ctxt);
6429 }
6430
6431 /************************************************************************
6432 * *
6433 * Structure associated to an expression node *
6434 * *
6435 ************************************************************************/
6436 #define MAX_NODES 10000
6437
6438 /* #define DEBUG_DERIV */
6439
6440 /*
6441 * TODO:
6442 * - Wildcards
6443 * - public API for creation
6444 *
6445 * Started
6446 * - regression testing
6447 *
6448 * Done
6449 * - split into module and test tool
6450 * - memleaks
6451 */
6452
6453 typedef enum {
6454 XML_EXP_NILABLE = (1 << 0)
6455 } xmlExpNodeInfo;
6456
6457 #define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE)
6458
6459 struct _xmlExpNode {
6460 unsigned char type;/* xmlExpNodeType */
6461 unsigned char info;/* OR of xmlExpNodeInfo */
6462 unsigned short key; /* the hash key */
6463 unsigned int ref; /* The number of references */
6464 int c_max; /* the maximum length it can consume */
6465 xmlExpNodePtr exp_left;
6466 xmlExpNodePtr next;/* the next node in the hash table or free list */
6467 union {
6468 struct {
6469 int f_min;
6470 int f_max;
6471 } count;
6472 struct {
6473 xmlExpNodePtr f_right;
6474 } children;
6475 const xmlChar *f_str;
6476 } field;
6477 };
6478
6479 #define exp_min field.count.f_min
6480 #define exp_max field.count.f_max
6481 /* #define exp_left field.children.f_left */
6482 #define exp_right field.children.f_right
6483 #define exp_str field.f_str
6484
6485 static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type);
6486 static xmlExpNode forbiddenExpNode = {
6487 XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6488 };
6489 xmlExpNodePtr forbiddenExp = &forbiddenExpNode;
6490 static xmlExpNode emptyExpNode = {
6491 XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6492 };
6493 xmlExpNodePtr emptyExp = &emptyExpNode;
6494
6495 /************************************************************************
6496 * *
6497 * The custom hash table for unicity and canonicalization *
6498 * of sub-expressions pointers *
6499 * *
6500 ************************************************************************/
6501 /*
6502 * xmlExpHashNameComputeKey:
6503 * Calculate the hash key for a token
6504 */
6505 static unsigned short
xmlExpHashNameComputeKey(const xmlChar * name)6506 xmlExpHashNameComputeKey(const xmlChar *name) {
6507 unsigned short value = 0L;
6508 char ch;
6509
6510 if (name != NULL) {
6511 value += 30 * (*name);
6512 while ((ch = *name++) != 0) {
6513 value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch);
6514 }
6515 }
6516 return (value);
6517 }
6518
6519 /*
6520 * xmlExpHashComputeKey:
6521 * Calculate the hash key for a compound expression
6522 */
6523 static unsigned short
xmlExpHashComputeKey(xmlExpNodeType type,xmlExpNodePtr left,xmlExpNodePtr right)6524 xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left,
6525 xmlExpNodePtr right) {
6526 unsigned long value;
6527 unsigned short ret;
6528
6529 switch (type) {
6530 case XML_EXP_SEQ:
6531 value = left->key;
6532 value += right->key;
6533 value *= 3;
6534 ret = (unsigned short) value;
6535 break;
6536 case XML_EXP_OR:
6537 value = left->key;
6538 value += right->key;
6539 value *= 7;
6540 ret = (unsigned short) value;
6541 break;
6542 case XML_EXP_COUNT:
6543 value = left->key;
6544 value += right->key;
6545 ret = (unsigned short) value;
6546 break;
6547 default:
6548 ret = 0;
6549 }
6550 return(ret);
6551 }
6552
6553
6554 static xmlExpNodePtr
xmlExpNewNode(xmlExpCtxtPtr ctxt,xmlExpNodeType type)6555 xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) {
6556 xmlExpNodePtr ret;
6557
6558 if (ctxt->nb_nodes >= MAX_NODES)
6559 return(NULL);
6560 ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode));
6561 if (ret == NULL)
6562 return(NULL);
6563 memset(ret, 0, sizeof(xmlExpNode));
6564 ret->type = type;
6565 ret->next = NULL;
6566 ctxt->nb_nodes++;
6567 ctxt->nb_cons++;
6568 return(ret);
6569 }
6570
6571 /**
6572 * xmlExpHashGetEntry:
6573 * @table: the hash table
6574 *
6575 * Get the unique entry from the hash table. The entry is created if
6576 * needed. @left and @right are consumed, i.e. their ref count will
6577 * be decremented by the operation.
6578 *
6579 * Returns the pointer or NULL in case of error
6580 */
6581 static xmlExpNodePtr
xmlExpHashGetEntry(xmlExpCtxtPtr ctxt,xmlExpNodeType type,xmlExpNodePtr left,xmlExpNodePtr right,const xmlChar * name,int min,int max)6582 xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type,
6583 xmlExpNodePtr left, xmlExpNodePtr right,
6584 const xmlChar *name, int min, int max) {
6585 unsigned short kbase, key;
6586 xmlExpNodePtr entry;
6587 xmlExpNodePtr insert;
6588
6589 if (ctxt == NULL)
6590 return(NULL);
6591
6592 /*
6593 * Check for duplicate and insertion location.
6594 */
6595 if (type == XML_EXP_ATOM) {
6596 kbase = xmlExpHashNameComputeKey(name);
6597 } else if (type == XML_EXP_COUNT) {
6598 /* COUNT reduction rule 1 */
6599 /* a{1} -> a */
6600 if (min == max) {
6601 if (min == 1) {
6602 return(left);
6603 }
6604 if (min == 0) {
6605 xmlExpFree(ctxt, left);
6606 return(emptyExp);
6607 }
6608 }
6609 if (min < 0) {
6610 xmlExpFree(ctxt, left);
6611 return(forbiddenExp);
6612 }
6613 if (max == -1)
6614 kbase = min + 79;
6615 else
6616 kbase = max - min;
6617 kbase += left->key;
6618 } else if (type == XML_EXP_OR) {
6619 /* Forbid reduction rules */
6620 if (left->type == XML_EXP_FORBID) {
6621 xmlExpFree(ctxt, left);
6622 return(right);
6623 }
6624 if (right->type == XML_EXP_FORBID) {
6625 xmlExpFree(ctxt, right);
6626 return(left);
6627 }
6628
6629 /* OR reduction rule 1 */
6630 /* a | a reduced to a */
6631 if (left == right) {
6632 xmlExpFree(ctxt, right);
6633 return(left);
6634 }
6635 /* OR canonicalization rule 1 */
6636 /* linearize (a | b) | c into a | (b | c) */
6637 if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) {
6638 xmlExpNodePtr tmp = left;
6639 left = right;
6640 right = tmp;
6641 }
6642 /* OR reduction rule 2 */
6643 /* a | (a | b) and b | (a | b) are reduced to a | b */
6644 if (right->type == XML_EXP_OR) {
6645 if ((left == right->exp_left) ||
6646 (left == right->exp_right)) {
6647 xmlExpFree(ctxt, left);
6648 return(right);
6649 }
6650 }
6651 /* OR canonicalization rule 2 */
6652 /* linearize (a | b) | c into a | (b | c) */
6653 if (left->type == XML_EXP_OR) {
6654 xmlExpNodePtr tmp;
6655
6656 /* OR canonicalization rule 2 */
6657 if ((left->exp_right->type != XML_EXP_OR) &&
6658 (left->exp_right->key < left->exp_left->key)) {
6659 tmp = left->exp_right;
6660 left->exp_right = left->exp_left;
6661 left->exp_left = tmp;
6662 }
6663 left->exp_right->ref++;
6664 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right,
6665 NULL, 0, 0);
6666 left->exp_left->ref++;
6667 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp,
6668 NULL, 0, 0);
6669
6670 xmlExpFree(ctxt, left);
6671 return(tmp);
6672 }
6673 if (right->type == XML_EXP_OR) {
6674 /* Ordering in the tree */
6675 /* C | (A | B) -> A | (B | C) */
6676 if (left->key > right->exp_right->key) {
6677 xmlExpNodePtr tmp;
6678 right->exp_right->ref++;
6679 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right,
6680 left, NULL, 0, 0);
6681 right->exp_left->ref++;
6682 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6683 tmp, NULL, 0, 0);
6684 xmlExpFree(ctxt, right);
6685 return(tmp);
6686 }
6687 /* Ordering in the tree */
6688 /* B | (A | C) -> A | (B | C) */
6689 if (left->key > right->exp_left->key) {
6690 xmlExpNodePtr tmp;
6691 right->exp_right->ref++;
6692 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left,
6693 right->exp_right, NULL, 0, 0);
6694 right->exp_left->ref++;
6695 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6696 tmp, NULL, 0, 0);
6697 xmlExpFree(ctxt, right);
6698 return(tmp);
6699 }
6700 }
6701 /* we know both types are != XML_EXP_OR here */
6702 else if (left->key > right->key) {
6703 xmlExpNodePtr tmp = left;
6704 left = right;
6705 right = tmp;
6706 }
6707 kbase = xmlExpHashComputeKey(type, left, right);
6708 } else if (type == XML_EXP_SEQ) {
6709 /* Forbid reduction rules */
6710 if (left->type == XML_EXP_FORBID) {
6711 xmlExpFree(ctxt, right);
6712 return(left);
6713 }
6714 if (right->type == XML_EXP_FORBID) {
6715 xmlExpFree(ctxt, left);
6716 return(right);
6717 }
6718 /* Empty reduction rules */
6719 if (right->type == XML_EXP_EMPTY) {
6720 return(left);
6721 }
6722 if (left->type == XML_EXP_EMPTY) {
6723 return(right);
6724 }
6725 kbase = xmlExpHashComputeKey(type, left, right);
6726 } else
6727 return(NULL);
6728
6729 key = kbase % ctxt->size;
6730 if (ctxt->table[key] != NULL) {
6731 for (insert = ctxt->table[key]; insert != NULL;
6732 insert = insert->next) {
6733 if ((insert->key == kbase) &&
6734 (insert->type == type)) {
6735 if (type == XML_EXP_ATOM) {
6736 if (name == insert->exp_str) {
6737 insert->ref++;
6738 return(insert);
6739 }
6740 } else if (type == XML_EXP_COUNT) {
6741 if ((insert->exp_min == min) && (insert->exp_max == max) &&
6742 (insert->exp_left == left)) {
6743 insert->ref++;
6744 left->ref--;
6745 return(insert);
6746 }
6747 } else if ((insert->exp_left == left) &&
6748 (insert->exp_right == right)) {
6749 insert->ref++;
6750 left->ref--;
6751 right->ref--;
6752 return(insert);
6753 }
6754 }
6755 }
6756 }
6757
6758 entry = xmlExpNewNode(ctxt, type);
6759 if (entry == NULL)
6760 return(NULL);
6761 entry->key = kbase;
6762 if (type == XML_EXP_ATOM) {
6763 entry->exp_str = name;
6764 entry->c_max = 1;
6765 } else if (type == XML_EXP_COUNT) {
6766 entry->exp_min = min;
6767 entry->exp_max = max;
6768 entry->exp_left = left;
6769 if ((min == 0) || (IS_NILLABLE(left)))
6770 entry->info |= XML_EXP_NILABLE;
6771 if (max < 0)
6772 entry->c_max = -1;
6773 else
6774 entry->c_max = max * entry->exp_left->c_max;
6775 } else {
6776 entry->exp_left = left;
6777 entry->exp_right = right;
6778 if (type == XML_EXP_OR) {
6779 if ((IS_NILLABLE(left)) || (IS_NILLABLE(right)))
6780 entry->info |= XML_EXP_NILABLE;
6781 if ((entry->exp_left->c_max == -1) ||
6782 (entry->exp_right->c_max == -1))
6783 entry->c_max = -1;
6784 else if (entry->exp_left->c_max > entry->exp_right->c_max)
6785 entry->c_max = entry->exp_left->c_max;
6786 else
6787 entry->c_max = entry->exp_right->c_max;
6788 } else {
6789 if ((IS_NILLABLE(left)) && (IS_NILLABLE(right)))
6790 entry->info |= XML_EXP_NILABLE;
6791 if ((entry->exp_left->c_max == -1) ||
6792 (entry->exp_right->c_max == -1))
6793 entry->c_max = -1;
6794 else
6795 entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max;
6796 }
6797 }
6798 entry->ref = 1;
6799 if (ctxt->table[key] != NULL)
6800 entry->next = ctxt->table[key];
6801
6802 ctxt->table[key] = entry;
6803 ctxt->nbElems++;
6804
6805 return(entry);
6806 }
6807
6808 /**
6809 * xmlExpFree:
6810 * @ctxt: the expression context
6811 * @exp: the expression
6812 *
6813 * Dereference the expression
6814 */
6815 void
xmlExpFree(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp)6816 xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) {
6817 if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp))
6818 return;
6819 exp->ref--;
6820 if (exp->ref == 0) {
6821 unsigned short key;
6822
6823 /* Unlink it first from the hash table */
6824 key = exp->key % ctxt->size;
6825 if (ctxt->table[key] == exp) {
6826 ctxt->table[key] = exp->next;
6827 } else {
6828 xmlExpNodePtr tmp;
6829
6830 tmp = ctxt->table[key];
6831 while (tmp != NULL) {
6832 if (tmp->next == exp) {
6833 tmp->next = exp->next;
6834 break;
6835 }
6836 tmp = tmp->next;
6837 }
6838 }
6839
6840 if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) {
6841 xmlExpFree(ctxt, exp->exp_left);
6842 xmlExpFree(ctxt, exp->exp_right);
6843 } else if (exp->type == XML_EXP_COUNT) {
6844 xmlExpFree(ctxt, exp->exp_left);
6845 }
6846 xmlFree(exp);
6847 ctxt->nb_nodes--;
6848 }
6849 }
6850
6851 /**
6852 * xmlExpRef:
6853 * @exp: the expression
6854 *
6855 * Increase the reference count of the expression
6856 */
6857 void
xmlExpRef(xmlExpNodePtr exp)6858 xmlExpRef(xmlExpNodePtr exp) {
6859 if (exp != NULL)
6860 exp->ref++;
6861 }
6862
6863 /**
6864 * xmlExpNewAtom:
6865 * @ctxt: the expression context
6866 * @name: the atom name
6867 * @len: the atom name length in byte (or -1);
6868 *
6869 * Get the atom associated to this name from that context
6870 *
6871 * Returns the node or NULL in case of error
6872 */
6873 xmlExpNodePtr
xmlExpNewAtom(xmlExpCtxtPtr ctxt,const xmlChar * name,int len)6874 xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) {
6875 if ((ctxt == NULL) || (name == NULL))
6876 return(NULL);
6877 name = xmlDictLookup(ctxt->dict, name, len);
6878 if (name == NULL)
6879 return(NULL);
6880 return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0));
6881 }
6882
6883 /**
6884 * xmlExpNewOr:
6885 * @ctxt: the expression context
6886 * @left: left expression
6887 * @right: right expression
6888 *
6889 * Get the atom associated to the choice @left | @right
6890 * Note that @left and @right are consumed in the operation, to keep
6891 * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
6892 * this is true even in case of failure (unless ctxt == NULL).
6893 *
6894 * Returns the node or NULL in case of error
6895 */
6896 xmlExpNodePtr
xmlExpNewOr(xmlExpCtxtPtr ctxt,xmlExpNodePtr left,xmlExpNodePtr right)6897 xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
6898 if (ctxt == NULL)
6899 return(NULL);
6900 if ((left == NULL) || (right == NULL)) {
6901 xmlExpFree(ctxt, left);
6902 xmlExpFree(ctxt, right);
6903 return(NULL);
6904 }
6905 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0));
6906 }
6907
6908 /**
6909 * xmlExpNewSeq:
6910 * @ctxt: the expression context
6911 * @left: left expression
6912 * @right: right expression
6913 *
6914 * Get the atom associated to the sequence @left , @right
6915 * Note that @left and @right are consumed in the operation, to keep
6916 * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
6917 * this is true even in case of failure (unless ctxt == NULL).
6918 *
6919 * Returns the node or NULL in case of error
6920 */
6921 xmlExpNodePtr
xmlExpNewSeq(xmlExpCtxtPtr ctxt,xmlExpNodePtr left,xmlExpNodePtr right)6922 xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
6923 if (ctxt == NULL)
6924 return(NULL);
6925 if ((left == NULL) || (right == NULL)) {
6926 xmlExpFree(ctxt, left);
6927 xmlExpFree(ctxt, right);
6928 return(NULL);
6929 }
6930 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0));
6931 }
6932
6933 /**
6934 * xmlExpNewRange:
6935 * @ctxt: the expression context
6936 * @subset: the expression to be repeated
6937 * @min: the lower bound for the repetition
6938 * @max: the upper bound for the repetition, -1 means infinite
6939 *
6940 * Get the atom associated to the range (@subset){@min, @max}
6941 * Note that @subset is consumed in the operation, to keep
6942 * an handle on it use xmlExpRef() and use xmlExpFree() to release it,
6943 * this is true even in case of failure (unless ctxt == NULL).
6944 *
6945 * Returns the node or NULL in case of error
6946 */
6947 xmlExpNodePtr
xmlExpNewRange(xmlExpCtxtPtr ctxt,xmlExpNodePtr subset,int min,int max)6948 xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) {
6949 if (ctxt == NULL)
6950 return(NULL);
6951 if ((subset == NULL) || (min < 0) || (max < -1) ||
6952 ((max >= 0) && (min > max))) {
6953 xmlExpFree(ctxt, subset);
6954 return(NULL);
6955 }
6956 return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset,
6957 NULL, NULL, min, max));
6958 }
6959
6960 /************************************************************************
6961 * *
6962 * Public API for operations on expressions *
6963 * *
6964 ************************************************************************/
6965
6966 static int
xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** list,int len,int nb)6967 xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
6968 const xmlChar**list, int len, int nb) {
6969 int tmp, tmp2;
6970 tail:
6971 switch (exp->type) {
6972 case XML_EXP_EMPTY:
6973 return(0);
6974 case XML_EXP_ATOM:
6975 for (tmp = 0;tmp < nb;tmp++)
6976 if (list[tmp] == exp->exp_str)
6977 return(0);
6978 if (nb >= len)
6979 return(-2);
6980 list[nb] = exp->exp_str;
6981 return(1);
6982 case XML_EXP_COUNT:
6983 exp = exp->exp_left;
6984 goto tail;
6985 case XML_EXP_SEQ:
6986 case XML_EXP_OR:
6987 tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb);
6988 if (tmp < 0)
6989 return(tmp);
6990 tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len,
6991 nb + tmp);
6992 if (tmp2 < 0)
6993 return(tmp2);
6994 return(tmp + tmp2);
6995 }
6996 return(-1);
6997 }
6998
6999 /**
7000 * xmlExpGetLanguage:
7001 * @ctxt: the expression context
7002 * @exp: the expression
7003 * @langList: where to store the tokens
7004 * @len: the allocated length of @list
7005 *
7006 * Find all the strings used in @exp and store them in @list
7007 *
7008 * Returns the number of unique strings found, -1 in case of errors and
7009 * -2 if there is more than @len strings
7010 */
7011 int
xmlExpGetLanguage(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** langList,int len)7012 xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7013 const xmlChar**langList, int len) {
7014 if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0))
7015 return(-1);
7016 return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0));
7017 }
7018
7019 static int
xmlExpGetStartInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** list,int len,int nb)7020 xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7021 const xmlChar**list, int len, int nb) {
7022 int tmp, tmp2;
7023 tail:
7024 switch (exp->type) {
7025 case XML_EXP_FORBID:
7026 return(0);
7027 case XML_EXP_EMPTY:
7028 return(0);
7029 case XML_EXP_ATOM:
7030 for (tmp = 0;tmp < nb;tmp++)
7031 if (list[tmp] == exp->exp_str)
7032 return(0);
7033 if (nb >= len)
7034 return(-2);
7035 list[nb] = exp->exp_str;
7036 return(1);
7037 case XML_EXP_COUNT:
7038 exp = exp->exp_left;
7039 goto tail;
7040 case XML_EXP_SEQ:
7041 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
7042 if (tmp < 0)
7043 return(tmp);
7044 if (IS_NILLABLE(exp->exp_left)) {
7045 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
7046 nb + tmp);
7047 if (tmp2 < 0)
7048 return(tmp2);
7049 tmp += tmp2;
7050 }
7051 return(tmp);
7052 case XML_EXP_OR:
7053 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
7054 if (tmp < 0)
7055 return(tmp);
7056 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
7057 nb + tmp);
7058 if (tmp2 < 0)
7059 return(tmp2);
7060 return(tmp + tmp2);
7061 }
7062 return(-1);
7063 }
7064
7065 /**
7066 * xmlExpGetStart:
7067 * @ctxt: the expression context
7068 * @exp: the expression
7069 * @tokList: where to store the tokens
7070 * @len: the allocated length of @list
7071 *
7072 * Find all the strings that appears at the start of the languages
7073 * accepted by @exp and store them in @list. E.g. for (a, b) | c
7074 * it will return the list [a, c]
7075 *
7076 * Returns the number of unique strings found, -1 in case of errors and
7077 * -2 if there is more than @len strings
7078 */
7079 int
xmlExpGetStart(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar ** tokList,int len)7080 xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7081 const xmlChar**tokList, int len) {
7082 if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0))
7083 return(-1);
7084 return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0));
7085 }
7086
7087 /**
7088 * xmlExpIsNillable:
7089 * @exp: the expression
7090 *
7091 * Finds if the expression is nillable, i.e. if it accepts the empty sequence
7092 *
7093 * Returns 1 if nillable, 0 if not and -1 in case of error
7094 */
7095 int
xmlExpIsNillable(xmlExpNodePtr exp)7096 xmlExpIsNillable(xmlExpNodePtr exp) {
7097 if (exp == NULL)
7098 return(-1);
7099 return(IS_NILLABLE(exp) != 0);
7100 }
7101
7102 static xmlExpNodePtr
xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar * str)7103 xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str)
7104 {
7105 xmlExpNodePtr ret;
7106
7107 switch (exp->type) {
7108 case XML_EXP_EMPTY:
7109 return(forbiddenExp);
7110 case XML_EXP_FORBID:
7111 return(forbiddenExp);
7112 case XML_EXP_ATOM:
7113 if (exp->exp_str == str) {
7114 #ifdef DEBUG_DERIV
7115 printf("deriv atom: equal => Empty\n");
7116 #endif
7117 ret = emptyExp;
7118 } else {
7119 #ifdef DEBUG_DERIV
7120 printf("deriv atom: mismatch => forbid\n");
7121 #endif
7122 /* TODO wildcards here */
7123 ret = forbiddenExp;
7124 }
7125 return(ret);
7126 case XML_EXP_OR: {
7127 xmlExpNodePtr tmp;
7128
7129 #ifdef DEBUG_DERIV
7130 printf("deriv or: => or(derivs)\n");
7131 #endif
7132 tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7133 if (tmp == NULL) {
7134 return(NULL);
7135 }
7136 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7137 if (ret == NULL) {
7138 xmlExpFree(ctxt, tmp);
7139 return(NULL);
7140 }
7141 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret,
7142 NULL, 0, 0);
7143 return(ret);
7144 }
7145 case XML_EXP_SEQ:
7146 #ifdef DEBUG_DERIV
7147 printf("deriv seq: starting with left\n");
7148 #endif
7149 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7150 if (ret == NULL) {
7151 return(NULL);
7152 } else if (ret == forbiddenExp) {
7153 if (IS_NILLABLE(exp->exp_left)) {
7154 #ifdef DEBUG_DERIV
7155 printf("deriv seq: left failed but nillable\n");
7156 #endif
7157 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7158 }
7159 } else {
7160 #ifdef DEBUG_DERIV
7161 printf("deriv seq: left match => sequence\n");
7162 #endif
7163 exp->exp_right->ref++;
7164 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right,
7165 NULL, 0, 0);
7166 }
7167 return(ret);
7168 case XML_EXP_COUNT: {
7169 int min, max;
7170 xmlExpNodePtr tmp;
7171
7172 if (exp->exp_max == 0)
7173 return(forbiddenExp);
7174 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7175 if (ret == NULL)
7176 return(NULL);
7177 if (ret == forbiddenExp) {
7178 #ifdef DEBUG_DERIV
7179 printf("deriv count: pattern mismatch => forbid\n");
7180 #endif
7181 return(ret);
7182 }
7183 if (exp->exp_max == 1)
7184 return(ret);
7185 if (exp->exp_max < 0) /* unbounded */
7186 max = -1;
7187 else
7188 max = exp->exp_max - 1;
7189 if (exp->exp_min > 0)
7190 min = exp->exp_min - 1;
7191 else
7192 min = 0;
7193 exp->exp_left->ref++;
7194 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL,
7195 NULL, min, max);
7196 if (ret == emptyExp) {
7197 #ifdef DEBUG_DERIV
7198 printf("deriv count: match to empty => new count\n");
7199 #endif
7200 return(tmp);
7201 }
7202 #ifdef DEBUG_DERIV
7203 printf("deriv count: match => sequence with new count\n");
7204 #endif
7205 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp,
7206 NULL, 0, 0));
7207 }
7208 }
7209 return(NULL);
7210 }
7211
7212 /**
7213 * xmlExpStringDerive:
7214 * @ctxt: the expression context
7215 * @exp: the expression
7216 * @str: the string
7217 * @len: the string len in bytes if available
7218 *
7219 * Do one step of Brzozowski derivation of the expression @exp with
7220 * respect to the input string
7221 *
7222 * Returns the resulting expression or NULL in case of internal error
7223 */
7224 xmlExpNodePtr
xmlExpStringDerive(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,const xmlChar * str,int len)7225 xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7226 const xmlChar *str, int len) {
7227 const xmlChar *input;
7228
7229 if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) {
7230 return(NULL);
7231 }
7232 /*
7233 * check the string is in the dictionary, if yes use an interned
7234 * copy, otherwise we know it's not an acceptable input
7235 */
7236 input = xmlDictExists(ctxt->dict, str, len);
7237 if (input == NULL) {
7238 return(forbiddenExp);
7239 }
7240 return(xmlExpStringDeriveInt(ctxt, exp, input));
7241 }
7242
7243 static int
xmlExpCheckCard(xmlExpNodePtr exp,xmlExpNodePtr sub)7244 xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) {
7245 int ret = 1;
7246
7247 if (sub->c_max == -1) {
7248 if (exp->c_max != -1)
7249 ret = 0;
7250 } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) {
7251 ret = 0;
7252 }
7253 #if 0
7254 if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp)))
7255 ret = 0;
7256 #endif
7257 return(ret);
7258 }
7259
7260 static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7261 xmlExpNodePtr sub);
7262 /**
7263 * xmlExpDivide:
7264 * @ctxt: the expressions context
7265 * @exp: the englobing expression
7266 * @sub: the subexpression
7267 * @mult: the multiple expression
7268 * @remain: the remain from the derivation of the multiple
7269 *
7270 * Check if exp is a multiple of sub, i.e. if there is a finite number n
7271 * so that sub{n} subsume exp
7272 *
7273 * Returns the multiple value if successful, 0 if it is not a multiple
7274 * and -1 in case of internal error.
7275 */
7276
7277 static int
xmlExpDivide(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub,xmlExpNodePtr * mult,xmlExpNodePtr * remain)7278 xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub,
7279 xmlExpNodePtr *mult, xmlExpNodePtr *remain) {
7280 int i;
7281 xmlExpNodePtr tmp, tmp2;
7282
7283 if (mult != NULL) *mult = NULL;
7284 if (remain != NULL) *remain = NULL;
7285 if (exp->c_max == -1) return(0);
7286 if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0);
7287
7288 for (i = 1;i <= exp->c_max;i++) {
7289 sub->ref++;
7290 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7291 sub, NULL, NULL, i, i);
7292 if (tmp == NULL) {
7293 return(-1);
7294 }
7295 if (!xmlExpCheckCard(tmp, exp)) {
7296 xmlExpFree(ctxt, tmp);
7297 continue;
7298 }
7299 tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp);
7300 if (tmp2 == NULL) {
7301 xmlExpFree(ctxt, tmp);
7302 return(-1);
7303 }
7304 if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) {
7305 if (remain != NULL)
7306 *remain = tmp2;
7307 else
7308 xmlExpFree(ctxt, tmp2);
7309 if (mult != NULL)
7310 *mult = tmp;
7311 else
7312 xmlExpFree(ctxt, tmp);
7313 #ifdef DEBUG_DERIV
7314 printf("Divide succeeded %d\n", i);
7315 #endif
7316 return(i);
7317 }
7318 xmlExpFree(ctxt, tmp);
7319 xmlExpFree(ctxt, tmp2);
7320 }
7321 #ifdef DEBUG_DERIV
7322 printf("Divide failed\n");
7323 #endif
7324 return(0);
7325 }
7326
7327 /**
7328 * xmlExpExpDeriveInt:
7329 * @ctxt: the expressions context
7330 * @exp: the englobing expression
7331 * @sub: the subexpression
7332 *
7333 * Try to do a step of Brzozowski derivation but at a higher level
7334 * the input being a subexpression.
7335 *
7336 * Returns the resulting expression or NULL in case of internal error
7337 */
7338 static xmlExpNodePtr
xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub)7339 xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7340 xmlExpNodePtr ret, tmp, tmp2, tmp3;
7341 const xmlChar **tab;
7342 int len, i;
7343
7344 /*
7345 * In case of equality and if the expression can only consume a finite
7346 * amount, then the derivation is empty
7347 */
7348 if ((exp == sub) && (exp->c_max >= 0)) {
7349 #ifdef DEBUG_DERIV
7350 printf("Equal(exp, sub) and finite -> Empty\n");
7351 #endif
7352 return(emptyExp);
7353 }
7354 /*
7355 * decompose sub sequence first
7356 */
7357 if (sub->type == XML_EXP_EMPTY) {
7358 #ifdef DEBUG_DERIV
7359 printf("Empty(sub) -> Empty\n");
7360 #endif
7361 exp->ref++;
7362 return(exp);
7363 }
7364 if (sub->type == XML_EXP_SEQ) {
7365 #ifdef DEBUG_DERIV
7366 printf("Seq(sub) -> decompose\n");
7367 #endif
7368 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7369 if (tmp == NULL)
7370 return(NULL);
7371 if (tmp == forbiddenExp)
7372 return(tmp);
7373 ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right);
7374 xmlExpFree(ctxt, tmp);
7375 return(ret);
7376 }
7377 if (sub->type == XML_EXP_OR) {
7378 #ifdef DEBUG_DERIV
7379 printf("Or(sub) -> decompose\n");
7380 #endif
7381 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7382 if (tmp == forbiddenExp)
7383 return(tmp);
7384 if (tmp == NULL)
7385 return(NULL);
7386 ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right);
7387 if ((ret == NULL) || (ret == forbiddenExp)) {
7388 xmlExpFree(ctxt, tmp);
7389 return(ret);
7390 }
7391 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0));
7392 }
7393 if (!xmlExpCheckCard(exp, sub)) {
7394 #ifdef DEBUG_DERIV
7395 printf("CheckCard(exp, sub) failed -> Forbid\n");
7396 #endif
7397 return(forbiddenExp);
7398 }
7399 switch (exp->type) {
7400 case XML_EXP_EMPTY:
7401 if (sub == emptyExp)
7402 return(emptyExp);
7403 #ifdef DEBUG_DERIV
7404 printf("Empty(exp) -> Forbid\n");
7405 #endif
7406 return(forbiddenExp);
7407 case XML_EXP_FORBID:
7408 #ifdef DEBUG_DERIV
7409 printf("Forbid(exp) -> Forbid\n");
7410 #endif
7411 return(forbiddenExp);
7412 case XML_EXP_ATOM:
7413 if (sub->type == XML_EXP_ATOM) {
7414 /* TODO: handle wildcards */
7415 if (exp->exp_str == sub->exp_str) {
7416 #ifdef DEBUG_DERIV
7417 printf("Atom match -> Empty\n");
7418 #endif
7419 return(emptyExp);
7420 }
7421 #ifdef DEBUG_DERIV
7422 printf("Atom mismatch -> Forbid\n");
7423 #endif
7424 return(forbiddenExp);
7425 }
7426 if ((sub->type == XML_EXP_COUNT) &&
7427 (sub->exp_max == 1) &&
7428 (sub->exp_left->type == XML_EXP_ATOM)) {
7429 /* TODO: handle wildcards */
7430 if (exp->exp_str == sub->exp_left->exp_str) {
7431 #ifdef DEBUG_DERIV
7432 printf("Atom match -> Empty\n");
7433 #endif
7434 return(emptyExp);
7435 }
7436 #ifdef DEBUG_DERIV
7437 printf("Atom mismatch -> Forbid\n");
7438 #endif
7439 return(forbiddenExp);
7440 }
7441 #ifdef DEBUG_DERIV
7442 printf("Complex exp vs Atom -> Forbid\n");
7443 #endif
7444 return(forbiddenExp);
7445 case XML_EXP_SEQ:
7446 /* try to get the sequence consumed only if possible */
7447 if (xmlExpCheckCard(exp->exp_left, sub)) {
7448 /* See if the sequence can be consumed directly */
7449 #ifdef DEBUG_DERIV
7450 printf("Seq trying left only\n");
7451 #endif
7452 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7453 if ((ret != forbiddenExp) && (ret != NULL)) {
7454 #ifdef DEBUG_DERIV
7455 printf("Seq trying left only worked\n");
7456 #endif
7457 /*
7458 * TODO: assumption here that we are determinist
7459 * i.e. we won't get to a nillable exp left
7460 * subset which could be matched by the right
7461 * part too.
7462 * e.g.: (a | b)+,(a | c) and 'a+,a'
7463 */
7464 exp->exp_right->ref++;
7465 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7466 exp->exp_right, NULL, 0, 0));
7467 }
7468 #ifdef DEBUG_DERIV
7469 } else {
7470 printf("Seq: left too short\n");
7471 #endif
7472 }
7473 /* Try instead to decompose */
7474 if (sub->type == XML_EXP_COUNT) {
7475 int min, max;
7476
7477 #ifdef DEBUG_DERIV
7478 printf("Seq: sub is a count\n");
7479 #endif
7480 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7481 if (ret == NULL)
7482 return(NULL);
7483 if (ret != forbiddenExp) {
7484 #ifdef DEBUG_DERIV
7485 printf("Seq , Count match on left\n");
7486 #endif
7487 if (sub->exp_max < 0)
7488 max = -1;
7489 else
7490 max = sub->exp_max -1;
7491 if (sub->exp_min > 0)
7492 min = sub->exp_min -1;
7493 else
7494 min = 0;
7495 exp->exp_right->ref++;
7496 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7497 exp->exp_right, NULL, 0, 0);
7498 if (tmp == NULL)
7499 return(NULL);
7500
7501 sub->exp_left->ref++;
7502 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7503 sub->exp_left, NULL, NULL, min, max);
7504 if (tmp2 == NULL) {
7505 xmlExpFree(ctxt, tmp);
7506 return(NULL);
7507 }
7508 ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7509 xmlExpFree(ctxt, tmp);
7510 xmlExpFree(ctxt, tmp2);
7511 return(ret);
7512 }
7513 }
7514 /* we made no progress on structured operations */
7515 break;
7516 case XML_EXP_OR:
7517 #ifdef DEBUG_DERIV
7518 printf("Or , trying both side\n");
7519 #endif
7520 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7521 if (ret == NULL)
7522 return(NULL);
7523 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub);
7524 if (tmp == NULL) {
7525 xmlExpFree(ctxt, ret);
7526 return(NULL);
7527 }
7528 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0));
7529 case XML_EXP_COUNT: {
7530 int min, max;
7531
7532 if (sub->type == XML_EXP_COUNT) {
7533 /*
7534 * Try to see if the loop is completely subsumed
7535 */
7536 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7537 if (tmp == NULL)
7538 return(NULL);
7539 if (tmp == forbiddenExp) {
7540 int mult;
7541
7542 #ifdef DEBUG_DERIV
7543 printf("Count, Count inner don't subsume\n");
7544 #endif
7545 mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left,
7546 NULL, &tmp);
7547 if (mult <= 0) {
7548 #ifdef DEBUG_DERIV
7549 printf("Count, Count not multiple => forbidden\n");
7550 #endif
7551 return(forbiddenExp);
7552 }
7553 if (sub->exp_max == -1) {
7554 max = -1;
7555 if (exp->exp_max == -1) {
7556 if (exp->exp_min <= sub->exp_min * mult)
7557 min = 0;
7558 else
7559 min = exp->exp_min - sub->exp_min * mult;
7560 } else {
7561 #ifdef DEBUG_DERIV
7562 printf("Count, Count finite can't subsume infinite\n");
7563 #endif
7564 xmlExpFree(ctxt, tmp);
7565 return(forbiddenExp);
7566 }
7567 } else {
7568 if (exp->exp_max == -1) {
7569 #ifdef DEBUG_DERIV
7570 printf("Infinite loop consume mult finite loop\n");
7571 #endif
7572 if (exp->exp_min > sub->exp_min * mult) {
7573 max = -1;
7574 min = exp->exp_min - sub->exp_min * mult;
7575 } else {
7576 max = -1;
7577 min = 0;
7578 }
7579 } else {
7580 if (exp->exp_max < sub->exp_max * mult) {
7581 #ifdef DEBUG_DERIV
7582 printf("loops max mult mismatch => forbidden\n");
7583 #endif
7584 xmlExpFree(ctxt, tmp);
7585 return(forbiddenExp);
7586 }
7587 if (sub->exp_max * mult > exp->exp_min)
7588 min = 0;
7589 else
7590 min = exp->exp_min - sub->exp_max * mult;
7591 max = exp->exp_max - sub->exp_max * mult;
7592 }
7593 }
7594 } else if (!IS_NILLABLE(tmp)) {
7595 /*
7596 * TODO: loop here to try to grow if working on finite
7597 * blocks.
7598 */
7599 #ifdef DEBUG_DERIV
7600 printf("Count, Count remain not nillable => forbidden\n");
7601 #endif
7602 xmlExpFree(ctxt, tmp);
7603 return(forbiddenExp);
7604 } else if (sub->exp_max == -1) {
7605 if (exp->exp_max == -1) {
7606 if (exp->exp_min <= sub->exp_min) {
7607 #ifdef DEBUG_DERIV
7608 printf("Infinite loops Okay => COUNT(0,Inf)\n");
7609 #endif
7610 max = -1;
7611 min = 0;
7612 } else {
7613 #ifdef DEBUG_DERIV
7614 printf("Infinite loops min => Count(X,Inf)\n");
7615 #endif
7616 max = -1;
7617 min = exp->exp_min - sub->exp_min;
7618 }
7619 } else if (exp->exp_min > sub->exp_min) {
7620 #ifdef DEBUG_DERIV
7621 printf("loops min mismatch 1 => forbidden ???\n");
7622 #endif
7623 xmlExpFree(ctxt, tmp);
7624 return(forbiddenExp);
7625 } else {
7626 max = -1;
7627 min = 0;
7628 }
7629 } else {
7630 if (exp->exp_max == -1) {
7631 #ifdef DEBUG_DERIV
7632 printf("Infinite loop consume finite loop\n");
7633 #endif
7634 if (exp->exp_min > sub->exp_min) {
7635 max = -1;
7636 min = exp->exp_min - sub->exp_min;
7637 } else {
7638 max = -1;
7639 min = 0;
7640 }
7641 } else {
7642 if (exp->exp_max < sub->exp_max) {
7643 #ifdef DEBUG_DERIV
7644 printf("loops max mismatch => forbidden\n");
7645 #endif
7646 xmlExpFree(ctxt, tmp);
7647 return(forbiddenExp);
7648 }
7649 if (sub->exp_max > exp->exp_min)
7650 min = 0;
7651 else
7652 min = exp->exp_min - sub->exp_max;
7653 max = exp->exp_max - sub->exp_max;
7654 }
7655 }
7656 #ifdef DEBUG_DERIV
7657 printf("loops match => SEQ(COUNT())\n");
7658 #endif
7659 exp->exp_left->ref++;
7660 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7661 NULL, NULL, min, max);
7662 if (tmp2 == NULL) {
7663 return(NULL);
7664 }
7665 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7666 NULL, 0, 0);
7667 return(ret);
7668 }
7669 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7670 if (tmp == NULL)
7671 return(NULL);
7672 if (tmp == forbiddenExp) {
7673 #ifdef DEBUG_DERIV
7674 printf("loop mismatch => forbidden\n");
7675 #endif
7676 return(forbiddenExp);
7677 }
7678 if (exp->exp_min > 0)
7679 min = exp->exp_min - 1;
7680 else
7681 min = 0;
7682 if (exp->exp_max < 0)
7683 max = -1;
7684 else
7685 max = exp->exp_max - 1;
7686
7687 #ifdef DEBUG_DERIV
7688 printf("loop match => SEQ(COUNT())\n");
7689 #endif
7690 exp->exp_left->ref++;
7691 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7692 NULL, NULL, min, max);
7693 if (tmp2 == NULL)
7694 return(NULL);
7695 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7696 NULL, 0, 0);
7697 return(ret);
7698 }
7699 }
7700
7701 #ifdef DEBUG_DERIV
7702 printf("Fallback to derivative\n");
7703 #endif
7704 if (IS_NILLABLE(sub)) {
7705 if (!(IS_NILLABLE(exp)))
7706 return(forbiddenExp);
7707 else
7708 ret = emptyExp;
7709 } else
7710 ret = NULL;
7711 /*
7712 * here the structured derivation made no progress so
7713 * we use the default token based derivation to force one more step
7714 */
7715 if (ctxt->tabSize == 0)
7716 ctxt->tabSize = 40;
7717
7718 tab = (const xmlChar **) xmlMalloc(ctxt->tabSize *
7719 sizeof(const xmlChar *));
7720 if (tab == NULL) {
7721 return(NULL);
7722 }
7723
7724 /*
7725 * collect all the strings accepted by the subexpression on input
7726 */
7727 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7728 while (len < 0) {
7729 const xmlChar **temp;
7730 temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 *
7731 sizeof(const xmlChar *));
7732 if (temp == NULL) {
7733 xmlFree((xmlChar **) tab);
7734 return(NULL);
7735 }
7736 tab = temp;
7737 ctxt->tabSize *= 2;
7738 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7739 }
7740 for (i = 0;i < len;i++) {
7741 tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]);
7742 if ((tmp == NULL) || (tmp == forbiddenExp)) {
7743 xmlExpFree(ctxt, ret);
7744 xmlFree((xmlChar **) tab);
7745 return(tmp);
7746 }
7747 tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]);
7748 if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) {
7749 xmlExpFree(ctxt, tmp);
7750 xmlExpFree(ctxt, ret);
7751 xmlFree((xmlChar **) tab);
7752 return(tmp);
7753 }
7754 tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7755 xmlExpFree(ctxt, tmp);
7756 xmlExpFree(ctxt, tmp2);
7757
7758 if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) {
7759 xmlExpFree(ctxt, ret);
7760 xmlFree((xmlChar **) tab);
7761 return(tmp3);
7762 }
7763
7764 if (ret == NULL)
7765 ret = tmp3;
7766 else {
7767 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0);
7768 if (ret == NULL) {
7769 xmlFree((xmlChar **) tab);
7770 return(NULL);
7771 }
7772 }
7773 }
7774 xmlFree((xmlChar **) tab);
7775 return(ret);
7776 }
7777
7778 /**
7779 * xmlExpExpDerive:
7780 * @ctxt: the expressions context
7781 * @exp: the englobing expression
7782 * @sub: the subexpression
7783 *
7784 * Evaluates the expression resulting from @exp consuming a sub expression @sub
7785 * Based on algebraic derivation and sometimes direct Brzozowski derivation
7786 * it usually takes less than linear time and can handle expressions generating
7787 * infinite languages.
7788 *
7789 * Returns the resulting expression or NULL in case of internal error, the
7790 * result must be freed
7791 */
7792 xmlExpNodePtr
xmlExpExpDerive(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub)7793 xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7794 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7795 return(NULL);
7796
7797 /*
7798 * O(1) speedups
7799 */
7800 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7801 #ifdef DEBUG_DERIV
7802 printf("Sub nillable and not exp : can't subsume\n");
7803 #endif
7804 return(forbiddenExp);
7805 }
7806 if (xmlExpCheckCard(exp, sub) == 0) {
7807 #ifdef DEBUG_DERIV
7808 printf("sub generate longer sequences than exp : can't subsume\n");
7809 #endif
7810 return(forbiddenExp);
7811 }
7812 return(xmlExpExpDeriveInt(ctxt, exp, sub));
7813 }
7814
7815 /**
7816 * xmlExpSubsume:
7817 * @ctxt: the expressions context
7818 * @exp: the englobing expression
7819 * @sub: the subexpression
7820 *
7821 * Check whether @exp accepts all the languages accepted by @sub
7822 * the input being a subexpression.
7823 *
7824 * Returns 1 if true 0 if false and -1 in case of failure.
7825 */
7826 int
xmlExpSubsume(xmlExpCtxtPtr ctxt,xmlExpNodePtr exp,xmlExpNodePtr sub)7827 xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7828 xmlExpNodePtr tmp;
7829
7830 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7831 return(-1);
7832
7833 /*
7834 * TODO: speedup by checking the language of sub is a subset of the
7835 * language of exp
7836 */
7837 /*
7838 * O(1) speedups
7839 */
7840 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7841 #ifdef DEBUG_DERIV
7842 printf("Sub nillable and not exp : can't subsume\n");
7843 #endif
7844 return(0);
7845 }
7846 if (xmlExpCheckCard(exp, sub) == 0) {
7847 #ifdef DEBUG_DERIV
7848 printf("sub generate longer sequences than exp : can't subsume\n");
7849 #endif
7850 return(0);
7851 }
7852 tmp = xmlExpExpDeriveInt(ctxt, exp, sub);
7853 #ifdef DEBUG_DERIV
7854 printf("Result derivation :\n");
7855 PRINT_EXP(tmp);
7856 #endif
7857 if (tmp == NULL)
7858 return(-1);
7859 if (tmp == forbiddenExp)
7860 return(0);
7861 if (tmp == emptyExp)
7862 return(1);
7863 if ((tmp != NULL) && (IS_NILLABLE(tmp))) {
7864 xmlExpFree(ctxt, tmp);
7865 return(1);
7866 }
7867 xmlExpFree(ctxt, tmp);
7868 return(0);
7869 }
7870
7871 /************************************************************************
7872 * *
7873 * Parsing expression *
7874 * *
7875 ************************************************************************/
7876
7877 static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt);
7878
7879 #undef CUR
7880 #define CUR (*ctxt->cur)
7881 #undef NEXT
7882 #define NEXT ctxt->cur++;
7883 #undef IS_BLANK
7884 #define IS_BLANK(c) ((c == ' ') || (c == '\n') || (c == '\r') || (c == '\t'))
7885 #define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++;
7886
7887 static int
xmlExpParseNumber(xmlExpCtxtPtr ctxt)7888 xmlExpParseNumber(xmlExpCtxtPtr ctxt) {
7889 int ret = 0;
7890
7891 SKIP_BLANKS
7892 if (CUR == '*') {
7893 NEXT
7894 return(-1);
7895 }
7896 if ((CUR < '0') || (CUR > '9'))
7897 return(-1);
7898 while ((CUR >= '0') && (CUR <= '9')) {
7899 ret = ret * 10 + (CUR - '0');
7900 NEXT
7901 }
7902 return(ret);
7903 }
7904
7905 static xmlExpNodePtr
xmlExpParseOr(xmlExpCtxtPtr ctxt)7906 xmlExpParseOr(xmlExpCtxtPtr ctxt) {
7907 const char *base;
7908 xmlExpNodePtr ret;
7909 const xmlChar *val;
7910
7911 SKIP_BLANKS
7912 base = ctxt->cur;
7913 if (*ctxt->cur == '(') {
7914 NEXT
7915 ret = xmlExpParseExpr(ctxt);
7916 SKIP_BLANKS
7917 if (*ctxt->cur != ')') {
7918 fprintf(stderr, "unbalanced '(' : %s\n", base);
7919 xmlExpFree(ctxt, ret);
7920 return(NULL);
7921 }
7922 NEXT;
7923 SKIP_BLANKS
7924 goto parse_quantifier;
7925 }
7926 while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') &&
7927 (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') &&
7928 (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}'))
7929 NEXT;
7930 val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base);
7931 if (val == NULL)
7932 return(NULL);
7933 ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0);
7934 if (ret == NULL)
7935 return(NULL);
7936 SKIP_BLANKS
7937 parse_quantifier:
7938 if (CUR == '{') {
7939 int min, max;
7940
7941 NEXT
7942 min = xmlExpParseNumber(ctxt);
7943 if (min < 0) {
7944 xmlExpFree(ctxt, ret);
7945 return(NULL);
7946 }
7947 SKIP_BLANKS
7948 if (CUR == ',') {
7949 NEXT
7950 max = xmlExpParseNumber(ctxt);
7951 SKIP_BLANKS
7952 } else
7953 max = min;
7954 if (CUR != '}') {
7955 xmlExpFree(ctxt, ret);
7956 return(NULL);
7957 }
7958 NEXT
7959 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7960 min, max);
7961 SKIP_BLANKS
7962 } else if (CUR == '?') {
7963 NEXT
7964 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7965 0, 1);
7966 SKIP_BLANKS
7967 } else if (CUR == '+') {
7968 NEXT
7969 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7970 1, -1);
7971 SKIP_BLANKS
7972 } else if (CUR == '*') {
7973 NEXT
7974 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7975 0, -1);
7976 SKIP_BLANKS
7977 }
7978 return(ret);
7979 }
7980
7981
7982 static xmlExpNodePtr
xmlExpParseSeq(xmlExpCtxtPtr ctxt)7983 xmlExpParseSeq(xmlExpCtxtPtr ctxt) {
7984 xmlExpNodePtr ret, right;
7985
7986 ret = xmlExpParseOr(ctxt);
7987 SKIP_BLANKS
7988 while (CUR == '|') {
7989 NEXT
7990 right = xmlExpParseOr(ctxt);
7991 if (right == NULL) {
7992 xmlExpFree(ctxt, ret);
7993 return(NULL);
7994 }
7995 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0);
7996 if (ret == NULL)
7997 return(NULL);
7998 }
7999 return(ret);
8000 }
8001
8002 static xmlExpNodePtr
xmlExpParseExpr(xmlExpCtxtPtr ctxt)8003 xmlExpParseExpr(xmlExpCtxtPtr ctxt) {
8004 xmlExpNodePtr ret, right;
8005
8006 ret = xmlExpParseSeq(ctxt);
8007 SKIP_BLANKS
8008 while (CUR == ',') {
8009 NEXT
8010 right = xmlExpParseSeq(ctxt);
8011 if (right == NULL) {
8012 xmlExpFree(ctxt, ret);
8013 return(NULL);
8014 }
8015 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0);
8016 if (ret == NULL)
8017 return(NULL);
8018 }
8019 return(ret);
8020 }
8021
8022 /**
8023 * xmlExpParse:
8024 * @ctxt: the expressions context
8025 * @expr: the 0 terminated string
8026 *
8027 * Minimal parser for regexps, it understand the following constructs
8028 * - string terminals
8029 * - choice operator |
8030 * - sequence operator ,
8031 * - subexpressions (...)
8032 * - usual cardinality operators + * and ?
8033 * - finite sequences { min, max }
8034 * - infinite sequences { min, * }
8035 * There is minimal checkings made especially no checking on strings values
8036 *
8037 * Returns a new expression or NULL in case of failure
8038 */
8039 xmlExpNodePtr
xmlExpParse(xmlExpCtxtPtr ctxt,const char * expr)8040 xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) {
8041 xmlExpNodePtr ret;
8042
8043 ctxt->expr = expr;
8044 ctxt->cur = expr;
8045
8046 ret = xmlExpParseExpr(ctxt);
8047 SKIP_BLANKS
8048 if (*ctxt->cur != 0) {
8049 xmlExpFree(ctxt, ret);
8050 return(NULL);
8051 }
8052 return(ret);
8053 }
8054
8055 static void
xmlExpDumpInt(xmlBufferPtr buf,xmlExpNodePtr expr,int glob)8056 xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) {
8057 xmlExpNodePtr c;
8058
8059 if (expr == NULL) return;
8060 if (glob) xmlBufferWriteChar(buf, "(");
8061 switch (expr->type) {
8062 case XML_EXP_EMPTY:
8063 xmlBufferWriteChar(buf, "empty");
8064 break;
8065 case XML_EXP_FORBID:
8066 xmlBufferWriteChar(buf, "forbidden");
8067 break;
8068 case XML_EXP_ATOM:
8069 xmlBufferWriteCHAR(buf, expr->exp_str);
8070 break;
8071 case XML_EXP_SEQ:
8072 c = expr->exp_left;
8073 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8074 xmlExpDumpInt(buf, c, 1);
8075 else
8076 xmlExpDumpInt(buf, c, 0);
8077 xmlBufferWriteChar(buf, " , ");
8078 c = expr->exp_right;
8079 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8080 xmlExpDumpInt(buf, c, 1);
8081 else
8082 xmlExpDumpInt(buf, c, 0);
8083 break;
8084 case XML_EXP_OR:
8085 c = expr->exp_left;
8086 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8087 xmlExpDumpInt(buf, c, 1);
8088 else
8089 xmlExpDumpInt(buf, c, 0);
8090 xmlBufferWriteChar(buf, " | ");
8091 c = expr->exp_right;
8092 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8093 xmlExpDumpInt(buf, c, 1);
8094 else
8095 xmlExpDumpInt(buf, c, 0);
8096 break;
8097 case XML_EXP_COUNT: {
8098 char rep[40];
8099
8100 c = expr->exp_left;
8101 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
8102 xmlExpDumpInt(buf, c, 1);
8103 else
8104 xmlExpDumpInt(buf, c, 0);
8105 if ((expr->exp_min == 0) && (expr->exp_max == 1)) {
8106 rep[0] = '?';
8107 rep[1] = 0;
8108 } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) {
8109 rep[0] = '*';
8110 rep[1] = 0;
8111 } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) {
8112 rep[0] = '+';
8113 rep[1] = 0;
8114 } else if (expr->exp_max == expr->exp_min) {
8115 snprintf(rep, 39, "{%d}", expr->exp_min);
8116 } else if (expr->exp_max < 0) {
8117 snprintf(rep, 39, "{%d,inf}", expr->exp_min);
8118 } else {
8119 snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max);
8120 }
8121 rep[39] = 0;
8122 xmlBufferWriteChar(buf, rep);
8123 break;
8124 }
8125 default:
8126 fprintf(stderr, "Error in tree\n");
8127 }
8128 if (glob)
8129 xmlBufferWriteChar(buf, ")");
8130 }
8131 /**
8132 * xmlExpDump:
8133 * @buf: a buffer to receive the output
8134 * @expr: the compiled expression
8135 *
8136 * Serialize the expression as compiled to the buffer
8137 */
8138 void
xmlExpDump(xmlBufferPtr buf,xmlExpNodePtr expr)8139 xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) {
8140 if ((buf == NULL) || (expr == NULL))
8141 return;
8142 xmlExpDumpInt(buf, expr, 0);
8143 }
8144
8145 /**
8146 * xmlExpMaxToken:
8147 * @expr: a compiled expression
8148 *
8149 * Indicate the maximum number of input a expression can accept
8150 *
8151 * Returns the maximum length or -1 in case of error
8152 */
8153 int
xmlExpMaxToken(xmlExpNodePtr expr)8154 xmlExpMaxToken(xmlExpNodePtr expr) {
8155 if (expr == NULL)
8156 return(-1);
8157 return(expr->c_max);
8158 }
8159
8160 /**
8161 * xmlExpCtxtNbNodes:
8162 * @ctxt: an expression context
8163 *
8164 * Debugging facility provides the number of allocated nodes at a that point
8165 *
8166 * Returns the number of nodes in use or -1 in case of error
8167 */
8168 int
xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt)8169 xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) {
8170 if (ctxt == NULL)
8171 return(-1);
8172 return(ctxt->nb_nodes);
8173 }
8174
8175 /**
8176 * xmlExpCtxtNbCons:
8177 * @ctxt: an expression context
8178 *
8179 * Debugging facility provides the number of allocated nodes over lifetime
8180 *
8181 * Returns the number of nodes ever allocated or -1 in case of error
8182 */
8183 int
xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt)8184 xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) {
8185 if (ctxt == NULL)
8186 return(-1);
8187 return(ctxt->nb_cons);
8188 }
8189
8190 #endif /* LIBXML_EXPR_ENABLED */
8191 #define bottom_xmlregexp
8192 #include "elfgcchack.h"
8193 #endif /* LIBXML_REGEXP_ENABLED */
8194