1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 //
4 // file: repattrn.cpp
5 //
6 /*
7 ***************************************************************************
8 * Copyright (C) 2002-2016 International Business Machines Corporation
9 * and others. All rights reserved.
10 ***************************************************************************
11 */
12
13 #include "unicode/utypes.h"
14
15 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
16
17 #include "unicode/regex.h"
18 #include "unicode/uclean.h"
19 #include "cmemory.h"
20 #include "cstr.h"
21 #include "uassert.h"
22 #include "uhash.h"
23 #include "uvector.h"
24 #include "uvectr32.h"
25 #include "uvectr64.h"
26 #include "regexcmp.h"
27 #include "regeximp.h"
28 #include "regexst.h"
29
30 U_NAMESPACE_BEGIN
31
32 //--------------------------------------------------------------------------
33 //
34 // RegexPattern Default Constructor
35 //
36 //--------------------------------------------------------------------------
RegexPattern()37 RegexPattern::RegexPattern() {
38 // Init all of this instances data.
39 init();
40 }
41
42
43 //--------------------------------------------------------------------------
44 //
45 // Copy Constructor Note: This is a rather inefficient implementation,
46 // but it probably doesn't matter.
47 //
48 //--------------------------------------------------------------------------
// Copy constructor: bring this object to its default state first, then let
// the assignment operator perform the actual member-by-member copy.
RegexPattern::RegexPattern(const RegexPattern &other) : UObject(other) {
    init();
    operator=(other);
}
53
54
55
56 //--------------------------------------------------------------------------
57 //
58 // Assignment Operator
59 //
60 //--------------------------------------------------------------------------
// Assignment operator.
//
// Releases whatever this pattern currently owns, re-initializes it, and then
// deep-copies the state of `other`. Nothing is reported through the return
// value: any failure is recorded in fDeferredStatus and copying stops at that
// point, leaving this object only partially filled in.
//
// Returns *this in every case.
RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
    if (this == &other) {
        // Source and destination are the same.  Don't do anything.
        return *this;
    }

    // Clean out any previous contents of object being assigned to.
    zap();

    // Give target object a default initialization.
    init();
    if (U_FAILURE(fDeferredStatus)) {
        // init() could not set up one of the member containers. Keep that
        // error instead of overwriting it with other's status below: the
        // copy code dereferences those containers unconditionally.
        return *this;
    }

    // Copy simple fields
    fDeferredStatus = other.fDeferredStatus;
    if (U_FAILURE(fDeferredStatus)) {
        return *this;
    }

    // Copy the pattern text. A pattern either owns a UnicodeString (with
    // fPattern opened over it), or has only a UText, or has neither.
    if (other.fPatternString != NULL) {
        fPatternString = new UnicodeString(*(other.fPatternString));
        if (fPatternString == NULL) {
            fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
        } else {
            fPattern = utext_openConstUnicodeString(NULL, fPatternString, &fDeferredStatus);
        }
    } else if (other.fPattern != NULL) {
        // NOTE(review): utext_clone() appears to dereference its source
        // unconditionally, hence the NULL guard above - confirm.
        fPattern = utext_clone(fPattern, other.fPattern, FALSE, TRUE, &fDeferredStatus);
    }
    // Otherwise other holds no pattern text at all; fPattern and
    // fPatternString keep the NULL values that init() gave them.
    if (U_FAILURE(fDeferredStatus)) {
        return *this;
    }

    fFlags            = other.fFlags;
    fLiteralText      = other.fLiteralText;
    fMinMatchLen      = other.fMinMatchLen;
    fFrameSize        = other.fFrameSize;
    fDataSize         = other.fDataSize;

    fStartType        = other.fStartType;
    fInitialStringIdx = other.fInitialStringIdx;
    fInitialStringLen = other.fInitialStringLen;
    *fInitialChars    = *other.fInitialChars;
    fInitialChar      = other.fInitialChar;
    *fInitialChars8   = *other.fInitialChars8;
    fNeedsAltInput    = other.fNeedsAltInput;

    // Copy the pattern.  It's just values, nothing deep to copy.
    fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus);
    fGroupMap->assign(*other.fGroupMap, fDeferredStatus);

    // Copy the Unicode Sets.
    //   Could be made more efficient if the sets were reference counted and shared,
    //   but I doubt that pattern copying will be particularly common.
    //   Note:  init() already added an empty element zero to fSets
    const int32_t numSets = other.fSets->size();
    fSets8 = new Regex8BitSet[numSets];
    if (fSets8 == NULL) {
        fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
        return *this;
    }
    for (int32_t i = 1; i < numSets; i++) {
        if (U_FAILURE(fDeferredStatus)) {
            return *this;
        }
        const UnicodeSet *sourceSet = static_cast<const UnicodeSet *>(other.fSets->elementAt(i));
        UnicodeSet *newSet = new UnicodeSet(*sourceSet);
        if (newSet == NULL) {
            fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
            return *this;
        }
        const int32_t sizeBeforeAdd = fSets->size();
        fSets->addElement(newSet, fDeferredStatus);
        if (fSets->size() == sizeBeforeAdd) {
            // The vector did not store the new set, so zap() will never see
            // it; free it here rather than leaking it.
            delete newSet;
            if (U_SUCCESS(fDeferredStatus)) {
                fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
            }
            return *this;
        }
        fSets8[i] = other.fSets8[i];
    }
    if (U_FAILURE(fDeferredStatus)) {
        return *this;
    }

    // Copy the named capture group hash map.
    if (other.fNamedCaptureMap != nullptr && initNamedCaptureMap()) {
        int32_t hashPos = UHASH_FIRST;
        while (const UHashElement *hashEl = uhash_nextElement(other.fNamedCaptureMap, &hashPos)) {
            if (U_FAILURE(fDeferredStatus)) {
                break;
            }
            const UnicodeString *name = (const UnicodeString *)hashEl->key.pointer;
            UnicodeString *key = new UnicodeString(*name);
            int32_t val = hashEl->value.integer;
            if (key == NULL) {
                fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
            } else {
                // The map was given a key deleter by initNamedCaptureMap(),
                // so ownership of `key` passes to the hash table here.
                uhash_puti(fNamedCaptureMap, key, val, &fDeferredStatus);
            }
        }
    }
    return *this;
}
157
158
159 //--------------------------------------------------------------------------
160 //
161 // init Shared initialization for use by constructors.
162 // Bring an uninitialized RegexPattern up to a default state.
163 //
164 //--------------------------------------------------------------------------
init()165 void RegexPattern::init() {
166 fFlags = 0;
167 fCompiledPat = 0;
168 fLiteralText.remove();
169 fSets = NULL;
170 fSets8 = NULL;
171 fDeferredStatus = U_ZERO_ERROR;
172 fMinMatchLen = 0;
173 fFrameSize = 0;
174 fDataSize = 0;
175 fGroupMap = NULL;
176 fStartType = START_NO_INFO;
177 fInitialStringIdx = 0;
178 fInitialStringLen = 0;
179 fInitialChars = NULL;
180 fInitialChar = 0;
181 fInitialChars8 = NULL;
182 fNeedsAltInput = FALSE;
183 fNamedCaptureMap = NULL;
184
185 fPattern = NULL; // will be set later
186 fPatternString = NULL; // may be set later
187 fCompiledPat = new UVector64(fDeferredStatus);
188 fGroupMap = new UVector32(fDeferredStatus);
189 fSets = new UVector(fDeferredStatus);
190 fInitialChars = new UnicodeSet;
191 fInitialChars8 = new Regex8BitSet;
192 if (U_FAILURE(fDeferredStatus)) {
193 return;
194 }
195 if (fCompiledPat == NULL || fGroupMap == NULL || fSets == NULL ||
196 fInitialChars == NULL || fInitialChars8 == NULL) {
197 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
198 return;
199 }
200
201 // Slot zero of the vector of sets is reserved. Fill it here.
202 fSets->addElement((int32_t)0, fDeferredStatus);
203 }
204
205
initNamedCaptureMap()206 bool RegexPattern::initNamedCaptureMap() {
207 if (fNamedCaptureMap) {
208 return true;
209 }
210 fNamedCaptureMap = uhash_openSize(uhash_hashUnicodeString, // Key hash function
211 uhash_compareUnicodeString, // Key comparator function
212 uhash_compareLong, // Value comparator function
213 7, // Initial table capacity
214 &fDeferredStatus);
215 if (U_FAILURE(fDeferredStatus)) {
216 return false;
217 }
218
219 // fNamedCaptureMap owns its key strings, type (UnicodeString *)
220 uhash_setKeyDeleter(fNamedCaptureMap, uprv_deleteUObject);
221 return true;
222 }
223
224 //--------------------------------------------------------------------------
225 //
226 // zap Delete everything owned by this RegexPattern.
227 //
228 //--------------------------------------------------------------------------
zap()229 void RegexPattern::zap() {
230 delete fCompiledPat;
231 fCompiledPat = NULL;
232 int i;
233 for (i=1; i<fSets->size(); i++) {
234 UnicodeSet *s;
235 s = (UnicodeSet *)fSets->elementAt(i);
236 if (s != NULL) {
237 delete s;
238 }
239 }
240 delete fSets;
241 fSets = NULL;
242 delete[] fSets8;
243 fSets8 = NULL;
244 delete fGroupMap;
245 fGroupMap = NULL;
246 delete fInitialChars;
247 fInitialChars = NULL;
248 delete fInitialChars8;
249 fInitialChars8 = NULL;
250 if (fPattern != NULL) {
251 utext_close(fPattern);
252 fPattern = NULL;
253 }
254 if (fPatternString != NULL) {
255 delete fPatternString;
256 fPatternString = NULL;
257 }
258 if (fNamedCaptureMap != NULL) {
259 uhash_close(fNamedCaptureMap);
260 fNamedCaptureMap = NULL;
261 }
262 }
263
264
265 //--------------------------------------------------------------------------
266 //
267 // Destructor
268 //
269 //--------------------------------------------------------------------------
~RegexPattern()270 RegexPattern::~RegexPattern() {
271 zap();
272 }
273
274
275 //--------------------------------------------------------------------------
276 //
277 // Clone
278 //
279 //--------------------------------------------------------------------------
clone() const280 RegexPattern *RegexPattern::clone() const {
281 RegexPattern *copy = new RegexPattern(*this);
282 return copy;
283 }
284
285
286 //--------------------------------------------------------------------------
287 //
//   operator == (comparison)    Consider two patterns to be == if the
289 // pattern strings and the flags are the same.
290 // Note that pattern strings with the same
291 // characters can still be considered different.
292 //
293 //--------------------------------------------------------------------------
operator ==(const RegexPattern & other) const294 UBool RegexPattern::operator ==(const RegexPattern &other) const {
295 if (this->fFlags == other.fFlags && this->fDeferredStatus == other.fDeferredStatus) {
296 if (this->fPatternString != NULL && other.fPatternString != NULL) {
297 return *(this->fPatternString) == *(other.fPatternString);
298 } else if (this->fPattern == NULL) {
299 if (other.fPattern == NULL) {
300 return TRUE;
301 }
302 } else if (other.fPattern != NULL) {
303 UTEXT_SETNATIVEINDEX(this->fPattern, 0);
304 UTEXT_SETNATIVEINDEX(other.fPattern, 0);
305 return utext_equals(this->fPattern, other.fPattern);
306 }
307 }
308 return FALSE;
309 }
310
311 //---------------------------------------------------------------------
312 //
313 // compile
314 //
315 //---------------------------------------------------------------------
316 RegexPattern * U_EXPORT2
compile(const UnicodeString & regex,uint32_t flags,UParseError & pe,UErrorCode & status)317 RegexPattern::compile(const UnicodeString ®ex,
318 uint32_t flags,
319 UParseError &pe,
320 UErrorCode &status)
321 {
322 if (U_FAILURE(status)) {
323 return NULL;
324 }
325
326 const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
327 UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD |
328 UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES | UREGEX_LITERAL;
329
330 if ((flags & ~allFlags) != 0) {
331 status = U_REGEX_INVALID_FLAG;
332 return NULL;
333 }
334
335 if ((flags & UREGEX_CANON_EQ) != 0) {
336 status = U_REGEX_UNIMPLEMENTED;
337 return NULL;
338 }
339
340 RegexPattern *This = new RegexPattern;
341 if (This == NULL) {
342 status = U_MEMORY_ALLOCATION_ERROR;
343 return NULL;
344 }
345 if (U_FAILURE(This->fDeferredStatus)) {
346 status = This->fDeferredStatus;
347 delete This;
348 return NULL;
349 }
350 This->fFlags = flags;
351
352 RegexCompile compiler(This, status);
353 compiler.compile(regex, pe, status);
354
355 if (U_FAILURE(status)) {
356 delete This;
357 This = NULL;
358 }
359
360 return This;
361 }
362
363
364 //
365 // compile, UText mode
366 //
367 RegexPattern * U_EXPORT2
compile(UText * regex,uint32_t flags,UParseError & pe,UErrorCode & status)368 RegexPattern::compile(UText *regex,
369 uint32_t flags,
370 UParseError &pe,
371 UErrorCode &status)
372 {
373 if (U_FAILURE(status)) {
374 return NULL;
375 }
376
377 const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
378 UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD |
379 UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES | UREGEX_LITERAL;
380
381 if ((flags & ~allFlags) != 0) {
382 status = U_REGEX_INVALID_FLAG;
383 return NULL;
384 }
385
386 if ((flags & UREGEX_CANON_EQ) != 0) {
387 status = U_REGEX_UNIMPLEMENTED;
388 return NULL;
389 }
390
391 RegexPattern *This = new RegexPattern;
392 if (This == NULL) {
393 status = U_MEMORY_ALLOCATION_ERROR;
394 return NULL;
395 }
396 if (U_FAILURE(This->fDeferredStatus)) {
397 status = This->fDeferredStatus;
398 delete This;
399 return NULL;
400 }
401 This->fFlags = flags;
402
403 RegexCompile compiler(This, status);
404 compiler.compile(regex, pe, status);
405
406 if (U_FAILURE(status)) {
407 delete This;
408 This = NULL;
409 }
410
411 return This;
412 }
413
414 //
415 // compile with default flags.
416 //
417 RegexPattern * U_EXPORT2
compile(const UnicodeString & regex,UParseError & pe,UErrorCode & err)418 RegexPattern::compile(const UnicodeString ®ex,
419 UParseError &pe,
420 UErrorCode &err)
421 {
422 return compile(regex, 0, pe, err);
423 }
424
425
426 //
427 // compile with default flags, UText mode
428 //
429 RegexPattern * U_EXPORT2
compile(UText * regex,UParseError & pe,UErrorCode & err)430 RegexPattern::compile(UText *regex,
431 UParseError &pe,
432 UErrorCode &err)
433 {
434 return compile(regex, 0, pe, err);
435 }
436
437
438 //
439 // compile with no UParseErr parameter.
440 //
441 RegexPattern * U_EXPORT2
compile(const UnicodeString & regex,uint32_t flags,UErrorCode & err)442 RegexPattern::compile(const UnicodeString ®ex,
443 uint32_t flags,
444 UErrorCode &err)
445 {
446 UParseError pe;
447 return compile(regex, flags, pe, err);
448 }
449
450
451 //
452 // compile with no UParseErr parameter, UText mode
453 //
454 RegexPattern * U_EXPORT2
compile(UText * regex,uint32_t flags,UErrorCode & err)455 RegexPattern::compile(UText *regex,
456 uint32_t flags,
457 UErrorCode &err)
458 {
459 UParseError pe;
460 return compile(regex, flags, pe, err);
461 }
462
463
464 //---------------------------------------------------------------------
465 //
466 // flags
467 //
468 //---------------------------------------------------------------------
flags() const469 uint32_t RegexPattern::flags() const {
470 return fFlags;
471 }
472
473
474 //---------------------------------------------------------------------
475 //
476 // matcher(UnicodeString, err)
477 //
478 //---------------------------------------------------------------------
matcher(const UnicodeString & input,UErrorCode & status) const479 RegexMatcher *RegexPattern::matcher(const UnicodeString &input,
480 UErrorCode &status) const {
481 RegexMatcher *retMatcher = matcher(status);
482 if (retMatcher != NULL) {
483 retMatcher->fDeferredStatus = status;
484 retMatcher->reset(input);
485 }
486 return retMatcher;
487 }
488
489
490 //---------------------------------------------------------------------
491 //
492 // matcher(status)
493 //
494 //---------------------------------------------------------------------
matcher(UErrorCode & status) const495 RegexMatcher *RegexPattern::matcher(UErrorCode &status) const {
496 RegexMatcher *retMatcher = NULL;
497
498 if (U_FAILURE(status)) {
499 return NULL;
500 }
501 if (U_FAILURE(fDeferredStatus)) {
502 status = fDeferredStatus;
503 return NULL;
504 }
505
506 retMatcher = new RegexMatcher(this);
507 if (retMatcher == NULL) {
508 status = U_MEMORY_ALLOCATION_ERROR;
509 return NULL;
510 }
511 return retMatcher;
512 }
513
514
515
516 //---------------------------------------------------------------------
517 //
518 // matches Convenience function to test for a match, starting
519 // with a pattern string and a data string.
520 //
521 //---------------------------------------------------------------------
matches(const UnicodeString & regex,const UnicodeString & input,UParseError & pe,UErrorCode & status)522 UBool U_EXPORT2 RegexPattern::matches(const UnicodeString ®ex,
523 const UnicodeString &input,
524 UParseError &pe,
525 UErrorCode &status) {
526
527 if (U_FAILURE(status)) {return FALSE;}
528
529 UBool retVal;
530 RegexPattern *pat = NULL;
531 RegexMatcher *matcher = NULL;
532
533 pat = RegexPattern::compile(regex, 0, pe, status);
534 matcher = pat->matcher(input, status);
535 retVal = matcher->matches(status);
536
537 delete matcher;
538 delete pat;
539 return retVal;
540 }
541
542
543 //
544 // matches, UText mode
545 //
matches(UText * regex,UText * input,UParseError & pe,UErrorCode & status)546 UBool U_EXPORT2 RegexPattern::matches(UText *regex,
547 UText *input,
548 UParseError &pe,
549 UErrorCode &status) {
550
551 if (U_FAILURE(status)) {return FALSE;}
552
553 UBool retVal = FALSE;
554 RegexPattern *pat = NULL;
555 RegexMatcher *matcher = NULL;
556
557 pat = RegexPattern::compile(regex, 0, pe, status);
558 matcher = pat->matcher(status);
559 if (U_SUCCESS(status)) {
560 matcher->reset(input);
561 retVal = matcher->matches(status);
562 }
563
564 delete matcher;
565 delete pat;
566 return retVal;
567 }
568
569
570
571
572
573 //---------------------------------------------------------------------
574 //
575 // pattern
576 //
577 //---------------------------------------------------------------------
pattern() const578 UnicodeString RegexPattern::pattern() const {
579 if (fPatternString != NULL) {
580 return *fPatternString;
581 } else if (fPattern == NULL) {
582 return UnicodeString();
583 } else {
584 UErrorCode status = U_ZERO_ERROR;
585 int64_t nativeLen = utext_nativeLength(fPattern);
586 int32_t len16 = utext_extract(fPattern, 0, nativeLen, NULL, 0, &status); // buffer overflow error
587 UnicodeString result;
588
589 status = U_ZERO_ERROR;
590 UChar *resultChars = result.getBuffer(len16);
591 utext_extract(fPattern, 0, nativeLen, resultChars, len16, &status); // unterminated warning
592 result.releaseBuffer(len16);
593
594 return result;
595 }
596 }
597
598
599
600
601 //---------------------------------------------------------------------
602 //
603 // patternText
604 //
605 //---------------------------------------------------------------------
patternText(UErrorCode & status) const606 UText *RegexPattern::patternText(UErrorCode &status) const {
607 if (U_FAILURE(status)) {return NULL;}
608 status = U_ZERO_ERROR;
609
610 if (fPattern != NULL) {
611 return fPattern;
612 } else {
613 RegexStaticSets::initGlobals(&status);
614 return RegexStaticSets::gStaticSets->fEmptyText;
615 }
616 }
617
618
619 //--------------------------------------------------------------------------------
620 //
621 // groupNumberFromName()
622 //
623 //--------------------------------------------------------------------------------
groupNumberFromName(const UnicodeString & groupName,UErrorCode & status) const624 int32_t RegexPattern::groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const {
625 if (U_FAILURE(status)) {
626 return 0;
627 }
628
629 // No need to explicitly check for syntactically valid names.
630 // Invalid ones will never be in the map, and the lookup will fail.
631
632 int32_t number = fNamedCaptureMap ? uhash_geti(fNamedCaptureMap, &groupName) : 0;
633 if (number == 0) {
634 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
635 }
636 return number;
637 }
638
groupNumberFromName(const char * groupName,int32_t nameLength,UErrorCode & status) const639 int32_t RegexPattern::groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const {
640 if (U_FAILURE(status)) {
641 return 0;
642 }
643 UnicodeString name(groupName, nameLength, US_INV);
644 return groupNumberFromName(name, status);
645 }
646
647
648 //---------------------------------------------------------------------
649 //
650 // split
651 //
652 //---------------------------------------------------------------------
split(const UnicodeString & input,UnicodeString dest[],int32_t destCapacity,UErrorCode & status) const653 int32_t RegexPattern::split(const UnicodeString &input,
654 UnicodeString dest[],
655 int32_t destCapacity,
656 UErrorCode &status) const
657 {
658 if (U_FAILURE(status)) {
659 return 0;
660 }
661
662 RegexMatcher m(this);
663 int32_t r = 0;
664 // Check m's status to make sure all is ok.
665 if (U_SUCCESS(m.fDeferredStatus)) {
666 r = m.split(input, dest, destCapacity, status);
667 }
668 return r;
669 }
670
671 //
672 // split, UText mode
673 //
split(UText * input,UText * dest[],int32_t destCapacity,UErrorCode & status) const674 int32_t RegexPattern::split(UText *input,
675 UText *dest[],
676 int32_t destCapacity,
677 UErrorCode &status) const
678 {
679 if (U_FAILURE(status)) {
680 return 0;
681 }
682
683 RegexMatcher m(this);
684 int32_t r = 0;
685 // Check m's status to make sure all is ok.
686 if (U_SUCCESS(m.fDeferredStatus)) {
687 r = m.split(input, dest, destCapacity, status);
688 }
689 return r;
690 }
691
692
693 //---------------------------------------------------------------------
694 //
695 // dump Output the compiled form of the pattern.
696 // Debugging function only.
697 //
698 //---------------------------------------------------------------------
dumpOp(int32_t index) const699 void RegexPattern::dumpOp(int32_t index) const {
700 (void)index; // Suppress warnings in non-debug build.
701 #if defined(REGEX_DEBUG)
702 static const char * const opNames[] = {URX_OPCODE_NAMES};
703 int32_t op = fCompiledPat->elementAti(index);
704 int32_t val = URX_VAL(op);
705 int32_t type = URX_TYPE(op);
706 int32_t pinnedType = type;
707 if ((uint32_t)pinnedType >= UPRV_LENGTHOF(opNames)) {
708 pinnedType = 0;
709 }
710
711 printf("%4d %08x %-15s ", index, op, opNames[pinnedType]);
712 switch (type) {
713 case URX_NOP:
714 case URX_DOTANY:
715 case URX_DOTANY_ALL:
716 case URX_FAIL:
717 case URX_CARET:
718 case URX_DOLLAR:
719 case URX_BACKSLASH_G:
720 case URX_BACKSLASH_X:
721 case URX_END:
722 case URX_DOLLAR_M:
723 case URX_CARET_M:
724 // Types with no operand field of interest.
725 break;
726
727 case URX_RESERVED_OP:
728 case URX_START_CAPTURE:
729 case URX_END_CAPTURE:
730 case URX_STATE_SAVE:
731 case URX_JMP:
732 case URX_JMP_SAV:
733 case URX_JMP_SAV_X:
734 case URX_BACKSLASH_B:
735 case URX_BACKSLASH_BU:
736 case URX_BACKSLASH_D:
737 case URX_BACKSLASH_Z:
738 case URX_STRING_LEN:
739 case URX_CTR_INIT:
740 case URX_CTR_INIT_NG:
741 case URX_CTR_LOOP:
742 case URX_CTR_LOOP_NG:
743 case URX_RELOC_OPRND:
744 case URX_STO_SP:
745 case URX_LD_SP:
746 case URX_BACKREF:
747 case URX_STO_INP_LOC:
748 case URX_JMPX:
749 case URX_LA_START:
750 case URX_LA_END:
751 case URX_BACKREF_I:
752 case URX_LB_START:
753 case URX_LB_CONT:
754 case URX_LB_END:
755 case URX_LBN_CONT:
756 case URX_LBN_END:
757 case URX_LOOP_C:
758 case URX_LOOP_DOT_I:
759 case URX_BACKSLASH_H:
760 case URX_BACKSLASH_R:
761 case URX_BACKSLASH_V:
762 // types with an integer operand field.
763 printf("%d", val);
764 break;
765
766 case URX_ONECHAR:
767 case URX_ONECHAR_I:
768 if (val < 0x20) {
769 printf("%#x", val);
770 } else {
771 printf("'%s'", CStr(UnicodeString(val))());
772 }
773 break;
774
775 case URX_STRING:
776 case URX_STRING_I:
777 {
778 int32_t lengthOp = fCompiledPat->elementAti(index+1);
779 U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
780 int32_t length = URX_VAL(lengthOp);
781 UnicodeString str(fLiteralText, val, length);
782 printf("%s", CStr(str)());
783 }
784 break;
785
786 case URX_SETREF:
787 case URX_LOOP_SR_I:
788 {
789 UnicodeString s;
790 UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
791 set->toPattern(s, TRUE);
792 printf("%s", CStr(s)());
793 }
794 break;
795
796 case URX_STATIC_SETREF:
797 case URX_STAT_SETREF_N:
798 {
799 UnicodeString s;
800 if (val & URX_NEG_SET) {
801 printf("NOT ");
802 val &= ~URX_NEG_SET;
803 }
804 UnicodeSet &set = RegexStaticSets::gStaticSets->fPropSets[val];
805 set.toPattern(s, TRUE);
806 printf("%s", CStr(s)());
807 }
808 break;
809
810
811 default:
812 printf("??????");
813 break;
814 }
815 printf("\n");
816 #endif
817 }
818
819
dumpPattern() const820 void RegexPattern::dumpPattern() const {
821 #if defined(REGEX_DEBUG)
822 int index;
823
824 UnicodeString patStr;
825 for (UChar32 c = utext_next32From(fPattern, 0); c != U_SENTINEL; c = utext_next32(fPattern)) {
826 patStr.append(c);
827 }
828 printf("Original Pattern: \"%s\"\n", CStr(patStr)());
829 printf(" Min Match Length: %d\n", fMinMatchLen);
830 printf(" Match Start Type: %s\n", START_OF_MATCH_STR(fStartType));
831 if (fStartType == START_STRING) {
832 UnicodeString initialString(fLiteralText,fInitialStringIdx, fInitialStringLen);
833 printf(" Initial match string: \"%s\"\n", CStr(initialString)());
834 } else if (fStartType == START_SET) {
835 UnicodeString s;
836 fInitialChars->toPattern(s, TRUE);
837 printf(" Match First Chars: %s\n", CStr(s)());
838
839 } else if (fStartType == START_CHAR) {
840 printf(" First char of Match: ");
841 if (fInitialChar > 0x20) {
842 printf("'%s'\n", CStr(UnicodeString(fInitialChar))());
843 } else {
844 printf("%#x\n", fInitialChar);
845 }
846 }
847
848 printf("Named Capture Groups:\n");
849 if (!fNamedCaptureMap || uhash_count(fNamedCaptureMap) == 0) {
850 printf(" None\n");
851 } else {
852 int32_t pos = UHASH_FIRST;
853 const UHashElement *el = NULL;
854 while ((el = uhash_nextElement(fNamedCaptureMap, &pos))) {
855 const UnicodeString *name = (const UnicodeString *)el->key.pointer;
856 int32_t number = el->value.integer;
857 printf(" %d\t%s\n", number, CStr(*name)());
858 }
859 }
860
861 printf("\nIndex Binary Type Operand\n" \
862 "-------------------------------------------\n");
863 for (index = 0; index<fCompiledPat->size(); index++) {
864 dumpOp(index);
865 }
866 printf("\n\n");
867 #endif
868 }
869
870
871
872 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern)
873
874 U_NAMESPACE_END
875 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
876