1 //
2 // file: repattrn.cpp
3 //
4 /*
5 ***************************************************************************
6 * Copyright (C) 2002-2015 International Business Machines Corporation *
7 * and others. All rights reserved. *
8 ***************************************************************************
9 */
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
14
15 #include "unicode/regex.h"
16 #include "unicode/uclean.h"
17 #include "uassert.h"
18 #include "uhash.h"
19 #include "uvector.h"
20 #include "uvectr32.h"
21 #include "uvectr64.h"
22 #include "regexcmp.h"
23 #include "regeximp.h"
24 #include "regexst.h"
25
26 U_NAMESPACE_BEGIN
27
28 //--------------------------------------------------------------------------
29 //
30 // RegexPattern Default Constructor
31 //
32 //--------------------------------------------------------------------------
RegexPattern()33 RegexPattern::RegexPattern() {
34 // Init all of this instances data.
35 init();
36 }
37
38
39 //--------------------------------------------------------------------------
40 //
41 // Copy Constructor Note: This is a rather inefficient implementation,
42 // but it probably doesn't matter.
43 //
44 //--------------------------------------------------------------------------
RegexPattern::RegexPattern(const RegexPattern &other) : UObject(other) {
    // Start from the default state, then let the assignment operator
    // perform the actual copy of other's contents.
    init();
    operator=(other);
}
49
50
51
52 //--------------------------------------------------------------------------
53 //
54 // Assignment Operator
55 //
56 //--------------------------------------------------------------------------
//
//  Assignment: discard the current contents of this pattern and replace them
//  with a deep copy of other.  Any failure while copying is recorded in
//  fDeferredStatus; the object is then left partially filled and the copy
//  stops early.
//
//  @param other  the pattern to copy from.
//  @return       *this.
//
RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
    if (this == &other) {
        // Source and destination are the same.  Don't do anything.
        return *this;
    }

    // Clean out any previous contents of object being assigned to.
    zap();

    // Give target object a default initialization
    init();
    if (U_FAILURE(fDeferredStatus)) {
        // init() could not set up the member objects.  Keep its error rather
        // than overwriting it with other's status, because the code below
        // dereferences members that may not have been allocated.
        return *this;
    }

    // Copy simple fields
    fDeferredStatus   = other.fDeferredStatus;

    if (U_FAILURE(fDeferredStatus)) {
        return *this;
    }

    if (other.fPatternString == NULL) {
        fPatternString = NULL;
        // A pattern that was never compiled has no pattern text at all;
        // only clone when there is a source UText to clone from.
        if (other.fPattern != NULL) {
            fPattern = utext_clone(fPattern, other.fPattern, FALSE, TRUE, &fDeferredStatus);
        }
    } else {
        fPatternString = new UnicodeString(*(other.fPatternString));
        if (fPatternString == NULL) {
            fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
        } else {
            fPattern = utext_openConstUnicodeString(NULL, fPatternString, &fDeferredStatus);
        }
    }
    if (U_FAILURE(fDeferredStatus)) {
        return *this;
    }

    fFlags            = other.fFlags;
    fLiteralText      = other.fLiteralText;
    fMinMatchLen      = other.fMinMatchLen;
    fFrameSize        = other.fFrameSize;
    fDataSize         = other.fDataSize;
    fStaticSets       = other.fStaticSets;
    fStaticSets8      = other.fStaticSets8;

    fStartType        = other.fStartType;
    fInitialStringIdx = other.fInitialStringIdx;
    fInitialStringLen = other.fInitialStringLen;
    *fInitialChars    = *other.fInitialChars;
    fInitialChar      = other.fInitialChar;
    *fInitialChars8   = *other.fInitialChars8;
    fNeedsAltInput    = other.fNeedsAltInput;

    //  Copy the pattern.  It's just values, nothing deep to copy.
    fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus);
    fGroupMap->assign(*other.fGroupMap, fDeferredStatus);

    //  Copy the Unicode Sets.
    //    Could be made more efficient if the sets were reference counted and shared,
    //    but I doubt that pattern copying will be particularly common.
    //    Note:  init() already added an empty element zero to fSets
    int32_t i;
    int32_t numSets = other.fSets->size();
    fSets8 = new Regex8BitSet[numSets];
    if (fSets8 == NULL) {
        fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
        return *this;
    }
    for (i=1; i<numSets; i++) {
        if (U_FAILURE(fDeferredStatus)) {
            return *this;
        }
        UnicodeSet *sourceSet = (UnicodeSet *)other.fSets->elementAt(i);
        UnicodeSet *newSet    = new UnicodeSet(*sourceSet);
        if (newSet == NULL) {
            fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
            break;
        }
        fSets->addElement(newSet, fDeferredStatus);
        if (U_FAILURE(fDeferredStatus)) {
            // The vector did not take the element, so nothing else will
            // ever delete this set.
            delete newSet;
            return *this;
        }
        fSets8[i] = other.fSets8[i];
    }

    // Copy the named capture group hash map.
    // The map owns its keys, so every entry gets a freshly allocated name.
    int32_t hashPos = UHASH_FIRST;
    while (const UHashElement *hashEl = uhash_nextElement(other.fNamedCaptureMap, &hashPos)) {
        if (U_FAILURE(fDeferredStatus)) {
            break;
        }
        const UnicodeString *name = (const UnicodeString *)hashEl->key.pointer;
        UnicodeString *key = new UnicodeString(*name);
        int32_t val = hashEl->value.integer;
        if (key == NULL) {
            fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
        } else {
            uhash_puti(fNamedCaptureMap, key, val, &fDeferredStatus);
        }
    }
    return *this;
}
153
154
155 //--------------------------------------------------------------------------
156 //
157 // init Shared initialization for use by constructors.
158 // Bring an uninitialized RegexPattern up to a default state.
159 //
160 //--------------------------------------------------------------------------
init()161 void RegexPattern::init() {
162 fFlags = 0;
163 fCompiledPat = 0;
164 fLiteralText.remove();
165 fSets = NULL;
166 fSets8 = NULL;
167 fDeferredStatus = U_ZERO_ERROR;
168 fMinMatchLen = 0;
169 fFrameSize = 0;
170 fDataSize = 0;
171 fGroupMap = NULL;
172 fStaticSets = NULL;
173 fStaticSets8 = NULL;
174 fStartType = START_NO_INFO;
175 fInitialStringIdx = 0;
176 fInitialStringLen = 0;
177 fInitialChars = NULL;
178 fInitialChar = 0;
179 fInitialChars8 = NULL;
180 fNeedsAltInput = FALSE;
181 fNamedCaptureMap = NULL;
182
183 fPattern = NULL; // will be set later
184 fPatternString = NULL; // may be set later
185 fCompiledPat = new UVector64(fDeferredStatus);
186 fGroupMap = new UVector32(fDeferredStatus);
187 fSets = new UVector(fDeferredStatus);
188 fInitialChars = new UnicodeSet;
189 fInitialChars8 = new Regex8BitSet;
190 fNamedCaptureMap = uhash_open(uhash_hashUnicodeString, // Key hash function
191 uhash_compareUnicodeString, // Key comparator function
192 uhash_compareLong, // Value comparator function
193 &fDeferredStatus);
194 if (U_FAILURE(fDeferredStatus)) {
195 return;
196 }
197 if (fCompiledPat == NULL || fGroupMap == NULL || fSets == NULL ||
198 fInitialChars == NULL || fInitialChars8 == NULL || fNamedCaptureMap == NULL) {
199 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
200 return;
201 }
202
203 // Slot zero of the vector of sets is reserved. Fill it here.
204 fSets->addElement((int32_t)0, fDeferredStatus);
205
206 // fNamedCaptureMap owns its key strings, type (UnicodeString *)
207 uhash_setKeyDeleter(fNamedCaptureMap, uprv_deleteUObject);
208 }
209
210
211 //--------------------------------------------------------------------------
212 //
213 // zap Delete everything owned by this RegexPattern.
214 //
215 //--------------------------------------------------------------------------
zap()216 void RegexPattern::zap() {
217 delete fCompiledPat;
218 fCompiledPat = NULL;
219 int i;
220 for (i=1; i<fSets->size(); i++) {
221 UnicodeSet *s;
222 s = (UnicodeSet *)fSets->elementAt(i);
223 if (s != NULL) {
224 delete s;
225 }
226 }
227 delete fSets;
228 fSets = NULL;
229 delete[] fSets8;
230 fSets8 = NULL;
231 delete fGroupMap;
232 fGroupMap = NULL;
233 delete fInitialChars;
234 fInitialChars = NULL;
235 delete fInitialChars8;
236 fInitialChars8 = NULL;
237 if (fPattern != NULL) {
238 utext_close(fPattern);
239 fPattern = NULL;
240 }
241 if (fPatternString != NULL) {
242 delete fPatternString;
243 fPatternString = NULL;
244 }
245 uhash_close(fNamedCaptureMap);
246 fNamedCaptureMap = NULL;
247 }
248
249
250 //--------------------------------------------------------------------------
251 //
252 // Destructor
253 //
254 //--------------------------------------------------------------------------
~RegexPattern()255 RegexPattern::~RegexPattern() {
256 zap();
257 }
258
259
260 //--------------------------------------------------------------------------
261 //
262 // Clone
263 //
264 //--------------------------------------------------------------------------
clone() const265 RegexPattern *RegexPattern::clone() const {
266 RegexPattern *copy = new RegexPattern(*this);
267 return copy;
268 }
269
270
271 //--------------------------------------------------------------------------
272 //
//    operator == (comparison)    Consider two patterns to be == if the
274 // pattern strings and the flags are the same.
275 // Note that pattern strings with the same
276 // characters can still be considered different.
277 //
278 //--------------------------------------------------------------------------
operator ==(const RegexPattern & other) const279 UBool RegexPattern::operator ==(const RegexPattern &other) const {
280 if (this->fFlags == other.fFlags && this->fDeferredStatus == other.fDeferredStatus) {
281 if (this->fPatternString != NULL && other.fPatternString != NULL) {
282 return *(this->fPatternString) == *(other.fPatternString);
283 } else if (this->fPattern == NULL) {
284 if (other.fPattern == NULL) {
285 return TRUE;
286 }
287 } else if (other.fPattern != NULL) {
288 UTEXT_SETNATIVEINDEX(this->fPattern, 0);
289 UTEXT_SETNATIVEINDEX(other.fPattern, 0);
290 return utext_equals(this->fPattern, other.fPattern);
291 }
292 }
293 return FALSE;
294 }
295
296 //---------------------------------------------------------------------
297 //
298 // compile
299 //
300 //---------------------------------------------------------------------
301 RegexPattern * U_EXPORT2
compile(const UnicodeString & regex,uint32_t flags,UParseError & pe,UErrorCode & status)302 RegexPattern::compile(const UnicodeString ®ex,
303 uint32_t flags,
304 UParseError &pe,
305 UErrorCode &status)
306 {
307 if (U_FAILURE(status)) {
308 return NULL;
309 }
310
311 const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
312 UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD |
313 UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES | UREGEX_LITERAL;
314
315 if ((flags & ~allFlags) != 0) {
316 status = U_REGEX_INVALID_FLAG;
317 return NULL;
318 }
319
320 if ((flags & UREGEX_CANON_EQ) != 0) {
321 status = U_REGEX_UNIMPLEMENTED;
322 return NULL;
323 }
324
325 RegexPattern *This = new RegexPattern;
326 if (This == NULL) {
327 status = U_MEMORY_ALLOCATION_ERROR;
328 return NULL;
329 }
330 if (U_FAILURE(This->fDeferredStatus)) {
331 status = This->fDeferredStatus;
332 delete This;
333 return NULL;
334 }
335 This->fFlags = flags;
336
337 RegexCompile compiler(This, status);
338 compiler.compile(regex, pe, status);
339
340 if (U_FAILURE(status)) {
341 delete This;
342 This = NULL;
343 }
344
345 return This;
346 }
347
348
349 //
350 // compile, UText mode
351 //
352 RegexPattern * U_EXPORT2
compile(UText * regex,uint32_t flags,UParseError & pe,UErrorCode & status)353 RegexPattern::compile(UText *regex,
354 uint32_t flags,
355 UParseError &pe,
356 UErrorCode &status)
357 {
358 if (U_FAILURE(status)) {
359 return NULL;
360 }
361
362 const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
363 UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD |
364 UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES | UREGEX_LITERAL;
365
366 if ((flags & ~allFlags) != 0) {
367 status = U_REGEX_INVALID_FLAG;
368 return NULL;
369 }
370
371 if ((flags & UREGEX_CANON_EQ) != 0) {
372 status = U_REGEX_UNIMPLEMENTED;
373 return NULL;
374 }
375
376 RegexPattern *This = new RegexPattern;
377 if (This == NULL) {
378 status = U_MEMORY_ALLOCATION_ERROR;
379 return NULL;
380 }
381 if (U_FAILURE(This->fDeferredStatus)) {
382 status = This->fDeferredStatus;
383 delete This;
384 return NULL;
385 }
386 This->fFlags = flags;
387
388 RegexCompile compiler(This, status);
389 compiler.compile(regex, pe, status);
390
391 if (U_FAILURE(status)) {
392 delete This;
393 This = NULL;
394 }
395
396 return This;
397 }
398
399 //
400 // compile with default flags.
401 //
402 RegexPattern * U_EXPORT2
compile(const UnicodeString & regex,UParseError & pe,UErrorCode & err)403 RegexPattern::compile(const UnicodeString ®ex,
404 UParseError &pe,
405 UErrorCode &err)
406 {
407 return compile(regex, 0, pe, err);
408 }
409
410
411 //
412 // compile with default flags, UText mode
413 //
414 RegexPattern * U_EXPORT2
compile(UText * regex,UParseError & pe,UErrorCode & err)415 RegexPattern::compile(UText *regex,
416 UParseError &pe,
417 UErrorCode &err)
418 {
419 return compile(regex, 0, pe, err);
420 }
421
422
423 //
424 // compile with no UParseErr parameter.
425 //
426 RegexPattern * U_EXPORT2
compile(const UnicodeString & regex,uint32_t flags,UErrorCode & err)427 RegexPattern::compile(const UnicodeString ®ex,
428 uint32_t flags,
429 UErrorCode &err)
430 {
431 UParseError pe;
432 return compile(regex, flags, pe, err);
433 }
434
435
436 //
437 // compile with no UParseErr parameter, UText mode
438 //
439 RegexPattern * U_EXPORT2
compile(UText * regex,uint32_t flags,UErrorCode & err)440 RegexPattern::compile(UText *regex,
441 uint32_t flags,
442 UErrorCode &err)
443 {
444 UParseError pe;
445 return compile(regex, flags, pe, err);
446 }
447
448
449 //---------------------------------------------------------------------
450 //
451 // flags
452 //
453 //---------------------------------------------------------------------
flags() const454 uint32_t RegexPattern::flags() const {
455 return fFlags;
456 }
457
458
459 //---------------------------------------------------------------------
460 //
461 // matcher(UnicodeString, err)
462 //
463 //---------------------------------------------------------------------
matcher(const UnicodeString & input,UErrorCode & status) const464 RegexMatcher *RegexPattern::matcher(const UnicodeString &input,
465 UErrorCode &status) const {
466 RegexMatcher *retMatcher = matcher(status);
467 if (retMatcher != NULL) {
468 retMatcher->fDeferredStatus = status;
469 retMatcher->reset(input);
470 }
471 return retMatcher;
472 }
473
474
475 //---------------------------------------------------------------------
476 //
477 // matcher(status)
478 //
479 //---------------------------------------------------------------------
matcher(UErrorCode & status) const480 RegexMatcher *RegexPattern::matcher(UErrorCode &status) const {
481 RegexMatcher *retMatcher = NULL;
482
483 if (U_FAILURE(status)) {
484 return NULL;
485 }
486 if (U_FAILURE(fDeferredStatus)) {
487 status = fDeferredStatus;
488 return NULL;
489 }
490
491 retMatcher = new RegexMatcher(this);
492 if (retMatcher == NULL) {
493 status = U_MEMORY_ALLOCATION_ERROR;
494 return NULL;
495 }
496 return retMatcher;
497 }
498
499
500
501 //---------------------------------------------------------------------
502 //
503 // matches Convenience function to test for a match, starting
504 // with a pattern string and a data string.
505 //
506 //---------------------------------------------------------------------
matches(const UnicodeString & regex,const UnicodeString & input,UParseError & pe,UErrorCode & status)507 UBool U_EXPORT2 RegexPattern::matches(const UnicodeString ®ex,
508 const UnicodeString &input,
509 UParseError &pe,
510 UErrorCode &status) {
511
512 if (U_FAILURE(status)) {return FALSE;}
513
514 UBool retVal;
515 RegexPattern *pat = NULL;
516 RegexMatcher *matcher = NULL;
517
518 pat = RegexPattern::compile(regex, 0, pe, status);
519 matcher = pat->matcher(input, status);
520 retVal = matcher->matches(status);
521
522 delete matcher;
523 delete pat;
524 return retVal;
525 }
526
527
528 //
529 // matches, UText mode
530 //
matches(UText * regex,UText * input,UParseError & pe,UErrorCode & status)531 UBool U_EXPORT2 RegexPattern::matches(UText *regex,
532 UText *input,
533 UParseError &pe,
534 UErrorCode &status) {
535
536 if (U_FAILURE(status)) {return FALSE;}
537
538 UBool retVal = FALSE;
539 RegexPattern *pat = NULL;
540 RegexMatcher *matcher = NULL;
541
542 pat = RegexPattern::compile(regex, 0, pe, status);
543 matcher = pat->matcher(status);
544 if (U_SUCCESS(status)) {
545 matcher->reset(input);
546 retVal = matcher->matches(status);
547 }
548
549 delete matcher;
550 delete pat;
551 return retVal;
552 }
553
554
555
556
557
558 //---------------------------------------------------------------------
559 //
560 // pattern
561 //
562 //---------------------------------------------------------------------
pattern() const563 UnicodeString RegexPattern::pattern() const {
564 if (fPatternString != NULL) {
565 return *fPatternString;
566 } else if (fPattern == NULL) {
567 return UnicodeString();
568 } else {
569 UErrorCode status = U_ZERO_ERROR;
570 int64_t nativeLen = utext_nativeLength(fPattern);
571 int32_t len16 = utext_extract(fPattern, 0, nativeLen, NULL, 0, &status); // buffer overflow error
572 UnicodeString result;
573
574 status = U_ZERO_ERROR;
575 UChar *resultChars = result.getBuffer(len16);
576 utext_extract(fPattern, 0, nativeLen, resultChars, len16, &status); // unterminated warning
577 result.releaseBuffer(len16);
578
579 return result;
580 }
581 }
582
583
584
585
586 //---------------------------------------------------------------------
587 //
588 // patternText
589 //
590 //---------------------------------------------------------------------
patternText(UErrorCode & status) const591 UText *RegexPattern::patternText(UErrorCode &status) const {
592 if (U_FAILURE(status)) {return NULL;}
593 status = U_ZERO_ERROR;
594
595 if (fPattern != NULL) {
596 return fPattern;
597 } else {
598 RegexStaticSets::initGlobals(&status);
599 return RegexStaticSets::gStaticSets->fEmptyText;
600 }
601 }
602
603
604 //--------------------------------------------------------------------------------
605 //
606 // groupNumberFromName()
607 //
608 //--------------------------------------------------------------------------------
groupNumberFromName(const UnicodeString & groupName,UErrorCode & status) const609 int32_t RegexPattern::groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const {
610 if (U_FAILURE(status)) {
611 return 0;
612 }
613
614 // No need to explicitly check for syntactically valid names.
615 // Invalid ones will never be in the map, and the lookup will fail.
616
617 int32_t number = uhash_geti(fNamedCaptureMap, &groupName);
618 if (number == 0) {
619 status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
620 }
621 return number;
622 }
623
groupNumberFromName(const char * groupName,int32_t nameLength,UErrorCode & status) const624 int32_t RegexPattern::groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const {
625 if (U_FAILURE(status)) {
626 return 0;
627 }
628 UnicodeString name(groupName, nameLength, US_INV);
629 return groupNumberFromName(name, status);
630 }
631
632
633 //---------------------------------------------------------------------
634 //
635 // split
636 //
637 //---------------------------------------------------------------------
split(const UnicodeString & input,UnicodeString dest[],int32_t destCapacity,UErrorCode & status) const638 int32_t RegexPattern::split(const UnicodeString &input,
639 UnicodeString dest[],
640 int32_t destCapacity,
641 UErrorCode &status) const
642 {
643 if (U_FAILURE(status)) {
644 return 0;
645 };
646
647 RegexMatcher m(this);
648 int32_t r = 0;
649 // Check m's status to make sure all is ok.
650 if (U_SUCCESS(m.fDeferredStatus)) {
651 r = m.split(input, dest, destCapacity, status);
652 }
653 return r;
654 }
655
656 //
657 // split, UText mode
658 //
split(UText * input,UText * dest[],int32_t destCapacity,UErrorCode & status) const659 int32_t RegexPattern::split(UText *input,
660 UText *dest[],
661 int32_t destCapacity,
662 UErrorCode &status) const
663 {
664 if (U_FAILURE(status)) {
665 return 0;
666 };
667
668 RegexMatcher m(this);
669 int32_t r = 0;
670 // Check m's status to make sure all is ok.
671 if (U_SUCCESS(m.fDeferredStatus)) {
672 r = m.split(input, dest, destCapacity, status);
673 }
674 return r;
675 }
676
677
678
679 //---------------------------------------------------------------------
680 //
681 // dump Output the compiled form of the pattern.
682 // Debugging function only.
683 //
684 //---------------------------------------------------------------------
dumpOp(int32_t index) const685 void RegexPattern::dumpOp(int32_t index) const {
686 (void)index; // Suppress warnings in non-debug build.
687 #if defined(REGEX_DEBUG)
688 static const char * const opNames[] = {URX_OPCODE_NAMES};
689 int32_t op = fCompiledPat->elementAti(index);
690 int32_t val = URX_VAL(op);
691 int32_t type = URX_TYPE(op);
692 int32_t pinnedType = type;
693 if ((uint32_t)pinnedType >= sizeof(opNames)/sizeof(char *)) {
694 pinnedType = 0;
695 }
696
697 printf("%4d %08x %-15s ", index, op, opNames[pinnedType]);
698 switch (type) {
699 case URX_NOP:
700 case URX_DOTANY:
701 case URX_DOTANY_ALL:
702 case URX_FAIL:
703 case URX_CARET:
704 case URX_DOLLAR:
705 case URX_BACKSLASH_G:
706 case URX_BACKSLASH_X:
707 case URX_END:
708 case URX_DOLLAR_M:
709 case URX_CARET_M:
710 // Types with no operand field of interest.
711 break;
712
713 case URX_RESERVED_OP:
714 case URX_START_CAPTURE:
715 case URX_END_CAPTURE:
716 case URX_STATE_SAVE:
717 case URX_JMP:
718 case URX_JMP_SAV:
719 case URX_JMP_SAV_X:
720 case URX_BACKSLASH_B:
721 case URX_BACKSLASH_BU:
722 case URX_BACKSLASH_D:
723 case URX_BACKSLASH_Z:
724 case URX_STRING_LEN:
725 case URX_CTR_INIT:
726 case URX_CTR_INIT_NG:
727 case URX_CTR_LOOP:
728 case URX_CTR_LOOP_NG:
729 case URX_RELOC_OPRND:
730 case URX_STO_SP:
731 case URX_LD_SP:
732 case URX_BACKREF:
733 case URX_STO_INP_LOC:
734 case URX_JMPX:
735 case URX_LA_START:
736 case URX_LA_END:
737 case URX_BACKREF_I:
738 case URX_LB_START:
739 case URX_LB_CONT:
740 case URX_LB_END:
741 case URX_LBN_CONT:
742 case URX_LBN_END:
743 case URX_LOOP_C:
744 case URX_LOOP_DOT_I:
745 case URX_BACKSLASH_H:
746 case URX_BACKSLASH_R:
747 case URX_BACKSLASH_V:
748 // types with an integer operand field.
749 printf("%d", val);
750 break;
751
752 case URX_ONECHAR:
753 case URX_ONECHAR_I:
754 printf("%c", val<256?val:'?');
755 break;
756
757 case URX_STRING:
758 case URX_STRING_I:
759 {
760 int32_t lengthOp = fCompiledPat->elementAti(index+1);
761 U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
762 int32_t length = URX_VAL(lengthOp);
763 int32_t i;
764 for (i=val; i<val+length; i++) {
765 UChar c = fLiteralText[i];
766 if (c < 32 || c >= 256) {c = '.';}
767 printf("%c", c);
768 }
769 }
770 break;
771
772 case URX_SETREF:
773 case URX_LOOP_SR_I:
774 {
775 UnicodeString s;
776 UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
777 set->toPattern(s, TRUE);
778 for (int32_t i=0; i<s.length(); i++) {
779 printf("%c", s.charAt(i));
780 }
781 }
782 break;
783
784 case URX_STATIC_SETREF:
785 case URX_STAT_SETREF_N:
786 {
787 UnicodeString s;
788 if (val & URX_NEG_SET) {
789 printf("NOT ");
790 val &= ~URX_NEG_SET;
791 }
792 UnicodeSet *set = fStaticSets[val];
793 set->toPattern(s, TRUE);
794 for (int32_t i=0; i<s.length(); i++) {
795 printf("%c", s.charAt(i));
796 }
797 }
798 break;
799
800
801 default:
802 printf("??????");
803 break;
804 }
805 printf("\n");
806 #endif
807 }
808
809
dumpPattern() const810 void RegexPattern::dumpPattern() const {
811 #if defined(REGEX_DEBUG)
812 // TODO: This function assumes an ASCII based charset.
813 int index;
814 int i;
815
816 printf("Original Pattern: ");
817 UChar32 c = utext_next32From(fPattern, 0);
818 while (c != U_SENTINEL) {
819 if (c<32 || c>256) {
820 c = '.';
821 }
822 printf("%c", c);
823
824 c = UTEXT_NEXT32(fPattern);
825 }
826 printf("\n");
827 printf(" Min Match Length: %d\n", fMinMatchLen);
828 printf(" Match Start Type: %s\n", START_OF_MATCH_STR(fStartType));
829 if (fStartType == START_STRING) {
830 printf(" Initial match string: \"");
831 for (i=fInitialStringIdx; i<fInitialStringIdx+fInitialStringLen; i++) {
832 printf("%c", fLiteralText[i]); // TODO: non-printables, surrogates.
833 }
834 printf("\"\n");
835
836 } else if (fStartType == START_SET) {
837 int32_t numSetChars = fInitialChars->size();
838 if (numSetChars > 20) {
839 numSetChars = 20;
840 }
841 printf(" Match First Chars : ");
842 for (i=0; i<numSetChars; i++) {
843 UChar32 c = fInitialChars->charAt(i);
844 if (0x20<c && c <0x7e) {
845 printf("%c ", c);
846 } else {
847 printf("%#x ", c);
848 }
849 }
850 if (numSetChars < fInitialChars->size()) {
851 printf(" ...");
852 }
853 printf("\n");
854
855 } else if (fStartType == START_CHAR) {
856 printf(" First char of Match : ");
857 if (0x20 < fInitialChar && fInitialChar<0x7e) {
858 printf("%c\n", fInitialChar);
859 } else {
860 printf("%#x\n", fInitialChar);
861 }
862 }
863
864 printf("Named Capture Groups:\n");
865 if (uhash_count(fNamedCaptureMap) == 0) {
866 printf(" None\n");
867 } else {
868 int32_t pos = UHASH_FIRST;
869 const UHashElement *el = NULL;
870 while ((el = uhash_nextElement(fNamedCaptureMap, &pos))) {
871 const UnicodeString *name = (const UnicodeString *)el->key.pointer;
872 char s[100];
873 name->extract(0, 99, s, sizeof(s), US_INV); // capture group names are invariant.
874 int32_t number = el->value.integer;
875 printf(" %d\t%s\n", number, s);
876 }
877 }
878
879 printf("\nIndex Binary Type Operand\n" \
880 "-------------------------------------------\n");
881 for (index = 0; index<fCompiledPat->size(); index++) {
882 dumpOp(index);
883 }
884 printf("\n\n");
885 #endif
886 }
887
888
889
890 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern)
891
892 U_NAMESPACE_END
893 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
894