1 //
2 // file: repattrn.cpp
3 //
4 /*
5 ***************************************************************************
6 * Copyright (C) 2002-2007 International Business Machines Corporation *
7 * and others. All rights reserved. *
8 ***************************************************************************
9 */
10
11 #include "unicode/utypes.h"
12
13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
14
15 #include "unicode/regex.h"
16 #include "unicode/uclean.h"
17 #include "uassert.h"
18 #include "uvector.h"
19 #include "uvectr32.h"
20 #include "regexcmp.h"
21 #include "regeximp.h"
22 #include "regexst.h"
23
24 U_NAMESPACE_BEGIN
25
26 //--------------------------------------------------------------------------
27 //
28 // RegexPattern Default Constructor
29 //
30 //--------------------------------------------------------------------------
RegexPattern()31 RegexPattern::RegexPattern() {
32 UErrorCode status = U_ZERO_ERROR;
33 u_init(&status);
34 // Init all of this instances data.
35 init();
36
37 // Lazy init of all shared global sets.
38 RegexStaticSets::initGlobals(&fDeferredStatus);
39 }
40
41
42 //--------------------------------------------------------------------------
43 //
44 // Copy Constructor Note: This is a rather inefficient implementation,
45 // but it probably doesn't matter.
46 //
47 //--------------------------------------------------------------------------
// Copy construction is done in two steps: the new object is first brought to
// its default state, then the assignment operator duplicates `other` into it.
RegexPattern::RegexPattern(const RegexPattern &other) : UObject(other) {
    init();
    operator=(other);
}
52
53
54
55 //--------------------------------------------------------------------------
56 //
//    Assignment Operator
58 //
59 //--------------------------------------------------------------------------
// Replaces the contents of this pattern with a copy of `other` and returns
// *this.  Self-assignment is a no-op.  Errors encountered while copying are
// recorded in fDeferredStatus; the object may then be only partially copied.
RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
    if (this == &other) {
        // Source and destination are the same.  Don't do anything.
        return *this;
    }

    // Clean out any previous contents of object being assigned to.
    zap();

    // Give target object a default initialization
    init();
    if (U_FAILURE(fDeferredStatus)) {
        // init() could not set up this object; some of the containers that are
        // dereferenced below may be NULL, so stop with the error recorded.
        return *this;
    }

    // Copy simple fields
    fPattern          = other.fPattern;
    fFlags            = other.fFlags;
    fLiteralText      = other.fLiteralText;
    fDeferredStatus   = other.fDeferredStatus;
    fMinMatchLen      = other.fMinMatchLen;
    fFrameSize        = other.fFrameSize;
    fDataSize         = other.fDataSize;
    fMaxCaptureDigits = other.fMaxCaptureDigits;
    fStaticSets       = other.fStaticSets;
    fStaticSets8      = other.fStaticSets8;

    fStartType        = other.fStartType;
    fInitialStringIdx = other.fInitialStringIdx;
    fInitialStringLen = other.fInitialStringLen;
    fInitialChar      = other.fInitialChar;

    // A source whose own init() failed can have NULL containers.  There is
    // nothing to deep-copy from it, and dereferencing them would crash.
    if (other.fCompiledPat == NULL || other.fGroupMap == NULL || other.fSets == NULL ||
        other.fInitialChars == NULL || other.fInitialChars8 == NULL) {
        if (U_SUCCESS(fDeferredStatus)) {
            fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
        }
        return *this;
    }

    *fInitialChars  = *other.fInitialChars;
    *fInitialChars8 = *other.fInitialChars8;

    //  Copy the pattern.  It's just values, nothing deep to copy.
    fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus);
    fGroupMap->assign(*other.fGroupMap, fDeferredStatus);

    //  Copy the Unicode Sets.
    //    Could be made more efficient if the sets were reference counted and shared,
    //    but I doubt that pattern copying will be particularly common.
    //    Note:  init() already added an empty element zero to fSets
    int32_t numSets = other.fSets->size();
    fSets8 = new Regex8BitSet[numSets];
    if (fSets8 == NULL) {
        if (U_SUCCESS(fDeferredStatus)) {
            fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
        }
        return *this;
    }

    for (int32_t i = 1; i < numSets; i++) {
        if (U_FAILURE(fDeferredStatus)) {
            return *this;
        }
        UnicodeSet *sourceSet = (UnicodeSet *)other.fSets->elementAt(i);
        UnicodeSet *newSet    = new UnicodeSet(*sourceSet);
        if (newSet == NULL) {
            fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
            break;
        }

        int32_t sizeBefore = fSets->size();
        fSets->addElement(newSet, fDeferredStatus);
        if (fSets->size() == sizeBefore) {
            // The vector did not grow, so it does not hold newSet and zap()
            // would never find it.  Free it here instead of leaking it.
            delete newSet;
            if (U_SUCCESS(fDeferredStatus)) {
                fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
            }
            break;
        }

        // The source's 8-bit sets are only present once they have been built.
        if (other.fSets8 != NULL) {
            fSets8[i] = other.fSets8[i];
        }
    }

    return *this;
}
118
119
120 //--------------------------------------------------------------------------
121 //
122 // init Shared initialization for use by constructors.
123 // Bring an uninitialized RegexPattern up to a default state.
124 //
125 //--------------------------------------------------------------------------
init()126 void RegexPattern::init() {
127 fPattern.remove();
128 fFlags = 0;
129 fCompiledPat = 0;
130 fLiteralText.remove();
131 fSets = NULL;
132 fSets8 = NULL;
133 fDeferredStatus = U_ZERO_ERROR;
134 fMinMatchLen = 0;
135 fFrameSize = 0;
136 fDataSize = 0;
137 fGroupMap = NULL;
138 fMaxCaptureDigits = 1;
139 fStaticSets = NULL;
140 fStaticSets8 = NULL;
141 fStartType = START_NO_INFO;
142 fInitialStringIdx = 0;
143 fInitialStringLen = 0;
144 fInitialChars = NULL;
145 fInitialChar = 0;
146 fInitialChars8 = NULL;
147
148 fCompiledPat = new UVector32(fDeferredStatus);
149 fGroupMap = new UVector32(fDeferredStatus);
150 fSets = new UVector(fDeferredStatus);
151 fInitialChars = new UnicodeSet;
152 fInitialChars8 = new Regex8BitSet;
153 if (U_FAILURE(fDeferredStatus)) {
154 return;
155 }
156 if (fCompiledPat == NULL || fGroupMap == NULL || fSets == NULL ||
157 fInitialChars == NULL || fInitialChars8 == NULL) {
158 fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
159 return;
160 }
161
162 // Slot zero of the vector of sets is reserved. Fill it here.
163 fSets->addElement((int32_t)0, fDeferredStatus);
164 }
165
166
167 //--------------------------------------------------------------------------
168 //
169 // zap Delete everything owned by this RegexPattern.
170 //
171 //--------------------------------------------------------------------------
zap()172 void RegexPattern::zap() {
173 delete fCompiledPat;
174 fCompiledPat = NULL;
175 int i;
176 for (i=1; i<fSets->size(); i++) {
177 UnicodeSet *s;
178 s = (UnicodeSet *)fSets->elementAt(i);
179 if (s != NULL) {
180 delete s;
181 }
182 }
183 delete fSets;
184 fSets = NULL;
185 delete[] fSets8;
186 fSets8 = NULL;
187 delete fGroupMap;
188 fGroupMap = NULL;
189 delete fInitialChars;
190 fInitialChars = NULL;
191 delete fInitialChars8;
192 fInitialChars8 = NULL;
193 }
194
195
196 //--------------------------------------------------------------------------
197 //
198 // Destructor
199 //
200 //--------------------------------------------------------------------------
~RegexPattern()201 RegexPattern::~RegexPattern() {
202 zap();
203 }
204
205
206 //--------------------------------------------------------------------------
207 //
208 // Clone
209 //
210 //--------------------------------------------------------------------------
clone() const211 RegexPattern *RegexPattern::clone() const {
212 RegexPattern *copy = new RegexPattern(*this);
213 return copy;
214 }
215
216
217 //--------------------------------------------------------------------------
218 //
//   operator ==   (comparison)    Consider two patterns to be == if the
//                                 pattern strings, the flags and the deferred
//                                 status are the same.
221 //
222 //--------------------------------------------------------------------------
operator ==(const RegexPattern & other) const223 UBool RegexPattern::operator ==(const RegexPattern &other) const {
224 UBool r = this->fFlags == other.fFlags &&
225 this->fPattern == other.fPattern &&
226 this->fDeferredStatus == other.fDeferredStatus;
227 return r;
228 }
229
230 //---------------------------------------------------------------------
231 //
232 // compile
233 //
234 //---------------------------------------------------------------------
235 RegexPattern * U_EXPORT2
compile(const UnicodeString & regex,uint32_t flags,UParseError & pe,UErrorCode & status)236 RegexPattern::compile(const UnicodeString ®ex,
237 uint32_t flags,
238 UParseError &pe,
239 UErrorCode &status)
240 {
241
242 if (U_FAILURE(status)) {
243 return NULL;
244 }
245
246 const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
247 UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD |
248 UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES;
249
250 if ((flags & ~allFlags) != 0) {
251 status = U_REGEX_INVALID_FLAG;
252 return NULL;
253 }
254
255 if ((flags & UREGEX_CANON_EQ) != 0) {
256 status = U_REGEX_UNIMPLEMENTED;
257 return NULL;
258 }
259
260 RegexPattern *This = new RegexPattern;
261 if (This == NULL) {
262 status = U_MEMORY_ALLOCATION_ERROR;
263 return NULL;
264 }
265 if (U_FAILURE(This->fDeferredStatus)) {
266 status = This->fDeferredStatus;
267 return NULL;
268 }
269 This->fFlags = flags;
270
271 RegexCompile compiler(This, status);
272 compiler.compile(regex, pe, status);
273
274 if (U_FAILURE(status)) {
275 delete This;
276 This = NULL;
277 }
278
279 return This;
280 }
281
282 //
283 // compile with default flags.
284 //
285 RegexPattern * U_EXPORT2
compile(const UnicodeString & regex,UParseError & pe,UErrorCode & err)286 RegexPattern::compile(const UnicodeString ®ex,
287 UParseError &pe,
288 UErrorCode &err)
289 {
290 return compile(regex, 0, pe, err);
291 }
292
293
294
295 //
296 // compile with no UParseErr parameter.
297 //
298 RegexPattern * U_EXPORT2
compile(const UnicodeString & regex,uint32_t flags,UErrorCode & err)299 RegexPattern::compile( const UnicodeString ®ex,
300 uint32_t flags,
301 UErrorCode &err)
302 {
303 UParseError pe;
304 return compile(regex, flags, pe, err);
305 }
306
307
308
309 //---------------------------------------------------------------------
310 //
311 // flags
312 //
313 //---------------------------------------------------------------------
flags() const314 uint32_t RegexPattern::flags() const {
315 return fFlags;
316 }
317
318
319 //---------------------------------------------------------------------
320 //
321 // matcher(UnicodeString, err)
322 //
323 //---------------------------------------------------------------------
matcher(const UnicodeString & input,UErrorCode & status) const324 RegexMatcher *RegexPattern::matcher(const UnicodeString &input,
325 UErrorCode &status) const {
326 RegexMatcher *retMatcher = matcher(status);
327 if (retMatcher != NULL) {
328 retMatcher->reset(input);
329 }
330 return retMatcher;
331 }
332
#if 0
// Disabled overload, kept compiled-out.  It creates no matcher: it only turns
// a successful status into U_UNSUPPORTED_ERROR and returns NULL.
RegexMatcher *RegexPattern::matcher(const UChar * /*input*/,
                                    UErrorCode          &status)  const
{
    /* This should never get called. The API with UnicodeString should be called instead. */
    const UBool noErrorYet = U_SUCCESS(status);
    if (noErrorYet) {
        status = U_UNSUPPORTED_ERROR;
    }
    return NULL;
}
#endif
344
345 //---------------------------------------------------------------------
346 //
347 // matcher(status)
348 //
349 //---------------------------------------------------------------------
matcher(UErrorCode & status) const350 RegexMatcher *RegexPattern::matcher(UErrorCode &status) const {
351 RegexMatcher *retMatcher = NULL;
352
353 if (U_FAILURE(status)) {
354 return NULL;
355 }
356 if (U_FAILURE(fDeferredStatus)) {
357 status = fDeferredStatus;
358 return NULL;
359 }
360
361 retMatcher = new RegexMatcher(this);
362 if (retMatcher == NULL) {
363 status = U_MEMORY_ALLOCATION_ERROR;
364 return NULL;
365 }
366 return retMatcher;
367 }
368
369
370
371 //---------------------------------------------------------------------
372 //
373 // matches Convenience function to test for a match, starting
374 // with a pattern string and a data string.
375 //
376 //---------------------------------------------------------------------
matches(const UnicodeString & regex,const UnicodeString & input,UParseError & pe,UErrorCode & status)377 UBool U_EXPORT2 RegexPattern::matches(const UnicodeString ®ex,
378 const UnicodeString &input,
379 UParseError &pe,
380 UErrorCode &status) {
381
382 if (U_FAILURE(status)) {return FALSE;}
383
384 UBool retVal;
385 RegexPattern *pat = NULL;
386 RegexMatcher *matcher = NULL;
387
388 pat = RegexPattern::compile(regex, 0, pe, status);
389 matcher = pat->matcher(input, status);
390 retVal = matcher->matches(status);
391
392 delete matcher;
393 delete pat;
394 return retVal;
395 }
396
397
398
399
400 //---------------------------------------------------------------------
401 //
402 // pattern
403 //
404 //---------------------------------------------------------------------
pattern() const405 UnicodeString RegexPattern::pattern() const {
406 return fPattern;
407 }
408
409
410
411
412 //---------------------------------------------------------------------
413 //
414 // split
415 //
416 //---------------------------------------------------------------------
split(const UnicodeString & input,UnicodeString dest[],int32_t destCapacity,UErrorCode & status) const417 int32_t RegexPattern::split(const UnicodeString &input,
418 UnicodeString dest[],
419 int32_t destCapacity,
420 UErrorCode &status) const
421 {
422 if (U_FAILURE(status)) {
423 return 0;
424 };
425
426 RegexMatcher m(this);
427 int32_t r = m.split(input, dest, destCapacity, status);
428 return r;
429 }
430
431
432
433 //---------------------------------------------------------------------
434 //
435 // dump Output the compiled form of the pattern.
436 // Debugging function only.
437 //
438 //---------------------------------------------------------------------
#if defined(REGEX_DEBUG)
// Debug helper: prints the pattern form of `set` one code unit at a time, or a
// placeholder when there is no set (slot zero of fSets is a reserved null entry).
static void regexDumpSetPattern(const UnicodeSet *set) {
    if (set == NULL) {
        REGEX_DUMP_DEBUG_PRINTF(("(null set)"));
        return;
    }
    UnicodeString s;
    set->toPattern(s, TRUE);
    for (int32_t i=0; i<s.length(); i++) {
        REGEX_DUMP_DEBUG_PRINTF(("%c", s.charAt(i)));
    }
}

// Debug only.  Prints one line for the compiled operation at `index`:
// the index, the raw 32 bit value, the opcode name and, depending on the
// opcode type, its operand as a number, a character, literal text or a set.
void   RegexPattern::dumpOp(int32_t index) const {
    static const char * const opNames[] = {URX_OPCODE_NAMES};
    int32_t op          = fCompiledPat->elementAti(index);
    int32_t val         = URX_VAL(op);
    int32_t type        = URX_TYPE(op);
    // Opcode types outside the name table are printed with the name in slot 0.
    int32_t pinnedType  = type;
    if ((uint32_t)pinnedType >= sizeof(opNames)/sizeof(char *)) {
        pinnedType = 0;
    }

    REGEX_DUMP_DEBUG_PRINTF(("%4d   %08x    %-15s  ", index, op, opNames[pinnedType]));
    switch (type) {
    case URX_NOP:
    case URX_DOTANY:
    case URX_DOTANY_ALL:
    case URX_FAIL:
    case URX_CARET:
    case URX_DOLLAR:
    case URX_BACKSLASH_G:
    case URX_BACKSLASH_X:
    case URX_END:
    case URX_DOLLAR_M:
    case URX_CARET_M:
        // Types with no operand field of interest.
        break;

    case URX_RESERVED_OP:
    case URX_START_CAPTURE:
    case URX_END_CAPTURE:
    case URX_STATE_SAVE:
    case URX_JMP:
    case URX_JMP_SAV:
    case URX_JMP_SAV_X:
    case URX_BACKSLASH_B:
    case URX_BACKSLASH_BU:
    case URX_BACKSLASH_D:
    case URX_BACKSLASH_Z:
    case URX_STRING_LEN:
    case URX_CTR_INIT:
    case URX_CTR_INIT_NG:
    case URX_CTR_LOOP:
    case URX_CTR_LOOP_NG:
    case URX_RELOC_OPRND:
    case URX_STO_SP:
    case URX_LD_SP:
    case URX_BACKREF:
    case URX_STO_INP_LOC:
    case URX_JMPX:
    case URX_LA_START:
    case URX_LA_END:
    case URX_BACKREF_I:
    case URX_LB_START:
    case URX_LB_CONT:
    case URX_LB_END:
    case URX_LBN_CONT:
    case URX_LBN_END:
    case URX_LOOP_C:
    case URX_LOOP_DOT_I:
        // types with an integer operand field.
        REGEX_DUMP_DEBUG_PRINTF(("%d", val));
        break;

    case URX_ONECHAR:
    case URX_ONECHAR_I:
        // Operand values of 256 and above are shown as '?'.
        REGEX_DUMP_DEBUG_PRINTF(("%c", val<256?val:'?'));
        break;

    case URX_STRING:
    case URX_STRING_I:
        {
            // The operand is the start index into fLiteralText; the length
            // comes from the URX_STRING_LEN operation that follows.
            int32_t lengthOp = fCompiledPat->elementAti(index+1);
            U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
            int32_t length = URX_VAL(lengthOp);
            int32_t i;
            for (i=val; i<val+length; i++) {
                UChar c = fLiteralText[i];
                if (c < 32 || c >= 256) {c = '.';}
                REGEX_DUMP_DEBUG_PRINTF(("%c", c));
            }
        }
        break;

    case URX_SETREF:
    case URX_LOOP_SR_I:
        // The operand indexes this pattern's own vector of sets.
        regexDumpSetPattern((UnicodeSet *)fSets->elementAt(val));
        break;

    case URX_STATIC_SETREF:
    case URX_STAT_SETREF_N:
        {
            // The URX_NEG_SET bit is reported as "NOT" and stripped to leave
            // the index into the static sets.
            if (val & URX_NEG_SET) {
                REGEX_DUMP_DEBUG_PRINTF(("NOT "));
                val &= ~URX_NEG_SET;
            }
            regexDumpSetPattern(fStaticSets[val]);
        }
        break;


    default:
        REGEX_DUMP_DEBUG_PRINTF(("??????"));
        break;
    }
    REGEX_DUMP_DEBUG_PRINTF(("\n"));
}
#endif
558
559
#if defined(REGEX_DEBUG)
// Debug only.  Prints the original pattern text, the minimum match length, the
// match start information, and then every compiled operation via dumpOp().
U_CAPI void  U_EXPORT2
RegexPatternDump(const RegexPattern *This) {
    int      index;
    int      i;

    REGEX_DUMP_DEBUG_PRINTF(("Original Pattern:  "));
    for (i=0; i<This->fPattern.length(); i++) {
        REGEX_DUMP_DEBUG_PRINTF(("%c", This->fPattern.charAt(i)));
    }
    REGEX_DUMP_DEBUG_PRINTF(("\n"));
    REGEX_DUMP_DEBUG_PRINTF(("   Min Match Length:  %d\n", This->fMinMatchLen));
    REGEX_DUMP_DEBUG_PRINTF(("   Match Start Type:  %s\n", START_OF_MATCH_STR(This->fStartType)));
    if (This->fStartType == START_STRING) {
        REGEX_DUMP_DEBUG_PRINTF(("    Initial match sting: \""));
        for (i=This->fInitialStringIdx; i<This->fInitialStringIdx+This->fInitialStringLen; i++) {
            REGEX_DUMP_DEBUG_PRINTF(("%c", This->fLiteralText[i]));   // TODO:  non-printables, surrogates.
        }
        // Close the quote opened above and end the line; without this the
        // table header that follows ran on directly after the string.
        REGEX_DUMP_DEBUG_PRINTF(("\"\n"));

    } else if (This->fStartType == START_SET) {
        // At most the first 20 characters of the set are listed.
        int32_t numSetChars = This->fInitialChars->size();
        if (numSetChars > 20) {
            numSetChars = 20;
        }
        REGEX_DUMP_DEBUG_PRINTF(("     Match First Chars : "));
        for (i=0; i<numSetChars; i++) {
            UChar32 c = This->fInitialChars->charAt(i);
            if (0x20<c && c <0x7e) {
                REGEX_DUMP_DEBUG_PRINTF(("%c ", c));
            } else {
                REGEX_DUMP_DEBUG_PRINTF(("%#x ", c));
            }
        }
        if (numSetChars < This->fInitialChars->size()) {
            REGEX_DUMP_DEBUG_PRINTF((" ..."));
        }
        REGEX_DUMP_DEBUG_PRINTF(("\n"));

    } else if (This->fStartType == START_CHAR) {
        REGEX_DUMP_DEBUG_PRINTF(("    First char of Match : "));
        if (0x20 < This->fInitialChar && This->fInitialChar<0x7e) {
                REGEX_DUMP_DEBUG_PRINTF(("%c\n", This->fInitialChar));
            } else {
                REGEX_DUMP_DEBUG_PRINTF(("%#x\n", This->fInitialChar));
            }
    }

    REGEX_DUMP_DEBUG_PRINTF(("\nIndex   Binary     Type             Operand\n" \
           "-------------------------------------------\n"));
    for (index = 0; index<This->fCompiledPat->size(); index++) {
        This->dumpOp(index);
    }
    REGEX_DUMP_DEBUG_PRINTF(("\n\n"));
}
#endif
615
616
617
618 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern)
619
620 U_NAMESPACE_END
621 #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
622