• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 //  file:  repattrn.cpp
3 //
4 /*
5 ***************************************************************************
6 *   Copyright (C) 2002-2007 International Business Machines Corporation   *
7 *   and others. All rights reserved.                                      *
8 ***************************************************************************
9 */
10 
11 #include "unicode/utypes.h"
12 
13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
14 
15 #include "unicode/regex.h"
16 #include "unicode/uclean.h"
17 #include "uassert.h"
18 #include "uvector.h"
19 #include "uvectr32.h"
20 #include "regexcmp.h"
21 #include "regeximp.h"
22 #include "regexst.h"
23 
24 U_NAMESPACE_BEGIN
25 
26 //--------------------------------------------------------------------------
27 //
28 //    RegexPattern    Default Constructor
29 //
30 //--------------------------------------------------------------------------
RegexPattern()31 RegexPattern::RegexPattern() {
32     UErrorCode status = U_ZERO_ERROR;
33     u_init(&status);
34     // Init all of this instances data.
35     init();
36 
37     // Lazy init of all shared global sets.
38     RegexStaticSets::initGlobals(&fDeferredStatus);
39 }
40 
41 
42 //--------------------------------------------------------------------------
43 //
44 //   Copy Constructor        Note:  This is a rather inefficient implementation,
45 //                                  but it probably doesn't matter.
46 //
47 //--------------------------------------------------------------------------
RegexPattern::RegexPattern(const RegexPattern &other) :  UObject(other) {
    // Bring the members to a defined default state first: the assignment
    // operator begins by releasing (zap()) whatever this object owns, so the
    // pointers must be valid before it runs.
    init();
    operator=(other);
}
52 
53 
54 
55 //--------------------------------------------------------------------------
56 //
57 //    Assignment Operator
58 //
59 //--------------------------------------------------------------------------
RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
    // Deep-copies 'other' into this object.  Any failure is recorded in
    // fDeferredStatus rather than returned; the object then stays in a
    // destructible, but not usable, state.
    if (this == &other) {
        // Source and destination are the same.  Don't do anything.
        return *this;
    }

    // Clean out any previous contents of the object being assigned to,
    // then give it a default initialization.
    zap();
    init();
    if (U_FAILURE(fDeferredStatus)) {
        // init() could not set up this object's containers; some of the
        // pointers dereferenced below may be NULL, so stop here.
        return *this;
    }

    // Copy the simple (by value) fields.
    fPattern          = other.fPattern;
    fFlags            = other.fFlags;
    fLiteralText      = other.fLiteralText;
    fDeferredStatus   = other.fDeferredStatus;
    fMinMatchLen      = other.fMinMatchLen;
    fFrameSize        = other.fFrameSize;
    fDataSize         = other.fDataSize;
    fMaxCaptureDigits = other.fMaxCaptureDigits;
    fStaticSets       = other.fStaticSets;
    fStaticSets8      = other.fStaticSets8;

    fStartType        = other.fStartType;
    fInitialStringIdx = other.fInitialStringIdx;
    fInitialStringLen = other.fInitialStringLen;
    fInitialChar      = other.fInitialChar;

    if (U_FAILURE(fDeferredStatus)) {
        // The source pattern is itself in a failed state, so the objects it
        // owns may be missing.  The copy carries the same failure status and
        // nothing further is dereferenced.
        return *this;
    }

    *fInitialChars    = *other.fInitialChars;
    *fInitialChars8   = *other.fInitialChars8;

    //  Copy the pattern.  It's just values, nothing deep to copy.
    fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus);
    fGroupMap->assign(*other.fGroupMap, fDeferredStatus);
    if (U_FAILURE(fDeferredStatus)) {
        return *this;
    }

    //  Copy the Unicode Sets.
    //    Could be made more efficient if the sets were reference counted and shared,
    //    but I doubt that pattern copying will be particularly common.
    //    Note:  init() already added an empty element zero to fSets
    const int32_t numSets = other.fSets->size();
    fSets8 = new Regex8BitSet[numSets];
    if (fSets8 == NULL) {
        fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
        return *this;
    }
    for (int32_t i = 1; i < numSets; i++) {
        UnicodeSet *sourceSet = (UnicodeSet *)other.fSets->elementAt(i);
        UnicodeSet *newSet    = new UnicodeSet(*sourceSet);
        if (newSet == NULL) {
            fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
            break;
        }

        const int32_t sizeBefore = fSets->size();
        fSets->addElement(newSet, fDeferredStatus);
        if (fSets->size() == sizeBefore) {
            // The set was not stored, so zap() would never find it.
            // Free it here instead of leaking it.
            delete newSet;
            if (U_SUCCESS(fDeferredStatus)) {
                fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
            }
            break;
        }
        fSets8[i] = other.fSets8[i];

        if (U_FAILURE(fDeferredStatus)) {
            break;
        }
    }

    return *this;
}
118 
119 
120 //--------------------------------------------------------------------------
121 //
122 //    init        Shared initialization for use by constructors.
123 //                Bring an uninitialized RegexPattern up to a default state.
124 //
125 //--------------------------------------------------------------------------
init()126 void RegexPattern::init() {
127     fPattern.remove();
128     fFlags            = 0;
129     fCompiledPat      = 0;
130     fLiteralText.remove();
131     fSets             = NULL;
132     fSets8            = NULL;
133     fDeferredStatus   = U_ZERO_ERROR;
134     fMinMatchLen      = 0;
135     fFrameSize        = 0;
136     fDataSize         = 0;
137     fGroupMap         = NULL;
138     fMaxCaptureDigits = 1;
139     fStaticSets       = NULL;
140     fStaticSets8      = NULL;
141     fStartType        = START_NO_INFO;
142     fInitialStringIdx = 0;
143     fInitialStringLen = 0;
144     fInitialChars     = NULL;
145     fInitialChar      = 0;
146     fInitialChars8    = NULL;
147 
148     fCompiledPat      = new UVector32(fDeferredStatus);
149     fGroupMap         = new UVector32(fDeferredStatus);
150     fSets             = new UVector(fDeferredStatus);
151     fInitialChars     = new UnicodeSet;
152     fInitialChars8    = new Regex8BitSet;
153     if (U_FAILURE(fDeferredStatus)) {
154         return;
155     }
156     if (fCompiledPat == NULL  || fGroupMap == NULL || fSets == NULL ||
157         fInitialChars == NULL || fInitialChars8 == NULL) {
158         fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
159         return;
160     }
161 
162     // Slot zero of the vector of sets is reserved.  Fill it here.
163     fSets->addElement((int32_t)0, fDeferredStatus);
164 }
165 
166 
167 //--------------------------------------------------------------------------
168 //
169 //   zap            Delete everything owned by this RegexPattern.
170 //
171 //--------------------------------------------------------------------------
zap()172 void RegexPattern::zap() {
173     delete fCompiledPat;
174     fCompiledPat = NULL;
175     int i;
176     for (i=1; i<fSets->size(); i++) {
177         UnicodeSet *s;
178         s = (UnicodeSet *)fSets->elementAt(i);
179         if (s != NULL) {
180             delete s;
181         }
182     }
183     delete fSets;
184     fSets = NULL;
185     delete[] fSets8;
186     fSets8 = NULL;
187     delete fGroupMap;
188     fGroupMap = NULL;
189     delete fInitialChars;
190     fInitialChars = NULL;
191     delete fInitialChars8;
192     fInitialChars8 = NULL;
193 }
194 
195 
196 //--------------------------------------------------------------------------
197 //
198 //   Destructor
199 //
200 //--------------------------------------------------------------------------
~RegexPattern()201 RegexPattern::~RegexPattern() {
202     zap();
203 }
204 
205 
206 //--------------------------------------------------------------------------
207 //
208 //   Clone
209 //
210 //--------------------------------------------------------------------------
clone() const211 RegexPattern  *RegexPattern::clone() const {
212     RegexPattern  *copy = new RegexPattern(*this);
213     return copy;
214 }
215 
216 
217 //--------------------------------------------------------------------------
218 //
219 //   operator ==   (comparison)    Consider two patterns to be == if the
220 //                                 pattern strings and the flags are the same.
221 //
222 //--------------------------------------------------------------------------
operator ==(const RegexPattern & other) const223 UBool   RegexPattern::operator ==(const RegexPattern &other) const {
224     UBool r = this->fFlags    == other.fFlags &&
225               this->fPattern  == other.fPattern &&
226               this->fDeferredStatus == other.fDeferredStatus;
227     return r;
228 }
229 
230 //---------------------------------------------------------------------
231 //
232 //   compile
233 //
234 //---------------------------------------------------------------------
235 RegexPattern * U_EXPORT2
compile(const UnicodeString & regex,uint32_t flags,UParseError & pe,UErrorCode & status)236 RegexPattern::compile(const UnicodeString &regex,
237                       uint32_t             flags,
238                       UParseError          &pe,
239                       UErrorCode           &status)
240 {
241 
242     if (U_FAILURE(status)) {
243         return NULL;
244     }
245 
246     const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
247                               UREGEX_DOTALL   | UREGEX_MULTILINE        | UREGEX_UWORD |
248                               UREGEX_ERROR_ON_UNKNOWN_ESCAPES           | UREGEX_UNIX_LINES;
249 
250     if ((flags & ~allFlags) != 0) {
251         status = U_REGEX_INVALID_FLAG;
252         return NULL;
253     }
254 
255     if ((flags & UREGEX_CANON_EQ) != 0) {
256         status = U_REGEX_UNIMPLEMENTED;
257         return NULL;
258     }
259 
260     RegexPattern *This = new RegexPattern;
261     if (This == NULL) {
262         status = U_MEMORY_ALLOCATION_ERROR;
263         return NULL;
264     }
265     if (U_FAILURE(This->fDeferredStatus)) {
266         status = This->fDeferredStatus;
267         return NULL;
268     }
269     This->fFlags = flags;
270 
271     RegexCompile     compiler(This, status);
272     compiler.compile(regex, pe, status);
273 
274     if (U_FAILURE(status)) {
275         delete This;
276         This = NULL;
277     }
278 
279     return This;
280 }
281 
282 //
283 //   compile with default flags.
284 //
285 RegexPattern * U_EXPORT2
compile(const UnicodeString & regex,UParseError & pe,UErrorCode & err)286 RegexPattern::compile(const UnicodeString &regex,
287                       UParseError         &pe,
288                       UErrorCode          &err)
289 {
290     return compile(regex, 0, pe, err);
291 }
292 
293 
294 
295 //
296 //   compile with no UParseErr parameter.
297 //
298 RegexPattern * U_EXPORT2
compile(const UnicodeString & regex,uint32_t flags,UErrorCode & err)299 RegexPattern::compile( const UnicodeString &regex,
300         uint32_t             flags,
301         UErrorCode           &err)
302 {
303     UParseError pe;
304     return compile(regex, flags, pe, err);
305 }
306 
307 
308 
309 //---------------------------------------------------------------------
310 //
311 //   flags
312 //
313 //---------------------------------------------------------------------
flags() const314 uint32_t RegexPattern::flags() const {
315     return fFlags;
316 }
317 
318 
319 //---------------------------------------------------------------------
320 //
321 //   matcher(UnicodeString, err)
322 //
323 //---------------------------------------------------------------------
matcher(const UnicodeString & input,UErrorCode & status) const324 RegexMatcher *RegexPattern::matcher(const UnicodeString &input,
325                                     UErrorCode          &status)  const {
326     RegexMatcher    *retMatcher = matcher(status);
327     if (retMatcher != NULL) {
328         retMatcher->reset(input);
329     }
330     return retMatcher;
331 }
332 
333 #if 0
334 RegexMatcher *RegexPattern::matcher(const UChar * /*input*/,
335                                     UErrorCode          &status)  const
336 {
337     /* This should never get called. The API with UnicodeString should be called instead. */
338     if (U_SUCCESS(status)) {
339         status = U_UNSUPPORTED_ERROR;
340     }
341     return NULL;
342 }
343 #endif
344 
345 //---------------------------------------------------------------------
346 //
347 //   matcher(status)
348 //
349 //---------------------------------------------------------------------
matcher(UErrorCode & status) const350 RegexMatcher *RegexPattern::matcher(UErrorCode &status)  const {
351     RegexMatcher    *retMatcher = NULL;
352 
353     if (U_FAILURE(status)) {
354         return NULL;
355     }
356     if (U_FAILURE(fDeferredStatus)) {
357         status = fDeferredStatus;
358         return NULL;
359     }
360 
361     retMatcher = new RegexMatcher(this);
362     if (retMatcher == NULL) {
363         status = U_MEMORY_ALLOCATION_ERROR;
364         return NULL;
365     }
366     return retMatcher;
367 }
368 
369 
370 
371 //---------------------------------------------------------------------
372 //
373 //   matches        Convenience function to test for a match, starting
374 //                  with a pattern string and a data string.
375 //
376 //---------------------------------------------------------------------
matches(const UnicodeString & regex,const UnicodeString & input,UParseError & pe,UErrorCode & status)377 UBool U_EXPORT2 RegexPattern::matches(const UnicodeString   &regex,
378               const UnicodeString   &input,
379                     UParseError     &pe,
380                     UErrorCode      &status) {
381 
382     if (U_FAILURE(status)) {return FALSE;}
383 
384     UBool         retVal;
385     RegexPattern *pat     = NULL;
386     RegexMatcher *matcher = NULL;
387 
388     pat     = RegexPattern::compile(regex, 0, pe, status);
389     matcher = pat->matcher(input, status);
390     retVal  = matcher->matches(status);
391 
392     delete matcher;
393     delete pat;
394     return retVal;
395 }
396 
397 
398 
399 
400 //---------------------------------------------------------------------
401 //
402 //   pattern
403 //
404 //---------------------------------------------------------------------
pattern() const405 UnicodeString RegexPattern::pattern() const {
406     return fPattern;
407 }
408 
409 
410 
411 
412 //---------------------------------------------------------------------
413 //
414 //   split
415 //
416 //---------------------------------------------------------------------
split(const UnicodeString & input,UnicodeString dest[],int32_t destCapacity,UErrorCode & status) const417 int32_t  RegexPattern::split(const UnicodeString &input,
418         UnicodeString    dest[],
419         int32_t          destCapacity,
420         UErrorCode       &status) const
421 {
422     if (U_FAILURE(status)) {
423         return 0;
424     };
425 
426     RegexMatcher  m(this);
427     int32_t r = m.split(input, dest, destCapacity, status);
428     return r;
429 }
430 
431 
432 
433 //---------------------------------------------------------------------
434 //
435 //   dump    Output the compiled form of the pattern.
436 //           Debugging function only.
437 //
438 //---------------------------------------------------------------------
439 #if defined(REGEX_DEBUG)
dumpOp(int32_t index) const440 void   RegexPattern::dumpOp(int32_t index) const {
441     static const char * const opNames[] = {URX_OPCODE_NAMES};
442     int32_t op          = fCompiledPat->elementAti(index);
443     int32_t val         = URX_VAL(op);
444     int32_t type        = URX_TYPE(op);
445     int32_t pinnedType  = type;
446     if ((uint32_t)pinnedType >= sizeof(opNames)/sizeof(char *)) {
447         pinnedType = 0;
448     }
449 
450     REGEX_DUMP_DEBUG_PRINTF(("%4d   %08x    %-15s  ", index, op, opNames[pinnedType]));
451     switch (type) {
452     case URX_NOP:
453     case URX_DOTANY:
454     case URX_DOTANY_ALL:
455     case URX_FAIL:
456     case URX_CARET:
457     case URX_DOLLAR:
458     case URX_BACKSLASH_G:
459     case URX_BACKSLASH_X:
460     case URX_END:
461     case URX_DOLLAR_M:
462     case URX_CARET_M:
463         // Types with no operand field of interest.
464         break;
465 
466     case URX_RESERVED_OP:
467     case URX_START_CAPTURE:
468     case URX_END_CAPTURE:
469     case URX_STATE_SAVE:
470     case URX_JMP:
471     case URX_JMP_SAV:
472     case URX_JMP_SAV_X:
473     case URX_BACKSLASH_B:
474     case URX_BACKSLASH_BU:
475     case URX_BACKSLASH_D:
476     case URX_BACKSLASH_Z:
477     case URX_STRING_LEN:
478     case URX_CTR_INIT:
479     case URX_CTR_INIT_NG:
480     case URX_CTR_LOOP:
481     case URX_CTR_LOOP_NG:
482     case URX_RELOC_OPRND:
483     case URX_STO_SP:
484     case URX_LD_SP:
485     case URX_BACKREF:
486     case URX_STO_INP_LOC:
487     case URX_JMPX:
488     case URX_LA_START:
489     case URX_LA_END:
490     case URX_BACKREF_I:
491     case URX_LB_START:
492     case URX_LB_CONT:
493     case URX_LB_END:
494     case URX_LBN_CONT:
495     case URX_LBN_END:
496     case URX_LOOP_C:
497     case URX_LOOP_DOT_I:
498         // types with an integer operand field.
499         REGEX_DUMP_DEBUG_PRINTF(("%d", val));
500         break;
501 
502     case URX_ONECHAR:
503     case URX_ONECHAR_I:
504         REGEX_DUMP_DEBUG_PRINTF(("%c", val<256?val:'?'));
505         break;
506 
507     case URX_STRING:
508     case URX_STRING_I:
509         {
510             int32_t lengthOp       = fCompiledPat->elementAti(index+1);
511             U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
512             int32_t length = URX_VAL(lengthOp);
513             int32_t i;
514             for (i=val; i<val+length; i++) {
515                 UChar c = fLiteralText[i];
516                 if (c < 32 || c >= 256) {c = '.';}
517                 REGEX_DUMP_DEBUG_PRINTF(("%c", c));
518             }
519         }
520         break;
521 
522     case URX_SETREF:
523     case URX_LOOP_SR_I:
524         {
525             UnicodeString s;
526             UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
527             set->toPattern(s, TRUE);
528             for (int32_t i=0; i<s.length(); i++) {
529                 REGEX_DUMP_DEBUG_PRINTF(("%c", s.charAt(i)));
530             }
531         }
532         break;
533 
534     case URX_STATIC_SETREF:
535     case URX_STAT_SETREF_N:
536         {
537             UnicodeString s;
538             if (val & URX_NEG_SET) {
539                 REGEX_DUMP_DEBUG_PRINTF(("NOT "));
540                 val &= ~URX_NEG_SET;
541             }
542             UnicodeSet *set = fStaticSets[val];
543             set->toPattern(s, TRUE);
544             for (int32_t i=0; i<s.length(); i++) {
545                 REGEX_DUMP_DEBUG_PRINTF(("%c", s.charAt(i)));
546             }
547         }
548         break;
549 
550 
551     default:
552         REGEX_DUMP_DEBUG_PRINTF(("??????"));
553         break;
554     }
555     REGEX_DUMP_DEBUG_PRINTF(("\n"));
556 }
557 #endif
558 
559 
560 #if defined(REGEX_DEBUG)
561 U_CAPI void  U_EXPORT2
RegexPatternDump(const RegexPattern * This)562 RegexPatternDump(const RegexPattern *This) {
563     int      index;
564     int      i;
565 
566     REGEX_DUMP_DEBUG_PRINTF(("Original Pattern:  "));
567     for (i=0; i<This->fPattern.length(); i++) {
568         REGEX_DUMP_DEBUG_PRINTF(("%c", This->fPattern.charAt(i)));
569     }
570     REGEX_DUMP_DEBUG_PRINTF(("\n"));
571     REGEX_DUMP_DEBUG_PRINTF(("   Min Match Length:  %d\n", This->fMinMatchLen));
572     REGEX_DUMP_DEBUG_PRINTF(("   Match Start Type:  %s\n", START_OF_MATCH_STR(This->fStartType)));
573     if (This->fStartType == START_STRING) {
574         REGEX_DUMP_DEBUG_PRINTF(("    Initial match sting: \""));
575         for (i=This->fInitialStringIdx; i<This->fInitialStringIdx+This->fInitialStringLen; i++) {
576             REGEX_DUMP_DEBUG_PRINTF(("%c", This->fLiteralText[i]));   // TODO:  non-printables, surrogates.
577         }
578 
579     } else if (This->fStartType == START_SET) {
580         int32_t numSetChars = This->fInitialChars->size();
581         if (numSetChars > 20) {
582             numSetChars = 20;
583         }
584         REGEX_DUMP_DEBUG_PRINTF(("     Match First Chars : "));
585         for (i=0; i<numSetChars; i++) {
586             UChar32 c = This->fInitialChars->charAt(i);
587             if (0x20<c && c <0x7e) {
588                 REGEX_DUMP_DEBUG_PRINTF(("%c ", c));
589             } else {
590                 REGEX_DUMP_DEBUG_PRINTF(("%#x ", c));
591             }
592         }
593         if (numSetChars < This->fInitialChars->size()) {
594             REGEX_DUMP_DEBUG_PRINTF((" ..."));
595         }
596         REGEX_DUMP_DEBUG_PRINTF(("\n"));
597 
598     } else if (This->fStartType == START_CHAR) {
599         REGEX_DUMP_DEBUG_PRINTF(("    First char of Match : "));
600         if (0x20 < This->fInitialChar && This->fInitialChar<0x7e) {
601                 REGEX_DUMP_DEBUG_PRINTF(("%c\n", This->fInitialChar));
602             } else {
603                 REGEX_DUMP_DEBUG_PRINTF(("%#x\n", This->fInitialChar));
604             }
605     }
606 
607     REGEX_DUMP_DEBUG_PRINTF(("\nIndex   Binary     Type             Operand\n" \
608            "-------------------------------------------\n"));
609     for (index = 0; index<This->fCompiledPat->size(); index++) {
610         This->dumpOp(index);
611     }
612     REGEX_DUMP_DEBUG_PRINTF(("\n\n"));
613 }
614 #endif
615 
616 
617 
618 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern)
619 
620 U_NAMESPACE_END
621 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
622