• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 1999-2007, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  uniset_props.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2004aug25
14 *   created by: Markus W. Scherer
15 *
16 *   Character property dependent functions moved here from uniset.cpp
17 */
18 
19 #include "unicode/utypes.h"
20 #include "unicode/uniset.h"
21 #include "unicode/parsepos.h"
22 #include "unicode/uchar.h"
23 #include "unicode/uscript.h"
24 #include "unicode/symtable.h"
25 #include "unicode/uset.h"
26 #include "unicode/locid.h"
27 #include "unicode/brkiter.h"
28 #include "uset_imp.h"
29 #include "ruleiter.h"
30 #include "cmemory.h"
31 #include "ucln_cmn.h"
32 #include "util.h"
33 #include "uvector.h"
34 #include "uprops.h"
35 #include "propname.h"
36 #include "unormimp.h"
37 #include "ucase.h"
38 #include "ubidi_props.h"
39 #include "uinvchar.h"
40 #include "charstr.h"
41 #include "cstring.h"
42 #include "umutex.h"
43 #include "uassert.h"
44 #include "hash.h"
45 
46 U_NAMESPACE_USE
47 
// Number of elements in a true array (not a pointer), as an int32_t.
// The whole expansion is parenthesized so that it behaves as a single
// operand in any surrounding expression (e.g. after a unary sizeof).
#define LENGTHOF(array) ((int32_t)(sizeof(array)/sizeof((array)[0])))
49 
50 // initial storage. Must be >= 0
51 // *** same as in uniset.cpp ! ***
52 #define START_EXTRA 16
53 
// Define UChar constants using hex for EBCDIC compatibility
// (character literals would be compiled to the platform charset).
// Used #define to reduce private static exports and memory access time.
// All values are written in hex so the table reads uniformly.
#define SET_OPEN        ((UChar)0x005B) /*[*/
#define SET_CLOSE       ((UChar)0x005D) /*]*/
#define HYPHEN          ((UChar)0x002D) /*-*/
#define COMPLEMENT      ((UChar)0x005E) /*^*/
#define COLON           ((UChar)0x003A) /*:*/
#define BACKSLASH       ((UChar)0x005C) /*\*/
#define INTERSECTION    ((UChar)0x0026) /*&*/
#define UPPER_U         ((UChar)0x0055) /*U*/
#define LOWER_U         ((UChar)0x0075) /*u*/
#define OPEN_BRACE      ((UChar)0x007B) /*{*/
#define CLOSE_BRACE     ((UChar)0x007D) /*}*/
#define UPPER_P         ((UChar)0x0050) /*P*/
#define LOWER_P         ((UChar)0x0070) /*p*/
#define UPPER_N         ((UChar)0x004E) /*N*/
#define EQUALS          ((UChar)0x003D) /*=*/
71 
72 //static const UChar POSIX_OPEN[]  = { SET_OPEN,COLON,0 };  // "[:"
73 static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 };  // ":]"
74 //static const UChar PERL_OPEN[]   = { BACKSLASH,LOWER_P,0 }; // "\\p"
75 static const UChar PERL_CLOSE[]  = { CLOSE_BRACE,0 };    // "}"
76 //static const UChar NAME_OPEN[]   = { BACKSLASH,UPPER_N,0 };  // "\\N"
77 static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/
78 
79 // Special property set IDs
80 static const char ANY[]   = "ANY";   // [\u0000-\U0010FFFF]
81 static const char ASCII[] = "ASCII"; // [\u0000-\u007F]
82 static const char ASSIGNED[] = "Assigned"; // [:^Cn:]
83 
84 // Unicode name property alias
85 #define NAME_PROP "na"
86 #define NAME_PROP_LENGTH 2
87 
88 /**
89  * Delimiter string used in patterns to close a category reference:
90  * ":]".  Example: "[:Lu:]".
91  */
92 //static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */
93 
94 U_CDECL_BEGIN
95 
96 static UnicodeSet *INCLUSIONS[UPROPS_SRC_COUNT] = { NULL }; // cached getInclusions()
97 
98 //----------------------------------------------------------------
99 // Inclusions list
100 //----------------------------------------------------------------
101 
102 // USetAdder implementation
103 // Does not use uset.h to reduce code dependencies
104 static void U_CALLCONV
_set_add(USet * set,UChar32 c)105 _set_add(USet *set, UChar32 c) {
106     ((UnicodeSet *)set)->add(c);
107 }
108 
109 static void U_CALLCONV
_set_addRange(USet * set,UChar32 start,UChar32 end)110 _set_addRange(USet *set, UChar32 start, UChar32 end) {
111     ((UnicodeSet *)set)->add(start, end);
112 }
113 
114 static void U_CALLCONV
_set_addString(USet * set,const UChar * str,int32_t length)115 _set_addString(USet *set, const UChar *str, int32_t length) {
116     ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length));
117 }
118 
119 /**
120  * Cleanup function for UnicodeSet
121  */
uset_cleanup(void)122 static UBool U_CALLCONV uset_cleanup(void) {
123     int32_t i;
124 
125     for(i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) {
126         if (INCLUSIONS[i] != NULL) {
127             delete INCLUSIONS[i];
128             INCLUSIONS[i] = NULL;
129         }
130     }
131 
132     return TRUE;
133 }
134 
135 U_CDECL_END
136 
137 U_NAMESPACE_BEGIN
138 
getInclusions(int32_t src,UErrorCode & status)139 static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status) {
140     UBool needInit;
141     UMTX_CHECK(NULL, (INCLUSIONS[src] == NULL), needInit);
142     if (needInit) {
143         UnicodeSet* incl = new UnicodeSet();
144         USetAdder sa = {
145             (USet *)incl,
146             _set_add,
147             _set_addRange,
148             _set_addString,
149             NULL // don't need remove()
150         };
151 
152         if (incl != NULL) {
153             switch(src) {
154             case UPROPS_SRC_CHAR:
155                 uchar_addPropertyStarts(&sa, &status);
156                 break;
157             case UPROPS_SRC_PROPSVEC:
158                 upropsvec_addPropertyStarts(&sa, &status);
159                 break;
160             case UPROPS_SRC_CHAR_AND_PROPSVEC:
161                 uchar_addPropertyStarts(&sa, &status);
162                 upropsvec_addPropertyStarts(&sa, &status);
163                 break;
164             case UPROPS_SRC_HST:
165                 uhst_addPropertyStarts(&sa, &status);
166                 break;
167 #if !UCONFIG_NO_NORMALIZATION
168             case UPROPS_SRC_NORM:
169                 unorm_addPropertyStarts(&sa, &status);
170                 break;
171 #endif
172             case UPROPS_SRC_CASE:
173                 ucase_addPropertyStarts(ucase_getSingleton(&status), &sa, &status);
174                 break;
175             case UPROPS_SRC_BIDI:
176                 ubidi_addPropertyStarts(ubidi_getSingleton(&status), &sa, &status);
177                 break;
178             default:
179                 status = U_INTERNAL_PROGRAM_ERROR;
180                 break;
181             }
182             if (U_SUCCESS(status)) {
183                 // Compact for caching
184                 incl->compact();
185                 umtx_lock(NULL);
186                 if (INCLUSIONS[src] == NULL) {
187                     INCLUSIONS[src] = incl;
188                     incl = NULL;
189                     ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
190                 }
191                 umtx_unlock(NULL);
192             }
193             delete incl;
194         } else {
195             status = U_MEMORY_ALLOCATION_ERROR;
196         }
197     }
198     return INCLUSIONS[src];
199 }
200 
201 // helper functions for matching of pattern syntax pieces ------------------ ***
202 // these functions are parallel to the PERL_OPEN etc. strings above
203 
204 // using these functions is not only faster than UnicodeString::compare() and
205 // caseCompare(), but they also make UnicodeSet work for simple patterns when
206 // no Unicode properties data is available - when caseCompare() fails
207 
208 static inline UBool
isPerlOpen(const UnicodeString & pattern,int32_t pos)209 isPerlOpen(const UnicodeString &pattern, int32_t pos) {
210     UChar c;
211     return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P);
212 }
213 
214 /*static inline UBool
215 isPerlClose(const UnicodeString &pattern, int32_t pos) {
216     return pattern.charAt(pos)==CLOSE_BRACE;
217 }*/
218 
219 static inline UBool
isNameOpen(const UnicodeString & pattern,int32_t pos)220 isNameOpen(const UnicodeString &pattern, int32_t pos) {
221     return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N;
222 }
223 
224 static inline UBool
isPOSIXOpen(const UnicodeString & pattern,int32_t pos)225 isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
226     return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON;
227 }
228 
229 /*static inline UBool
230 isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
231     return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE;
232 }*/
233 
234 // TODO memory debugging provided inside uniset.cpp
235 // could be made available here but probably obsolete with use of modern
236 // memory leak checker tools
237 #define _dbgct(me)
238 
239 //----------------------------------------------------------------
240 // Constructors &c
241 //----------------------------------------------------------------
242 
243 /**
244  * Constructs a set from the given pattern, optionally ignoring
245  * white space.  See the class description for the syntax of the
246  * pattern language.
247  * @param pattern a string specifying what characters are in the set
248  */
UnicodeSet(const UnicodeString & pattern,UErrorCode & status)249 UnicodeSet::UnicodeSet(const UnicodeString& pattern,
250                        UErrorCode& status) :
251     len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
252     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL)
253 {
254     if(U_SUCCESS(status)){
255         list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
256         /* test for NULL */
257         if(list == NULL) {
258             status = U_MEMORY_ALLOCATION_ERROR;
259         }else{
260             allocateStrings(status);
261             applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
262         }
263     }
264     _dbgct(this);
265 }
266 
267 /**
268  * Constructs a set from the given pattern, optionally ignoring
269  * white space.  See the class description for the syntax of the
270  * pattern language.
271  * @param pattern a string specifying what characters are in the set
272  * @param options bitmask for options to apply to the pattern.
273  * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
274  */
UnicodeSet(const UnicodeString & pattern,uint32_t options,const SymbolTable * symbols,UErrorCode & status)275 UnicodeSet::UnicodeSet(const UnicodeString& pattern,
276                        uint32_t options,
277                        const SymbolTable* symbols,
278                        UErrorCode& status) :
279     len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
280     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL)
281 {
282     if(U_SUCCESS(status)){
283         list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
284         /* test for NULL */
285         if(list == NULL) {
286             status = U_MEMORY_ALLOCATION_ERROR;
287         }else{
288             allocateStrings(status);
289             applyPattern(pattern, options, symbols, status);
290         }
291     }
292     _dbgct(this);
293 }
294 
UnicodeSet(const UnicodeString & pattern,ParsePosition & pos,uint32_t options,const SymbolTable * symbols,UErrorCode & status)295 UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
296                        uint32_t options,
297                        const SymbolTable* symbols,
298                        UErrorCode& status) :
299     len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
300     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL)
301 {
302     if(U_SUCCESS(status)){
303         list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
304         /* test for NULL */
305         if(list == NULL) {
306             status = U_MEMORY_ALLOCATION_ERROR;
307         }else{
308             allocateStrings(status);
309             applyPattern(pattern, pos, options, symbols, status);
310         }
311     }
312     _dbgct(this);
313 }
314 
315 //----------------------------------------------------------------
316 // Public API
317 //----------------------------------------------------------------
318 
319 /**
320  * Modifies this set to represent the set specified by the given
321  * pattern, optionally ignoring white space.  See the class
322  * description for the syntax of the pattern language.
323  * @param pattern a string specifying what characters are in the set
324  * @param ignoreSpaces if <code>true</code>, all spaces in the
325  * pattern are ignored.  Spaces are those characters for which
326  * <code>uprv_isRuleWhiteSpace()</code> is <code>true</code>.
327  * Characters preceded by '\\' are escaped, losing any special
328  * meaning they otherwise have.  Spaces may be included by
329  * escaping them.
330  * @exception <code>IllegalArgumentException</code> if the pattern
331  * contains a syntax error.
332  */
applyPattern(const UnicodeString & pattern,UErrorCode & status)333 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
334                                      UErrorCode& status) {
335     return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
336 }
337 
338 
339 /**
340  * Modifies this set to represent the set specified by the given
341  * pattern, optionally ignoring white space.  See the class
342  * description for the syntax of the pattern language.
343  * @param pattern a string specifying what characters are in the set
344  * @param options bitmask for options to apply to the pattern.
345  * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
346  */
applyPattern(const UnicodeString & pattern,uint32_t options,const SymbolTable * symbols,UErrorCode & status)347 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
348                                      uint32_t options,
349                                      const SymbolTable* symbols,
350                                      UErrorCode& status) {
351     if (U_FAILURE(status) || isFrozen()) {
352         return *this;
353     }
354 
355     ParsePosition pos(0);
356     applyPattern(pattern, pos, options, symbols, status);
357     if (U_FAILURE(status)) return *this;
358 
359     int32_t i = pos.getIndex();
360 
361     if (options & USET_IGNORE_SPACE) {
362         // Skip over trailing whitespace
363         ICU_Utility::skipWhitespace(pattern, i, TRUE);
364     }
365 
366     if (i != pattern.length()) {
367         status = U_ILLEGAL_ARGUMENT_ERROR;
368     }
369     return *this;
370 }
371 
applyPattern(const UnicodeString & pattern,ParsePosition & pos,uint32_t options,const SymbolTable * symbols,UErrorCode & status)372 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
373                               ParsePosition& pos,
374                               uint32_t options,
375                               const SymbolTable* symbols,
376                               UErrorCode& status) {
377     if (U_FAILURE(status) || isFrozen()) {
378         return *this;
379     }
380     // Need to build the pattern in a temporary string because
381     // _applyPattern calls add() etc., which set pat to empty.
382     UnicodeString rebuiltPat;
383     RuleCharacterIterator chars(pattern, symbols, pos);
384     applyPattern(chars, symbols, rebuiltPat, options, status);
385     if (U_FAILURE(status)) return *this;
386     if (chars.inVariable()) {
387         // syntaxError(chars, "Extra chars in variable value");
388         status = U_MALFORMED_SET;
389         return *this;
390     }
391     setPattern(rebuiltPat);
392     return *this;
393 }
394 
395 /**
396  * Return true if the given position, in the given pattern, appears
397  * to be the start of a UnicodeSet pattern.
398  */
resemblesPattern(const UnicodeString & pattern,int32_t pos)399 UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
400     return ((pos+1) < pattern.length() &&
401             pattern.charAt(pos) == (UChar)91/*[*/) ||
402         resemblesPropertyPattern(pattern, pos);
403 }
404 
405 //----------------------------------------------------------------
406 // Implementation: Pattern parsing
407 //----------------------------------------------------------------
408 
409 /**
410  * A small all-inline class to manage a UnicodeSet pointer.  Add
411  * operator->() etc. as needed.
412  */
413 class UnicodeSetPointer {
414     UnicodeSet* p;
415 public:
UnicodeSetPointer()416     inline UnicodeSetPointer() : p(0) {}
~UnicodeSetPointer()417     inline ~UnicodeSetPointer() { delete p; }
pointer()418     inline UnicodeSet* pointer() { return p; }
allocate()419     inline UBool allocate() {
420         if (p == 0) {
421             p = new UnicodeSet();
422         }
423         return p != 0;
424     }
425 };
426 
427 /**
428  * Parse the pattern from the given RuleCharacterIterator.  The
429  * iterator is advanced over the parsed pattern.
430  * @param chars iterator over the pattern characters.  Upon return
431  * it will be advanced to the first character after the parsed
432  * pattern, or the end of the iteration if all characters are
433  * parsed.
434  * @param symbols symbol table to use to parse and dereference
435  * variables, or null if none.
436  * @param rebuiltPat the pattern that was parsed, rebuilt or
437  * copied from the input pattern, as appropriate.
438  * @param options a bit mask of zero or more of the following:
439  * IGNORE_SPACE, CASE.
440  */
applyPattern(RuleCharacterIterator & chars,const SymbolTable * symbols,UnicodeString & rebuiltPat,uint32_t options,UErrorCode & ec)441 void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
442                               const SymbolTable* symbols,
443                               UnicodeString& rebuiltPat,
444                               uint32_t options,
445                               UErrorCode& ec) {
446     if (U_FAILURE(ec)) return;
447 
448     // Syntax characters: [ ] ^ - & { }
449 
450     // Recognized special forms for chars, sets: c-c s-s s&s
451 
452     int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
453                    RuleCharacterIterator::PARSE_ESCAPES;
454     if ((options & USET_IGNORE_SPACE) != 0) {
455         opts |= RuleCharacterIterator::SKIP_WHITESPACE;
456     }
457 
458     UnicodeString patLocal, buf;
459     UBool usePat = FALSE;
460     UnicodeSetPointer scratch;
461     RuleCharacterIterator::Pos backup;
462 
463     // mode: 0=before [, 1=between [...], 2=after ]
464     // lastItem: 0=none, 1=char, 2=set
465     int8_t lastItem = 0, mode = 0;
466     UChar32 lastChar = 0;
467     UChar op = 0;
468 
469     UBool invert = FALSE;
470 
471     clear();
472 
473     while (mode != 2 && !chars.atEnd()) {
474         U_ASSERT((lastItem == 0 && op == 0) ||
475                  (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) ||
476                  (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ ||
477                                     op == INTERSECTION /*'&'*/)));
478 
479         UChar32 c = 0;
480         UBool literal = FALSE;
481         UnicodeSet* nested = 0; // alias - do not delete
482 
483         // -------- Check for property pattern
484 
485         // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
486         int8_t setMode = 0;
487         if (resemblesPropertyPattern(chars, opts)) {
488             setMode = 2;
489         }
490 
491         // -------- Parse '[' of opening delimiter OR nested set.
492         // If there is a nested set, use `setMode' to define how
493         // the set should be parsed.  If the '[' is part of the
494         // opening delimiter for this pattern, parse special
495         // strings "[", "[^", "[-", and "[^-".  Check for stand-in
496         // characters representing a nested set in the symbol
497         // table.
498 
499         else {
500             // Prepare to backup if necessary
501             chars.getPos(backup);
502             c = chars.next(opts, literal, ec);
503             if (U_FAILURE(ec)) return;
504 
505             if (c == 0x5B /*'['*/ && !literal) {
506                 if (mode == 1) {
507                     chars.setPos(backup); // backup
508                     setMode = 1;
509                 } else {
510                     // Handle opening '[' delimiter
511                     mode = 1;
512                     patLocal.append((UChar) 0x5B /*'['*/);
513                     chars.getPos(backup); // prepare to backup
514                     c = chars.next(opts, literal, ec);
515                     if (U_FAILURE(ec)) return;
516                     if (c == 0x5E /*'^'*/ && !literal) {
517                         invert = TRUE;
518                         patLocal.append((UChar) 0x5E /*'^'*/);
519                         chars.getPos(backup); // prepare to backup
520                         c = chars.next(opts, literal, ec);
521                         if (U_FAILURE(ec)) return;
522                     }
523                     // Fall through to handle special leading '-';
524                     // otherwise restart loop for nested [], \p{}, etc.
525                     if (c == HYPHEN /*'-'*/) {
526                         literal = TRUE;
527                         // Fall through to handle literal '-' below
528                     } else {
529                         chars.setPos(backup); // backup
530                         continue;
531                     }
532                 }
533             } else if (symbols != 0) {
534                 const UnicodeFunctor *m = symbols->lookupMatcher(c);
535                 if (m != 0) {
536                     if (m->getDynamicClassID() != UnicodeSet::getStaticClassID()) {
537                         ec = U_MALFORMED_SET;
538                         return;
539                     }
540                     // casting away const, but `nested' won't be modified
541                     // (important not to modify stored set)
542                     nested = (UnicodeSet*) m;
543                     setMode = 3;
544                 }
545             }
546         }
547 
548         // -------- Handle a nested set.  This either is inline in
549         // the pattern or represented by a stand-in that has
550         // previously been parsed and was looked up in the symbol
551         // table.
552 
553         if (setMode != 0) {
554             if (lastItem == 1) {
555                 if (op != 0) {
556                     // syntaxError(chars, "Char expected after operator");
557                     ec = U_MALFORMED_SET;
558                     return;
559                 }
560                 add(lastChar, lastChar);
561                 _appendToPat(patLocal, lastChar, FALSE);
562                 lastItem = 0;
563                 op = 0;
564             }
565 
566             if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) {
567                 patLocal.append(op);
568             }
569 
570             if (nested == 0) {
571                 // lazy allocation
572                 if (!scratch.allocate()) {
573                     ec = U_MEMORY_ALLOCATION_ERROR;
574                     return;
575                 }
576                 nested = scratch.pointer();
577             }
578             switch (setMode) {
579             case 1:
580                 nested->applyPattern(chars, symbols, patLocal, options, ec);
581                 break;
582             case 2:
583                 chars.skipIgnored(opts);
584                 nested->applyPropertyPattern(chars, patLocal, ec);
585                 if (U_FAILURE(ec)) return;
586                 break;
587             case 3: // `nested' already parsed
588                 nested->_toPattern(patLocal, FALSE);
589                 break;
590             }
591 
592             usePat = TRUE;
593 
594             if (mode == 0) {
595                 // Entire pattern is a category; leave parse loop
596                 *this = *nested;
597                 mode = 2;
598                 break;
599             }
600 
601             switch (op) {
602             case HYPHEN: /*'-'*/
603                 removeAll(*nested);
604                 break;
605             case INTERSECTION: /*'&'*/
606                 retainAll(*nested);
607                 break;
608             case 0:
609                 addAll(*nested);
610                 break;
611             }
612 
613             op = 0;
614             lastItem = 2;
615 
616             continue;
617         }
618 
619         if (mode == 0) {
620             // syntaxError(chars, "Missing '['");
621             ec = U_MALFORMED_SET;
622             return;
623         }
624 
625         // -------- Parse special (syntax) characters.  If the
626         // current character is not special, or if it is escaped,
627         // then fall through and handle it below.
628 
629         if (!literal) {
630             switch (c) {
631             case 0x5D /*']'*/:
632                 if (lastItem == 1) {
633                     add(lastChar, lastChar);
634                     _appendToPat(patLocal, lastChar, FALSE);
635                 }
636                 // Treat final trailing '-' as a literal
637                 if (op == HYPHEN /*'-'*/) {
638                     add(op, op);
639                     patLocal.append(op);
640                 } else if (op == INTERSECTION /*'&'*/) {
641                     // syntaxError(chars, "Trailing '&'");
642                     ec = U_MALFORMED_SET;
643                     return;
644                 }
645                 patLocal.append((UChar) 0x5D /*']'*/);
646                 mode = 2;
647                 continue;
648             case HYPHEN /*'-'*/:
649                 if (op == 0) {
650                     if (lastItem != 0) {
651                         op = (UChar) c;
652                         continue;
653                     } else {
654                         // Treat final trailing '-' as a literal
655                         add(c, c);
656                         c = chars.next(opts, literal, ec);
657                         if (U_FAILURE(ec)) return;
658                         if (c == 0x5D /*']'*/ && !literal) {
659                             patLocal.append(HYPHEN_RIGHT_BRACE);
660                             mode = 2;
661                             continue;
662                         }
663                     }
664                 }
665                 // syntaxError(chars, "'-' not after char or set");
666                 ec = U_MALFORMED_SET;
667                 return;
668             case INTERSECTION /*'&'*/:
669                 if (lastItem == 2 && op == 0) {
670                     op = (UChar) c;
671                     continue;
672                 }
673                 // syntaxError(chars, "'&' not after set");
674                 ec = U_MALFORMED_SET;
675                 return;
676             case 0x5E /*'^'*/:
677                 // syntaxError(chars, "'^' not after '['");
678                 ec = U_MALFORMED_SET;
679                 return;
680             case 0x7B /*'{'*/:
681                 if (op != 0) {
682                     // syntaxError(chars, "Missing operand after operator");
683                     ec = U_MALFORMED_SET;
684                     return;
685                 }
686                 if (lastItem == 1) {
687                     add(lastChar, lastChar);
688                     _appendToPat(patLocal, lastChar, FALSE);
689                 }
690                 lastItem = 0;
691                 buf.truncate(0);
692                 {
693                     UBool ok = FALSE;
694                     while (!chars.atEnd()) {
695                         c = chars.next(opts, literal, ec);
696                         if (U_FAILURE(ec)) return;
697                         if (c == 0x7D /*'}'*/ && !literal) {
698                             ok = TRUE;
699                             break;
700                         }
701                         buf.append(c);
702                     }
703                     if (buf.length() < 1 || !ok) {
704                         // syntaxError(chars, "Invalid multicharacter string");
705                         ec = U_MALFORMED_SET;
706                         return;
707                     }
708                 }
709                 // We have new string. Add it to set and continue;
710                 // we don't need to drop through to the further
711                 // processing
712                 add(buf);
713                 patLocal.append((UChar) 0x7B /*'{'*/);
714                 _appendToPat(patLocal, buf, FALSE);
715                 patLocal.append((UChar) 0x7D /*'}'*/);
716                 continue;
717             case SymbolTable::SYMBOL_REF:
718                 //         symbols  nosymbols
719                 // [a-$]   error    error (ambiguous)
720                 // [a$]    anchor   anchor
721                 // [a-$x]  var "x"* literal '$'
722                 // [a-$.]  error    literal '$'
723                 // *We won't get here in the case of var "x"
724                 {
725                     chars.getPos(backup);
726                     c = chars.next(opts, literal, ec);
727                     if (U_FAILURE(ec)) return;
728                     UBool anchor = (c == 0x5D /*']'*/ && !literal);
729                     if (symbols == 0 && !anchor) {
730                         c = SymbolTable::SYMBOL_REF;
731                         chars.setPos(backup);
732                         break; // literal '$'
733                     }
734                     if (anchor && op == 0) {
735                         if (lastItem == 1) {
736                             add(lastChar, lastChar);
737                             _appendToPat(patLocal, lastChar, FALSE);
738                         }
739                         add(U_ETHER);
740                         usePat = TRUE;
741                         patLocal.append((UChar) SymbolTable::SYMBOL_REF);
742                         patLocal.append((UChar) 0x5D /*']'*/);
743                         mode = 2;
744                         continue;
745                     }
746                     // syntaxError(chars, "Unquoted '$'");
747                     ec = U_MALFORMED_SET;
748                     return;
749                 }
750             default:
751                 break;
752             }
753         }
754 
755         // -------- Parse literal characters.  This includes both
756         // escaped chars ("\u4E01") and non-syntax characters
757         // ("a").
758 
759         switch (lastItem) {
760         case 0:
761             lastItem = 1;
762             lastChar = c;
763             break;
764         case 1:
765             if (op == HYPHEN /*'-'*/) {
766                 if (lastChar >= c) {
767                     // Don't allow redundant (a-a) or empty (b-a) ranges;
768                     // these are most likely typos.
769                     // syntaxError(chars, "Invalid range");
770                     ec = U_MALFORMED_SET;
771                     return;
772                 }
773                 add(lastChar, c);
774                 _appendToPat(patLocal, lastChar, FALSE);
775                 patLocal.append(op);
776                 _appendToPat(patLocal, c, FALSE);
777                 lastItem = 0;
778                 op = 0;
779             } else {
780                 add(lastChar, lastChar);
781                 _appendToPat(patLocal, lastChar, FALSE);
782                 lastChar = c;
783             }
784             break;
785         case 2:
786             if (op != 0) {
787                 // syntaxError(chars, "Set expected after operator");
788                 ec = U_MALFORMED_SET;
789                 return;
790             }
791             lastChar = c;
792             lastItem = 1;
793             break;
794         }
795     }
796 
797     if (mode != 2) {
798         // syntaxError(chars, "Missing ']'");
799         ec = U_MALFORMED_SET;
800         return;
801     }
802 
803     chars.skipIgnored(opts);
804 
805     /**
806      * Handle global flags (invert, case insensitivity).  If this
807      * pattern should be compiled case-insensitive, then we need
808      * to close over case BEFORE COMPLEMENTING.  This makes
809      * patterns like /[^abc]/i work.
810      */
811     if ((options & USET_CASE_INSENSITIVE) != 0) {
812         closeOver(USET_CASE_INSENSITIVE);
813     }
814     else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
815         closeOver(USET_ADD_CASE_MAPPINGS);
816     }
817     if (invert) {
818         complement();
819     }
820 
821     // Use the rebuilt pattern (patLocal) only if necessary.  Prefer the
822     // generated pattern.
823     if (usePat) {
824         rebuiltPat.append(patLocal);
825     } else {
826         _generatePattern(rebuiltPat, FALSE);
827     }
828 }
829 
830 //----------------------------------------------------------------
831 // Property set implementation
832 //----------------------------------------------------------------
833 
numericValueFilter(UChar32 ch,void * context)834 static UBool numericValueFilter(UChar32 ch, void* context) {
835     return u_getNumericValue(ch) == *(double*)context;
836 }
837 
generalCategoryMaskFilter(UChar32 ch,void * context)838 static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
839     int32_t value = *(int32_t*)context;
840     return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
841 }
842 
versionFilter(UChar32 ch,void * context)843 static UBool versionFilter(UChar32 ch, void* context) {
844     UVersionInfo v, none = { 0, 0, 0, 0};
845     UVersionInfo* version = (UVersionInfo*)context;
846     u_charAge(ch, v);
847     return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
848 }
849 
850 typedef struct {
851     UProperty prop;
852     int32_t value;
853 } IntPropertyContext;
854 
intPropertyFilter(UChar32 ch,void * context)855 static UBool intPropertyFilter(UChar32 ch, void* context) {
856     IntPropertyContext* c = (IntPropertyContext*)context;
857     return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
858 }
859 
860 
861 /**
862  * Generic filter-based scanning code for UCD property UnicodeSets.
863  */
applyFilter(UnicodeSet::Filter filter,void * context,int32_t src,UErrorCode & status)864 void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
865                              void* context,
866                              int32_t src,
867                              UErrorCode &status) {
868     // Walk through all Unicode characters, noting the start
869     // and end of each range for which filter.contain(c) is
870     // true.  Add each range to a set.
871     //
872     // To improve performance, use the INCLUSIONS set, which
873     // encodes information about character ranges that are known
874     // to have identical properties. INCLUSIONS contains
875     // only the first characters of such ranges.
876     //
877     // TODO Where possible, instead of scanning over code points,
878     // use internal property data to initialize UnicodeSets for
879     // those properties.  Scanning code points is slow.
880     if (U_FAILURE(status)) return;
881 
882     const UnicodeSet* inclusions = getInclusions(src, status);
883     if (U_FAILURE(status)) {
884         return;
885     }
886 
887     clear();
888 
889     UChar32 startHasProperty = -1;
890     int limitRange = inclusions->getRangeCount();
891 
892     for (int j=0; j<limitRange; ++j) {
893         // get current range
894         UChar32 start = inclusions->getRangeStart(j);
895         UChar32 end = inclusions->getRangeEnd(j);
896 
897         // for all the code points in the range, process
898         for (UChar32 ch = start; ch <= end; ++ch) {
899             // only add to this UnicodeSet on inflection points --
900             // where the hasProperty value changes to false
901             if ((*filter)(ch, context)) {
902                 if (startHasProperty < 0) {
903                     startHasProperty = ch;
904                 }
905             } else if (startHasProperty >= 0) {
906                 add(startHasProperty, ch-1);
907                 startHasProperty = -1;
908             }
909         }
910     }
911     if (startHasProperty >= 0) {
912         add((UChar32)startHasProperty, (UChar32)0x10FFFF);
913     }
914 }
915 
mungeCharName(char * dst,const char * src,int32_t dstCapacity)916 static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
917     /* Note: we use ' ' in compiler code page */
918     int32_t j = 0;
919     char ch;
920     --dstCapacity; /* make room for term. zero */
921     while ((ch = *src++) != 0) {
922         if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
923             continue;
924         }
925         if (j >= dstCapacity) return FALSE;
926         dst[j++] = ch;
927     }
928     if (j > 0 && dst[j-1] == ' ') --j;
929     dst[j] = 0;
930     return TRUE;
931 }
932 
933 //----------------------------------------------------------------
934 // Property set API
935 //----------------------------------------------------------------
936 
// Sets ec to U_ILLEGAL_ARGUMENT_ERROR and returns *this from the enclosing
// function; only usable inside member functions where "return *this" is valid.
#define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;}
938 
939 UnicodeSet&
applyIntPropertyValue(UProperty prop,int32_t value,UErrorCode & ec)940 UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
941     if (U_FAILURE(ec) || isFrozen()) return *this;
942 
943     if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
944         applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec);
945     } else {
946         IntPropertyContext c = {prop, value};
947         applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec);
948     }
949     return *this;
950 }
951 
952 UnicodeSet&
applyPropertyAlias(const UnicodeString & prop,const UnicodeString & value,UErrorCode & ec)953 UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
954                                const UnicodeString& value,
955                                UErrorCode& ec) {
956     if (U_FAILURE(ec) || isFrozen()) return *this;
957 
958     // prop and value used to be converted to char * using the default
959     // converter instead of the invariant conversion.
960     // This should not be necessary because all Unicode property and value
961     // names use only invariant characters.
962     // If there are any variant characters, then we won't find them anyway.
963     // Checking first avoids assertion failures in the conversion.
964     if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
965         !uprv_isInvariantUString(value.getBuffer(), value.length())
966     ) {
967         FAIL(ec);
968     }
969     CharString pname(prop);
970     CharString vname(value);
971 
972     UProperty p;
973     int32_t v;
974     UBool mustNotBeEmpty = FALSE, invert = FALSE;
975 
976     if (value.length() > 0) {
977         p = u_getPropertyEnum(pname);
978         if (p == UCHAR_INVALID_CODE) FAIL(ec);
979 
980         // Treat gc as gcm
981         if (p == UCHAR_GENERAL_CATEGORY) {
982             p = UCHAR_GENERAL_CATEGORY_MASK;
983         }
984 
985         if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
986             (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
987             (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
988             v = u_getPropertyValueEnum(p, vname);
989             if (v == UCHAR_INVALID_CODE) {
990                 // Handle numeric CCC
991                 if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
992                     p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
993                     p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
994                     char* end;
995                     double value = uprv_strtod(vname, &end);
996                     v = (int32_t) value;
997                     if (v != value || v < 0 || *end != 0) {
998                         // non-integral or negative value, or trailing junk
999                         FAIL(ec);
1000                     }
1001                     // If the resultant set is empty then the numeric value
1002                     // was invalid.
1003                     mustNotBeEmpty = TRUE;
1004                 } else {
1005                     FAIL(ec);
1006                 }
1007             }
1008         }
1009 
1010         else {
1011 
1012             switch (p) {
1013             case UCHAR_NUMERIC_VALUE:
1014                 {
1015                     char* end;
1016                     double value = uprv_strtod(vname, &end);
1017                     if (*end != 0) {
1018                         FAIL(ec);
1019                     }
1020                     applyFilter(numericValueFilter, &value, UPROPS_SRC_CHAR, ec);
1021                     return *this;
1022                 }
1023                 break;
1024             case UCHAR_NAME:
1025             case UCHAR_UNICODE_1_NAME:
1026                 {
1027                     // Must munge name, since u_charFromName() does not do
1028                     // 'loose' matching.
1029                     char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
1030                     if (!mungeCharName(buf, vname, sizeof(buf))) FAIL(ec);
1031                     UCharNameChoice choice = (p == UCHAR_NAME) ?
1032                         U_EXTENDED_CHAR_NAME : U_UNICODE_10_CHAR_NAME;
1033                     UChar32 ch = u_charFromName(choice, buf, &ec);
1034                     if (U_SUCCESS(ec)) {
1035                         clear();
1036                         add(ch);
1037                         return *this;
1038                     } else {
1039                         FAIL(ec);
1040                     }
1041                 }
1042                 break;
1043             case UCHAR_AGE:
1044                 {
1045                     // Must munge name, since u_versionFromString() does not do
1046                     // 'loose' matching.
1047                     char buf[128];
1048                     if (!mungeCharName(buf, vname, sizeof(buf))) FAIL(ec);
1049                     UVersionInfo version;
1050                     u_versionFromString(version, buf);
1051                     applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec);
1052                     return *this;
1053                 }
1054                 break;
1055             default:
1056                 // p is a non-binary, non-enumerated property that we
1057                 // don't support (yet).
1058                 FAIL(ec);
1059             }
1060         }
1061     }
1062 
1063     else {
1064         // value is empty.  Interpret as General Category, Script, or
1065         // Binary property.
1066         p = UCHAR_GENERAL_CATEGORY_MASK;
1067         v = u_getPropertyValueEnum(p, pname);
1068         if (v == UCHAR_INVALID_CODE) {
1069             p = UCHAR_SCRIPT;
1070             v = u_getPropertyValueEnum(p, pname);
1071             if (v == UCHAR_INVALID_CODE) {
1072                 p = u_getPropertyEnum(pname);
1073                 if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
1074                     v = 1;
1075                 } else if (0 == uprv_comparePropertyNames(ANY, pname)) {
1076                     set(MIN_VALUE, MAX_VALUE);
1077                     return *this;
1078                 } else if (0 == uprv_comparePropertyNames(ASCII, pname)) {
1079                     set(0, 0x7F);
1080                     return *this;
1081                 } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname)) {
1082                     // [:Assigned:]=[:^Cn:]
1083                     p = UCHAR_GENERAL_CATEGORY_MASK;
1084                     v = U_GC_CN_MASK;
1085                     invert = TRUE;
1086                 } else {
1087                     FAIL(ec);
1088                 }
1089             }
1090         }
1091     }
1092 
1093     applyIntPropertyValue(p, v, ec);
1094     if(invert) {
1095         complement();
1096     }
1097 
1098     if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) {
1099         // mustNotBeEmpty is set to true if an empty set indicates
1100         // invalid input.
1101         ec = U_ILLEGAL_ARGUMENT_ERROR;
1102     }
1103 
1104     return *this;
1105 }
1106 
1107 //----------------------------------------------------------------
1108 // Property set patterns
1109 //----------------------------------------------------------------
1110 
1111 /**
1112  * Return true if the given position, in the given pattern, appears
1113  * to be the start of a property set pattern.
1114  */
resemblesPropertyPattern(const UnicodeString & pattern,int32_t pos)1115 UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
1116                                            int32_t pos) {
1117     // Patterns are at least 5 characters long
1118     if ((pos+5) > pattern.length()) {
1119         return FALSE;
1120     }
1121 
1122     // Look for an opening [:, [:^, \p, or \P
1123     return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
1124 }
1125 
1126 /**
1127  * Return true if the given iterator appears to point at a
1128  * property pattern.  Regardless of the result, return with the
1129  * iterator unchanged.
1130  * @param chars iterator over the pattern characters.  Upon return
1131  * it will be unchanged.
1132  * @param iterOpts RuleCharacterIterator options
1133  */
resemblesPropertyPattern(RuleCharacterIterator & chars,int32_t iterOpts)1134 UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
1135                                            int32_t iterOpts) {
1136     // NOTE: literal will always be FALSE, because we don't parse escapes.
1137     UBool result = FALSE, literal;
1138     UErrorCode ec = U_ZERO_ERROR;
1139     iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
1140     RuleCharacterIterator::Pos pos;
1141     chars.getPos(pos);
1142     UChar32 c = chars.next(iterOpts, literal, ec);
1143     if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) {
1144         UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
1145                                literal, ec);
1146         result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) :
1147                  (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/);
1148     }
1149     chars.setPos(pos);
1150     return result && U_SUCCESS(ec);
1151 }
1152 
1153 /**
1154  * Parse the given property pattern at the given parse position.
1155  */
applyPropertyPattern(const UnicodeString & pattern,ParsePosition & ppos,UErrorCode & ec)1156 UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
1157                                              ParsePosition& ppos,
1158                                              UErrorCode &ec) {
1159     int32_t pos = ppos.getIndex();
1160 
1161     UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
1162     UBool isName = FALSE; // true for \N{pat}, o/w false
1163     UBool invert = FALSE;
1164 
1165     if (U_FAILURE(ec)) return *this;
1166 
1167     // Minimum length is 5 characters, e.g. \p{L}
1168     if ((pos+5) > pattern.length()) {
1169         FAIL(ec);
1170     }
1171 
1172     // On entry, ppos should point to one of the following locations:
1173     // Look for an opening [:, [:^, \p, or \P
1174     if (isPOSIXOpen(pattern, pos)) {
1175         posix = TRUE;
1176         pos += 2;
1177         pos = ICU_Utility::skipWhitespace(pattern, pos);
1178         if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) {
1179             ++pos;
1180             invert = TRUE;
1181         }
1182     } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {
1183         UChar c = pattern.charAt(pos+1);
1184         invert = (c == UPPER_P);
1185         isName = (c == UPPER_N);
1186         pos += 2;
1187         pos = ICU_Utility::skipWhitespace(pattern, pos);
1188         if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) {
1189             // Syntax error; "\p" or "\P" not followed by "{"
1190             FAIL(ec);
1191         }
1192     } else {
1193         // Open delimiter not seen
1194         FAIL(ec);
1195     }
1196 
1197     // Look for the matching close delimiter, either :] or }
1198     int32_t close = pattern.indexOf(posix ? POSIX_CLOSE : PERL_CLOSE, pos);
1199     if (close < 0) {
1200         // Syntax error; close delimiter missing
1201         FAIL(ec);
1202     }
1203 
1204     // Look for an '=' sign.  If this is present, we will parse a
1205     // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
1206     // pattern.
1207     int32_t equals = pattern.indexOf(EQUALS, pos);
1208     UnicodeString propName, valueName;
1209     if (equals >= 0 && equals < close && !isName) {
1210         // Equals seen; parse medium/long pattern
1211         pattern.extractBetween(pos, equals, propName);
1212         pattern.extractBetween(equals+1, close, valueName);
1213     }
1214 
1215     else {
1216         // Handle case where no '=' is seen, and \N{}
1217         pattern.extractBetween(pos, close, propName);
1218 
1219         // Handle \N{name}
1220         if (isName) {
1221             // This is a little inefficient since it means we have to
1222             // parse NAME_PROP back to UCHAR_NAME even though we already
1223             // know it's UCHAR_NAME.  If we refactor the API to
1224             // support args of (UProperty, char*) then we can remove
1225             // NAME_PROP and make this a little more efficient.
1226             valueName = propName;
1227             propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);
1228         }
1229     }
1230 
1231     applyPropertyAlias(propName, valueName, ec);
1232 
1233     if (U_SUCCESS(ec)) {
1234         if (invert) {
1235             complement();
1236         }
1237 
1238         // Move to the limit position after the close delimiter if the
1239         // parse succeeded.
1240         ppos.setIndex(close + (posix ? 2 : 1));
1241     }
1242 
1243     return *this;
1244 }
1245 
1246 /**
1247  * Parse a property pattern.
1248  * @param chars iterator over the pattern characters.  Upon return
1249  * it will be advanced to the first character after the parsed
1250  * pattern, or the end of the iteration if all characters are
1251  * parsed.
1252  * @param rebuiltPat the pattern that was parsed, rebuilt or
1253  * copied from the input pattern, as appropriate.
1254  */
applyPropertyPattern(RuleCharacterIterator & chars,UnicodeString & rebuiltPat,UErrorCode & ec)1255 void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
1256                                       UnicodeString& rebuiltPat,
1257                                       UErrorCode& ec) {
1258     if (U_FAILURE(ec)) return;
1259     UnicodeString pattern;
1260     chars.lookahead(pattern);
1261     ParsePosition pos(0);
1262     applyPropertyPattern(pattern, pos, ec);
1263     if (U_FAILURE(ec)) return;
1264     if (pos.getIndex() == 0) {
1265         // syntaxError(chars, "Invalid property pattern");
1266         ec = U_MALFORMED_SET;
1267         return;
1268     }
1269     chars.jumpahead(pos.getIndex());
1270     rebuiltPat.append(pattern, 0, pos.getIndex());
1271 }
1272 
1273 //----------------------------------------------------------------
1274 // Case folding API
1275 //----------------------------------------------------------------
1276 
1277 // add the result of a full case mapping to the set
1278 // use str as a temporary string to avoid constructing one
1279 static inline void
addCaseMapping(UnicodeSet & set,int32_t result,const UChar * full,UnicodeString & str)1280 addCaseMapping(UnicodeSet &set, int32_t result, const UChar *full, UnicodeString &str) {
1281     if(result >= 0) {
1282         if(result > UCASE_MAX_STRING_LENGTH) {
1283             // add a single-code point case mapping
1284             set.add(result);
1285         } else {
1286             // add a string case mapping from full with length result
1287             str.setTo((UBool)FALSE, full, result);
1288             set.add(str);
1289         }
1290     }
1291     // result < 0: the code point mapped to itself, no need to add it
1292     // see ucase.h
1293 }
1294 
closeOver(int32_t attribute)1295 UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
1296     if (isFrozen()) {
1297         return *this;
1298     }
1299     if (attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) {
1300         UErrorCode status = U_ZERO_ERROR;
1301         const UCaseProps *csp = ucase_getSingleton(&status);
1302         if (U_SUCCESS(status)) {
1303             UnicodeSet foldSet(*this);
1304             UnicodeString str;
1305             USetAdder sa = {
1306                 (USet *)&foldSet,
1307                 _set_add,
1308                 _set_addRange,
1309                 _set_addString,
1310                 NULL // don't need remove()
1311             };
1312 
1313             // start with input set to guarantee inclusion
1314             // USET_CASE: remove strings because the strings will actually be reduced (folded);
1315             //            therefore, start with no strings and add only those needed
1316             if (attribute & USET_CASE_INSENSITIVE) {
1317                 foldSet.strings->removeAllElements();
1318             }
1319 
1320             int32_t n = getRangeCount();
1321             UChar32 result;
1322             const UChar *full;
1323             int32_t locCache = 0;
1324 
1325             for (int32_t i=0; i<n; ++i) {
1326                 UChar32 start = getRangeStart(i);
1327                 UChar32 end   = getRangeEnd(i);
1328 
1329                 if (attribute & USET_CASE_INSENSITIVE) {
1330                     // full case closure
1331                     for (UChar32 cp=start; cp<=end; ++cp) {
1332                         ucase_addCaseClosure(csp, cp, &sa);
1333                     }
1334                 } else {
1335                     // add case mappings
1336                     // (does not add long s for regular s, or Kelvin for k, for example)
1337                     for (UChar32 cp=start; cp<=end; ++cp) {
1338                         result = ucase_toFullLower(csp, cp, NULL, NULL, &full, "", &locCache);
1339                         addCaseMapping(foldSet, result, full, str);
1340 
1341                         result = ucase_toFullTitle(csp, cp, NULL, NULL, &full, "", &locCache);
1342                         addCaseMapping(foldSet, result, full, str);
1343 
1344                         result = ucase_toFullUpper(csp, cp, NULL, NULL, &full, "", &locCache);
1345                         addCaseMapping(foldSet, result, full, str);
1346 
1347                         result = ucase_toFullFolding(csp, cp, &full, 0);
1348                         addCaseMapping(foldSet, result, full, str);
1349                     }
1350                 }
1351             }
1352             if (strings != NULL && strings->size() > 0) {
1353                 if (attribute & USET_CASE_INSENSITIVE) {
1354                     for (int32_t j=0; j<strings->size(); ++j) {
1355                         str = *(const UnicodeString *) strings->elementAt(j);
1356                         str.foldCase();
1357                         if(!ucase_addStringCaseClosure(csp, str.getBuffer(), str.length(), &sa)) {
1358                             foldSet.add(str); // does not map to code points: add the folded string itself
1359                         }
1360                     }
1361                 } else {
1362                     Locale root("");
1363 #if !UCONFIG_NO_BREAK_ITERATION
1364                     BreakIterator *bi = BreakIterator::createWordInstance(root, status);
1365 #endif
1366                     if (U_SUCCESS(status)) {
1367                         const UnicodeString *pStr;
1368 
1369                         for (int32_t j=0; j<strings->size(); ++j) {
1370                             pStr = (const UnicodeString *) strings->elementAt(j);
1371                             (str = *pStr).toLower(root);
1372                             foldSet.add(str);
1373 #if !UCONFIG_NO_BREAK_ITERATION
1374                             (str = *pStr).toTitle(bi, root);
1375                             foldSet.add(str);
1376 #endif
1377                             (str = *pStr).toUpper(root);
1378                             foldSet.add(str);
1379                             (str = *pStr).foldCase();
1380                             foldSet.add(str);
1381                         }
1382                     }
1383 #if !UCONFIG_NO_BREAK_ITERATION
1384                     delete bi;
1385 #endif
1386                 }
1387             }
1388             *this = foldSet;
1389         }
1390     }
1391     return *this;
1392 }
1393 
1394 U_NAMESPACE_END
1395