• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 1999-2010, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  uniset_props.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:4
12 *
13 *   created on: 2004aug25
14 *   created by: Markus W. Scherer
15 *
16 *   Character property dependent functions moved here from uniset.cpp
17 */
18 
19 #include "unicode/utypes.h"
20 #include "unicode/uniset.h"
21 #include "unicode/parsepos.h"
22 #include "unicode/uchar.h"
23 #include "unicode/uscript.h"
24 #include "unicode/symtable.h"
25 #include "unicode/uset.h"
26 #include "unicode/locid.h"
27 #include "unicode/brkiter.h"
28 #include "uset_imp.h"
29 #include "ruleiter.h"
30 #include "cmemory.h"
31 #include "ucln_cmn.h"
32 #include "util.h"
33 #include "uvector.h"
34 #include "uprops.h"
35 #include "propname.h"
36 #include "normalizer2impl.h"
37 #include "unormimp.h"
38 #include "ucase.h"
39 #include "ubidi_props.h"
40 #include "uinvchar.h"
41 #include "uprops.h"
42 #include "charstr.h"
43 #include "cstring.h"
44 #include "mutex.h"
45 #include "umutex.h"
46 #include "uassert.h"
47 #include "hash.h"
48 
49 U_NAMESPACE_USE
50 
// Element count of a true array (not a pointer), cast to int32_t.
#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))

// initial storage. Must be >= 0
// *** same as in uniset.cpp ! ***
// The pattern constructors below use this as the initial `capacity`
// of the `list` array they allocate.
#define START_EXTRA 16
56 
// Define UChar constants using hex for EBCDIC compatibility
// Used #define to reduce private static exports and memory access time.
// (A few entries are written in decimal; the trailing comment on each line
// shows the character the code unit stands for.)
#define SET_OPEN        ((UChar)0x005B) /*[*/
#define SET_CLOSE       ((UChar)0x005D) /*]*/
#define HYPHEN          ((UChar)0x002D) /*-*/
#define COMPLEMENT      ((UChar)0x005E) /*^*/
#define COLON           ((UChar)0x003A) /*:*/
#define BACKSLASH       ((UChar)0x005C) /*\*/
#define INTERSECTION    ((UChar)0x0026) /*&*/
#define UPPER_U         ((UChar)0x0055) /*U*/
#define LOWER_U         ((UChar)0x0075) /*u*/
#define OPEN_BRACE      ((UChar)123)    /*{*/
#define CLOSE_BRACE     ((UChar)125)    /*}*/
#define UPPER_P         ((UChar)0x0050) /*P*/
#define LOWER_P         ((UChar)0x0070) /*p*/
#define UPPER_N         ((UChar)78)     /*N*/
#define EQUALS          ((UChar)0x003D) /*=*/

// NUL-terminated delimiter strings.  The *_OPEN variants are commented out;
// the isPerlOpen()/isNameOpen()/isPOSIXOpen() helpers further down in this
// file test for the opening delimiters directly.
//static const UChar POSIX_OPEN[]  = { SET_OPEN,COLON,0 };  // "[:"
static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 };  // ":]"
//static const UChar PERL_OPEN[]   = { BACKSLASH,LOWER_P,0 }; // "\\p"
static const UChar PERL_CLOSE[]  = { CLOSE_BRACE,0 };    // "}"
//static const UChar NAME_OPEN[]   = { BACKSLASH,UPPER_N,0 };  // "\\N"
static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/
81 
// Special property set IDs
// (the trailing comment on each line gives the equivalent set pattern)
static const char ANY[]   = "ANY";   // [\u0000-\U0010FFFF]
static const char ASCII[] = "ASCII"; // [\u0000-\u007F]
static const char ASSIGNED[] = "Assigned"; // [:^Cn:]

// Unicode name property alias
// NAME_PROP_LENGTH is the length of NAME_PROP without the terminating NUL.
#define NAME_PROP "na"
#define NAME_PROP_LENGTH 2
90 
91 /**
92  * Delimiter string used in patterns to close a category reference:
93  * ":]".  Example: "[:Lu:]".
94  */
95 //static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */
96 
97 // Cached sets ------------------------------------------------------------- ***
98 
99 U_CDECL_BEGIN
100 static UBool U_CALLCONV uset_cleanup();
101 U_CDECL_END
102 
103 // Not a TriStateSingletonWrapper because we think the UnicodeSet constructor
104 // can only fail with an out-of-memory error
105 // if we have a correct pattern and the properties data is hardcoded and always available.
106 class UnicodeSetSingleton : public SimpleSingletonWrapper<UnicodeSet> {
107 public:
UnicodeSetSingleton(SimpleSingleton & s,const char * pattern)108     UnicodeSetSingleton(SimpleSingleton &s, const char *pattern) :
109             SimpleSingletonWrapper<UnicodeSet>(s), fPattern(pattern) {}
getInstance(UErrorCode & errorCode)110     UnicodeSet *getInstance(UErrorCode &errorCode) {
111         return SimpleSingletonWrapper<UnicodeSet>::getInstance(createInstance, fPattern, errorCode);
112     }
113 private:
createInstance(const void * context,UErrorCode & errorCode)114     static void *createInstance(const void *context, UErrorCode &errorCode) {
115         UnicodeString pattern((const char *)context, -1, US_INV);
116         UnicodeSet *set=new UnicodeSet(pattern, errorCode);
117         if(set==NULL) {
118             errorCode=U_MEMORY_ALLOCATION_ERROR;
119         }
120         set->freeze();
121         ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
122         return set;
123     }
124 
125     const char *fPattern;
126 };
127 
128 U_CDECL_BEGIN
129 
// One cache slot per property data source (indexed by the UPROPS_SRC_* value).
// Slots are filled lazily by UnicodeSet::getInclusions() and released by
// uset_cleanup().
static UnicodeSet *INCLUSIONS[UPROPS_SRC_COUNT] = { NULL }; // cached getInclusions()

// Singleton storage used by uniset_getUnicode32Instance() and released by
// uset_cleanup().
STATIC_SIMPLE_SINGLETON(uni32Singleton);
133 
134 //----------------------------------------------------------------
135 // Inclusions list
136 //----------------------------------------------------------------
137 
138 // USetAdder implementation
139 // Does not use uset.h to reduce code dependencies
140 static void U_CALLCONV
_set_add(USet * set,UChar32 c)141 _set_add(USet *set, UChar32 c) {
142     ((UnicodeSet *)set)->add(c);
143 }
144 
145 static void U_CALLCONV
_set_addRange(USet * set,UChar32 start,UChar32 end)146 _set_addRange(USet *set, UChar32 start, UChar32 end) {
147     ((UnicodeSet *)set)->add(start, end);
148 }
149 
150 static void U_CALLCONV
_set_addString(USet * set,const UChar * str,int32_t length)151 _set_addString(USet *set, const UChar *str, int32_t length) {
152     ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length));
153 }
154 
155 /**
156  * Cleanup function for UnicodeSet
157  */
uset_cleanup(void)158 static UBool U_CALLCONV uset_cleanup(void) {
159     int32_t i;
160 
161     for(i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) {
162         if (INCLUSIONS[i] != NULL) {
163             delete INCLUSIONS[i];
164             INCLUSIONS[i] = NULL;
165         }
166     }
167     UnicodeSetSingleton(uni32Singleton, NULL).deleteInstance();
168     return TRUE;
169 }
170 
171 U_CDECL_END
172 
173 U_NAMESPACE_BEGIN
174 
175 /*
176 Reduce excessive reallocation, and make it easier to detect initialization
177 problems.
178 Usually you don't see smaller sets than this for Unicode 5.0.
179 */
180 #define DEFAULT_INCLUSION_CAPACITY 3072
181 
getInclusions(int32_t src,UErrorCode & status)182 const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) {
183     UBool needInit;
184     UMTX_CHECK(NULL, (INCLUSIONS[src] == NULL), needInit);
185     if (needInit) {
186         UnicodeSet* incl = new UnicodeSet();
187         USetAdder sa = {
188             (USet *)incl,
189             _set_add,
190             _set_addRange,
191             _set_addString,
192             NULL, // don't need remove()
193             NULL // don't need removeRange()
194         };
195         incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status);
196         if (incl != NULL) {
197             switch(src) {
198             case UPROPS_SRC_CHAR:
199                 uchar_addPropertyStarts(&sa, &status);
200                 break;
201             case UPROPS_SRC_PROPSVEC:
202                 upropsvec_addPropertyStarts(&sa, &status);
203                 break;
204             case UPROPS_SRC_CHAR_AND_PROPSVEC:
205                 uchar_addPropertyStarts(&sa, &status);
206                 upropsvec_addPropertyStarts(&sa, &status);
207                 break;
208 #if !UCONFIG_NO_NORMALIZATION
209             case UPROPS_SRC_NORM:
210                 unorm_addPropertyStarts(&sa, &status);
211                 break;
212             case UPROPS_SRC_CASE_AND_NORM:
213                 ucase_addPropertyStarts(ucase_getSingleton(&status), &sa, &status);
214                 unorm_addPropertyStarts(&sa, &status);
215                 break;
216             case UPROPS_SRC_NFC: {
217                 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
218                 if(U_SUCCESS(status)) {
219                     impl->addPropertyStarts(&sa, status);
220                 }
221                 break;
222             }
223             case UPROPS_SRC_NFKC: {
224                 const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(status);
225                 if(U_SUCCESS(status)) {
226                     impl->addPropertyStarts(&sa, status);
227                 }
228                 break;
229             }
230             case UPROPS_SRC_NFKC_CF: {
231                 const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(status);
232                 if(U_SUCCESS(status)) {
233                     impl->addPropertyStarts(&sa, status);
234                 }
235                 break;
236             }
237 #endif
238             case UPROPS_SRC_CASE:
239                 ucase_addPropertyStarts(ucase_getSingleton(&status), &sa, &status);
240                 break;
241             case UPROPS_SRC_BIDI:
242                 ubidi_addPropertyStarts(ubidi_getSingleton(&status), &sa, &status);
243                 break;
244             default:
245                 status = U_INTERNAL_PROGRAM_ERROR;
246                 break;
247             }
248             if (U_SUCCESS(status)) {
249                 // Compact for caching
250                 incl->compact();
251                 umtx_lock(NULL);
252                 if (INCLUSIONS[src] == NULL) {
253                     INCLUSIONS[src] = incl;
254                     incl = NULL;
255                     ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
256                 }
257                 umtx_unlock(NULL);
258             }
259             delete incl;
260         } else {
261             status = U_MEMORY_ALLOCATION_ERROR;
262         }
263     }
264     return INCLUSIONS[src];
265 }
266 
267 // Cache some sets for other services -------------------------------------- ***
268 
269 U_CFUNC UnicodeSet *
uniset_getUnicode32Instance(UErrorCode & errorCode)270 uniset_getUnicode32Instance(UErrorCode &errorCode) {
271     return UnicodeSetSingleton(uni32Singleton, "[:age=3.2:]").getInstance(errorCode);
272 }
273 
274 // helper functions for matching of pattern syntax pieces ------------------ ***
275 // these functions are parallel to the PERL_OPEN etc. strings above
276 
277 // using these functions is not only faster than UnicodeString::compare() and
278 // caseCompare(), but they also make UnicodeSet work for simple patterns when
279 // no Unicode properties data is available - when caseCompare() fails
280 
281 static inline UBool
isPerlOpen(const UnicodeString & pattern,int32_t pos)282 isPerlOpen(const UnicodeString &pattern, int32_t pos) {
283     UChar c;
284     return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P);
285 }
286 
287 /*static inline UBool
288 isPerlClose(const UnicodeString &pattern, int32_t pos) {
289     return pattern.charAt(pos)==CLOSE_BRACE;
290 }*/
291 
292 static inline UBool
isNameOpen(const UnicodeString & pattern,int32_t pos)293 isNameOpen(const UnicodeString &pattern, int32_t pos) {
294     return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N;
295 }
296 
297 static inline UBool
isPOSIXOpen(const UnicodeString & pattern,int32_t pos)298 isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
299     return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON;
300 }
301 
302 /*static inline UBool
303 isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
304     return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE;
305 }*/
306 
307 // TODO memory debugging provided inside uniset.cpp
308 // could be made available here but probably obsolete with use of modern
309 // memory leak checker tools
310 #define _dbgct(me)
311 
312 //----------------------------------------------------------------
313 // Constructors &c
314 //----------------------------------------------------------------
315 
316 /**
317  * Constructs a set from the given pattern, optionally ignoring
318  * white space.  See the class description for the syntax of the
319  * pattern language.
320  * @param pattern a string specifying what characters are in the set
321  */
UnicodeSet(const UnicodeString & pattern,UErrorCode & status)322 UnicodeSet::UnicodeSet(const UnicodeString& pattern,
323                        UErrorCode& status) :
324     len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
325     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
326     fFlags(0)
327 {
328     if(U_SUCCESS(status)){
329         list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
330         /* test for NULL */
331         if(list == NULL) {
332             status = U_MEMORY_ALLOCATION_ERROR;
333         }else{
334             allocateStrings(status);
335             applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
336         }
337     }
338     _dbgct(this);
339 }
340 
341 /**
342  * Constructs a set from the given pattern, optionally ignoring
343  * white space.  See the class description for the syntax of the
344  * pattern language.
345  * @param pattern a string specifying what characters are in the set
346  * @param options bitmask for options to apply to the pattern.
347  * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
348  */
UnicodeSet(const UnicodeString & pattern,uint32_t options,const SymbolTable * symbols,UErrorCode & status)349 UnicodeSet::UnicodeSet(const UnicodeString& pattern,
350                        uint32_t options,
351                        const SymbolTable* symbols,
352                        UErrorCode& status) :
353     len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
354     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
355     fFlags(0)
356 {
357     if(U_SUCCESS(status)){
358         list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
359         /* test for NULL */
360         if(list == NULL) {
361             status = U_MEMORY_ALLOCATION_ERROR;
362         }else{
363             allocateStrings(status);
364             applyPattern(pattern, options, symbols, status);
365         }
366     }
367     _dbgct(this);
368 }
369 
UnicodeSet(const UnicodeString & pattern,ParsePosition & pos,uint32_t options,const SymbolTable * symbols,UErrorCode & status)370 UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
371                        uint32_t options,
372                        const SymbolTable* symbols,
373                        UErrorCode& status) :
374     len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
375     bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
376     fFlags(0)
377 {
378     if(U_SUCCESS(status)){
379         list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
380         /* test for NULL */
381         if(list == NULL) {
382             status = U_MEMORY_ALLOCATION_ERROR;
383         }else{
384             allocateStrings(status);
385             applyPattern(pattern, pos, options, symbols, status);
386         }
387     }
388     _dbgct(this);
389 }
390 
391 //----------------------------------------------------------------
392 // Public API
393 //----------------------------------------------------------------
394 
395 /**
396  * Modifies this set to represent the set specified by the given
397  * pattern, optionally ignoring white space.  See the class
398  * description for the syntax of the pattern language.
399  * @param pattern a string specifying what characters are in the set
400  * @param ignoreSpaces if <code>true</code>, all spaces in the
401  * pattern are ignored.  Spaces are those characters for which
402  * <code>uprv_isRuleWhiteSpace()</code> is <code>true</code>.
403  * Characters preceded by '\\' are escaped, losing any special
404  * meaning they otherwise have.  Spaces may be included by
405  * escaping them.
406  * @exception <code>IllegalArgumentException</code> if the pattern
407  * contains a syntax error.
408  */
applyPattern(const UnicodeString & pattern,UErrorCode & status)409 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
410                                      UErrorCode& status) {
411     return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
412 }
413 
414 
415 /**
416  * Modifies this set to represent the set specified by the given
417  * pattern, optionally ignoring white space.  See the class
418  * description for the syntax of the pattern language.
419  * @param pattern a string specifying what characters are in the set
420  * @param options bitmask for options to apply to the pattern.
421  * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
422  */
applyPattern(const UnicodeString & pattern,uint32_t options,const SymbolTable * symbols,UErrorCode & status)423 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
424                                      uint32_t options,
425                                      const SymbolTable* symbols,
426                                      UErrorCode& status) {
427     if (U_FAILURE(status) || isFrozen()) {
428         return *this;
429     }
430 
431     ParsePosition pos(0);
432     applyPattern(pattern, pos, options, symbols, status);
433     if (U_FAILURE(status)) return *this;
434 
435     int32_t i = pos.getIndex();
436 
437     if (options & USET_IGNORE_SPACE) {
438         // Skip over trailing whitespace
439         ICU_Utility::skipWhitespace(pattern, i, TRUE);
440     }
441 
442     if (i != pattern.length()) {
443         status = U_ILLEGAL_ARGUMENT_ERROR;
444     }
445     return *this;
446 }
447 
applyPattern(const UnicodeString & pattern,ParsePosition & pos,uint32_t options,const SymbolTable * symbols,UErrorCode & status)448 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
449                               ParsePosition& pos,
450                               uint32_t options,
451                               const SymbolTable* symbols,
452                               UErrorCode& status) {
453     if (U_FAILURE(status) || isFrozen()) {
454         return *this;
455     }
456     // Need to build the pattern in a temporary string because
457     // _applyPattern calls add() etc., which set pat to empty.
458     UnicodeString rebuiltPat;
459     RuleCharacterIterator chars(pattern, symbols, pos);
460     applyPattern(chars, symbols, rebuiltPat, options, status);
461     if (U_FAILURE(status)) return *this;
462     if (chars.inVariable()) {
463         // syntaxError(chars, "Extra chars in variable value");
464         status = U_MALFORMED_SET;
465         return *this;
466     }
467     setPattern(rebuiltPat);
468     return *this;
469 }
470 
471 /**
472  * Return true if the given position, in the given pattern, appears
473  * to be the start of a UnicodeSet pattern.
474  */
resemblesPattern(const UnicodeString & pattern,int32_t pos)475 UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
476     return ((pos+1) < pattern.length() &&
477             pattern.charAt(pos) == (UChar)91/*[*/) ||
478         resemblesPropertyPattern(pattern, pos);
479 }
480 
481 //----------------------------------------------------------------
482 // Implementation: Pattern parsing
483 //----------------------------------------------------------------
484 
485 /**
486  * A small all-inline class to manage a UnicodeSet pointer.  Add
487  * operator->() etc. as needed.
488  */
489 class UnicodeSetPointer {
490     UnicodeSet* p;
491 public:
UnicodeSetPointer()492     inline UnicodeSetPointer() : p(0) {}
~UnicodeSetPointer()493     inline ~UnicodeSetPointer() { delete p; }
pointer()494     inline UnicodeSet* pointer() { return p; }
allocate()495     inline UBool allocate() {
496         if (p == 0) {
497             p = new UnicodeSet();
498         }
499         return p != 0;
500     }
501 };
502 
503 /**
504  * Parse the pattern from the given RuleCharacterIterator.  The
505  * iterator is advanced over the parsed pattern.
506  * @param chars iterator over the pattern characters.  Upon return
507  * it will be advanced to the first character after the parsed
508  * pattern, or the end of the iteration if all characters are
509  * parsed.
510  * @param symbols symbol table to use to parse and dereference
511  * variables, or null if none.
512  * @param rebuiltPat the pattern that was parsed, rebuilt or
513  * copied from the input pattern, as appropriate.
514  * @param options a bit mask of zero or more of the following:
515  * IGNORE_SPACE, CASE.
516  */
applyPattern(RuleCharacterIterator & chars,const SymbolTable * symbols,UnicodeString & rebuiltPat,uint32_t options,UErrorCode & ec)517 void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
518                               const SymbolTable* symbols,
519                               UnicodeString& rebuiltPat,
520                               uint32_t options,
521                               UErrorCode& ec) {
522     if (U_FAILURE(ec)) return;
523 
524     // Syntax characters: [ ] ^ - & { }
525 
526     // Recognized special forms for chars, sets: c-c s-s s&s
527 
528     int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
529                    RuleCharacterIterator::PARSE_ESCAPES;
530     if ((options & USET_IGNORE_SPACE) != 0) {
531         opts |= RuleCharacterIterator::SKIP_WHITESPACE;
532     }
533 
534     UnicodeString patLocal, buf;
535     UBool usePat = FALSE;
536     UnicodeSetPointer scratch;
537     RuleCharacterIterator::Pos backup;
538 
539     // mode: 0=before [, 1=between [...], 2=after ]
540     // lastItem: 0=none, 1=char, 2=set
541     int8_t lastItem = 0, mode = 0;
542     UChar32 lastChar = 0;
543     UChar op = 0;
544 
545     UBool invert = FALSE;
546 
547     clear();
548 
549     while (mode != 2 && !chars.atEnd()) {
550         U_ASSERT((lastItem == 0 && op == 0) ||
551                  (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) ||
552                  (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ ||
553                                     op == INTERSECTION /*'&'*/)));
554 
555         UChar32 c = 0;
556         UBool literal = FALSE;
557         UnicodeSet* nested = 0; // alias - do not delete
558 
559         // -------- Check for property pattern
560 
561         // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
562         int8_t setMode = 0;
563         if (resemblesPropertyPattern(chars, opts)) {
564             setMode = 2;
565         }
566 
567         // -------- Parse '[' of opening delimiter OR nested set.
568         // If there is a nested set, use `setMode' to define how
569         // the set should be parsed.  If the '[' is part of the
570         // opening delimiter for this pattern, parse special
571         // strings "[", "[^", "[-", and "[^-".  Check for stand-in
572         // characters representing a nested set in the symbol
573         // table.
574 
575         else {
576             // Prepare to backup if necessary
577             chars.getPos(backup);
578             c = chars.next(opts, literal, ec);
579             if (U_FAILURE(ec)) return;
580 
581             if (c == 0x5B /*'['*/ && !literal) {
582                 if (mode == 1) {
583                     chars.setPos(backup); // backup
584                     setMode = 1;
585                 } else {
586                     // Handle opening '[' delimiter
587                     mode = 1;
588                     patLocal.append((UChar) 0x5B /*'['*/);
589                     chars.getPos(backup); // prepare to backup
590                     c = chars.next(opts, literal, ec);
591                     if (U_FAILURE(ec)) return;
592                     if (c == 0x5E /*'^'*/ && !literal) {
593                         invert = TRUE;
594                         patLocal.append((UChar) 0x5E /*'^'*/);
595                         chars.getPos(backup); // prepare to backup
596                         c = chars.next(opts, literal, ec);
597                         if (U_FAILURE(ec)) return;
598                     }
599                     // Fall through to handle special leading '-';
600                     // otherwise restart loop for nested [], \p{}, etc.
601                     if (c == HYPHEN /*'-'*/) {
602                         literal = TRUE;
603                         // Fall through to handle literal '-' below
604                     } else {
605                         chars.setPos(backup); // backup
606                         continue;
607                     }
608                 }
609             } else if (symbols != 0) {
610                 const UnicodeFunctor *m = symbols->lookupMatcher(c);
611                 if (m != 0) {
612                     if (m->getDynamicClassID() != UnicodeSet::getStaticClassID()) {
613                         ec = U_MALFORMED_SET;
614                         return;
615                     }
616                     // casting away const, but `nested' won't be modified
617                     // (important not to modify stored set)
618                     nested = (UnicodeSet*) m;
619                     setMode = 3;
620                 }
621             }
622         }
623 
624         // -------- Handle a nested set.  This either is inline in
625         // the pattern or represented by a stand-in that has
626         // previously been parsed and was looked up in the symbol
627         // table.
628 
629         if (setMode != 0) {
630             if (lastItem == 1) {
631                 if (op != 0) {
632                     // syntaxError(chars, "Char expected after operator");
633                     ec = U_MALFORMED_SET;
634                     return;
635                 }
636                 add(lastChar, lastChar);
637                 _appendToPat(patLocal, lastChar, FALSE);
638                 lastItem = 0;
639                 op = 0;
640             }
641 
642             if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) {
643                 patLocal.append(op);
644             }
645 
646             if (nested == 0) {
647                 // lazy allocation
648                 if (!scratch.allocate()) {
649                     ec = U_MEMORY_ALLOCATION_ERROR;
650                     return;
651                 }
652                 nested = scratch.pointer();
653             }
654             switch (setMode) {
655             case 1:
656                 nested->applyPattern(chars, symbols, patLocal, options, ec);
657                 break;
658             case 2:
659                 chars.skipIgnored(opts);
660                 nested->applyPropertyPattern(chars, patLocal, ec);
661                 if (U_FAILURE(ec)) return;
662                 break;
663             case 3: // `nested' already parsed
664                 nested->_toPattern(patLocal, FALSE);
665                 break;
666             }
667 
668             usePat = TRUE;
669 
670             if (mode == 0) {
671                 // Entire pattern is a category; leave parse loop
672                 *this = *nested;
673                 mode = 2;
674                 break;
675             }
676 
677             switch (op) {
678             case HYPHEN: /*'-'*/
679                 removeAll(*nested);
680                 break;
681             case INTERSECTION: /*'&'*/
682                 retainAll(*nested);
683                 break;
684             case 0:
685                 addAll(*nested);
686                 break;
687             }
688 
689             op = 0;
690             lastItem = 2;
691 
692             continue;
693         }
694 
695         if (mode == 0) {
696             // syntaxError(chars, "Missing '['");
697             ec = U_MALFORMED_SET;
698             return;
699         }
700 
701         // -------- Parse special (syntax) characters.  If the
702         // current character is not special, or if it is escaped,
703         // then fall through and handle it below.
704 
705         if (!literal) {
706             switch (c) {
707             case 0x5D /*']'*/:
708                 if (lastItem == 1) {
709                     add(lastChar, lastChar);
710                     _appendToPat(patLocal, lastChar, FALSE);
711                 }
712                 // Treat final trailing '-' as a literal
713                 if (op == HYPHEN /*'-'*/) {
714                     add(op, op);
715                     patLocal.append(op);
716                 } else if (op == INTERSECTION /*'&'*/) {
717                     // syntaxError(chars, "Trailing '&'");
718                     ec = U_MALFORMED_SET;
719                     return;
720                 }
721                 patLocal.append((UChar) 0x5D /*']'*/);
722                 mode = 2;
723                 continue;
724             case HYPHEN /*'-'*/:
725                 if (op == 0) {
726                     if (lastItem != 0) {
727                         op = (UChar) c;
728                         continue;
729                     } else {
730                         // Treat final trailing '-' as a literal
731                         add(c, c);
732                         c = chars.next(opts, literal, ec);
733                         if (U_FAILURE(ec)) return;
734                         if (c == 0x5D /*']'*/ && !literal) {
735                             patLocal.append(HYPHEN_RIGHT_BRACE);
736                             mode = 2;
737                             continue;
738                         }
739                     }
740                 }
741                 // syntaxError(chars, "'-' not after char or set");
742                 ec = U_MALFORMED_SET;
743                 return;
744             case INTERSECTION /*'&'*/:
745                 if (lastItem == 2 && op == 0) {
746                     op = (UChar) c;
747                     continue;
748                 }
749                 // syntaxError(chars, "'&' not after set");
750                 ec = U_MALFORMED_SET;
751                 return;
752             case 0x5E /*'^'*/:
753                 // syntaxError(chars, "'^' not after '['");
754                 ec = U_MALFORMED_SET;
755                 return;
756             case 0x7B /*'{'*/:
757                 if (op != 0) {
758                     // syntaxError(chars, "Missing operand after operator");
759                     ec = U_MALFORMED_SET;
760                     return;
761                 }
762                 if (lastItem == 1) {
763                     add(lastChar, lastChar);
764                     _appendToPat(patLocal, lastChar, FALSE);
765                 }
766                 lastItem = 0;
767                 buf.truncate(0);
768                 {
769                     UBool ok = FALSE;
770                     while (!chars.atEnd()) {
771                         c = chars.next(opts, literal, ec);
772                         if (U_FAILURE(ec)) return;
773                         if (c == 0x7D /*'}'*/ && !literal) {
774                             ok = TRUE;
775                             break;
776                         }
777                         buf.append(c);
778                     }
779                     if (buf.length() < 1 || !ok) {
780                         // syntaxError(chars, "Invalid multicharacter string");
781                         ec = U_MALFORMED_SET;
782                         return;
783                     }
784                 }
785                 // We have new string. Add it to set and continue;
786                 // we don't need to drop through to the further
787                 // processing
788                 add(buf);
789                 patLocal.append((UChar) 0x7B /*'{'*/);
790                 _appendToPat(patLocal, buf, FALSE);
791                 patLocal.append((UChar) 0x7D /*'}'*/);
792                 continue;
793             case SymbolTable::SYMBOL_REF:
794                 //         symbols  nosymbols
795                 // [a-$]   error    error (ambiguous)
796                 // [a$]    anchor   anchor
797                 // [a-$x]  var "x"* literal '$'
798                 // [a-$.]  error    literal '$'
799                 // *We won't get here in the case of var "x"
800                 {
801                     chars.getPos(backup);
802                     c = chars.next(opts, literal, ec);
803                     if (U_FAILURE(ec)) return;
804                     UBool anchor = (c == 0x5D /*']'*/ && !literal);
805                     if (symbols == 0 && !anchor) {
806                         c = SymbolTable::SYMBOL_REF;
807                         chars.setPos(backup);
808                         break; // literal '$'
809                     }
810                     if (anchor && op == 0) {
811                         if (lastItem == 1) {
812                             add(lastChar, lastChar);
813                             _appendToPat(patLocal, lastChar, FALSE);
814                         }
815                         add(U_ETHER);
816                         usePat = TRUE;
817                         patLocal.append((UChar) SymbolTable::SYMBOL_REF);
818                         patLocal.append((UChar) 0x5D /*']'*/);
819                         mode = 2;
820                         continue;
821                     }
822                     // syntaxError(chars, "Unquoted '$'");
823                     ec = U_MALFORMED_SET;
824                     return;
825                 }
826             default:
827                 break;
828             }
829         }
830 
831         // -------- Parse literal characters.  This includes both
832         // escaped chars ("\u4E01") and non-syntax characters
833         // ("a").
834 
835         switch (lastItem) {
836         case 0:
837             lastItem = 1;
838             lastChar = c;
839             break;
840         case 1:
841             if (op == HYPHEN /*'-'*/) {
842                 if (lastChar >= c) {
843                     // Don't allow redundant (a-a) or empty (b-a) ranges;
844                     // these are most likely typos.
845                     // syntaxError(chars, "Invalid range");
846                     ec = U_MALFORMED_SET;
847                     return;
848                 }
849                 add(lastChar, c);
850                 _appendToPat(patLocal, lastChar, FALSE);
851                 patLocal.append(op);
852                 _appendToPat(patLocal, c, FALSE);
853                 lastItem = 0;
854                 op = 0;
855             } else {
856                 add(lastChar, lastChar);
857                 _appendToPat(patLocal, lastChar, FALSE);
858                 lastChar = c;
859             }
860             break;
861         case 2:
862             if (op != 0) {
863                 // syntaxError(chars, "Set expected after operator");
864                 ec = U_MALFORMED_SET;
865                 return;
866             }
867             lastChar = c;
868             lastItem = 1;
869             break;
870         }
871     }
872 
873     if (mode != 2) {
874         // syntaxError(chars, "Missing ']'");
875         ec = U_MALFORMED_SET;
876         return;
877     }
878 
879     chars.skipIgnored(opts);
880 
881     /**
882      * Handle global flags (invert, case insensitivity).  If this
883      * pattern should be compiled case-insensitive, then we need
884      * to close over case BEFORE COMPLEMENTING.  This makes
885      * patterns like /[^abc]/i work.
886      */
887     if ((options & USET_CASE_INSENSITIVE) != 0) {
888         closeOver(USET_CASE_INSENSITIVE);
889     }
890     else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
891         closeOver(USET_ADD_CASE_MAPPINGS);
892     }
893     if (invert) {
894         complement();
895     }
896 
897     // Use the rebuilt pattern (patLocal) only if necessary.  Prefer the
898     // generated pattern.
899     if (usePat) {
900         rebuiltPat.append(patLocal);
901     } else {
902         _generatePattern(rebuiltPat, FALSE);
903     }
904     if (isBogus() && U_SUCCESS(ec)) {
905         // We likely ran out of memory. AHHH!
906         ec = U_MEMORY_ALLOCATION_ERROR;
907     }
908 }
909 
910 //----------------------------------------------------------------
911 // Property set implementation
912 //----------------------------------------------------------------
913 
numericValueFilter(UChar32 ch,void * context)914 static UBool numericValueFilter(UChar32 ch, void* context) {
915     return u_getNumericValue(ch) == *(double*)context;
916 }
917 
generalCategoryMaskFilter(UChar32 ch,void * context)918 static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
919     int32_t value = *(int32_t*)context;
920     return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
921 }
922 
versionFilter(UChar32 ch,void * context)923 static UBool versionFilter(UChar32 ch, void* context) {
924     UVersionInfo v, none = { 0, 0, 0, 0};
925     UVersionInfo* version = (UVersionInfo*)context;
926     u_charAge(ch, v);
927     return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
928 }
929 
930 typedef struct {
931     UProperty prop;
932     int32_t value;
933 } IntPropertyContext;
934 
intPropertyFilter(UChar32 ch,void * context)935 static UBool intPropertyFilter(UChar32 ch, void* context) {
936     IntPropertyContext* c = (IntPropertyContext*)context;
937     return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
938 }
939 
940 
941 /**
942  * Generic filter-based scanning code for UCD property UnicodeSets.
943  */
applyFilter(UnicodeSet::Filter filter,void * context,int32_t src,UErrorCode & status)944 void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
945                              void* context,
946                              int32_t src,
947                              UErrorCode &status) {
948     // Walk through all Unicode characters, noting the start
949     // and end of each range for which filter.contain(c) is
950     // true.  Add each range to a set.
951     //
952     // To improve performance, use the INCLUSIONS set, which
953     // encodes information about character ranges that are known
954     // to have identical properties. INCLUSIONS contains
955     // only the first characters of such ranges.
956     //
957     // TODO Where possible, instead of scanning over code points,
958     // use internal property data to initialize UnicodeSets for
959     // those properties.  Scanning code points is slow.
960     if (U_FAILURE(status)) return;
961 
962     const UnicodeSet* inclusions = getInclusions(src, status);
963     if (U_FAILURE(status)) {
964         return;
965     }
966 
967     clear();
968 
969     UChar32 startHasProperty = -1;
970     int32_t limitRange = inclusions->getRangeCount();
971 
972     for (int j=0; j<limitRange; ++j) {
973         // get current range
974         UChar32 start = inclusions->getRangeStart(j);
975         UChar32 end = inclusions->getRangeEnd(j);
976 
977         // for all the code points in the range, process
978         for (UChar32 ch = start; ch <= end; ++ch) {
979             // only add to this UnicodeSet on inflection points --
980             // where the hasProperty value changes to false
981             if ((*filter)(ch, context)) {
982                 if (startHasProperty < 0) {
983                     startHasProperty = ch;
984                 }
985             } else if (startHasProperty >= 0) {
986                 add(startHasProperty, ch-1);
987                 startHasProperty = -1;
988             }
989         }
990     }
991     if (startHasProperty >= 0) {
992         add((UChar32)startHasProperty, (UChar32)0x10FFFF);
993     }
994     if (isBogus() && U_SUCCESS(status)) {
995         // We likely ran out of memory. AHHH!
996         status = U_MEMORY_ALLOCATION_ERROR;
997     }
998 }
999 
mungeCharName(char * dst,const char * src,int32_t dstCapacity)1000 static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
1001     /* Note: we use ' ' in compiler code page */
1002     int32_t j = 0;
1003     char ch;
1004     --dstCapacity; /* make room for term. zero */
1005     while ((ch = *src++) != 0) {
1006         if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
1007             continue;
1008         }
1009         if (j >= dstCapacity) return FALSE;
1010         dst[j++] = ch;
1011     }
1012     if (j > 0 && dst[j-1] == ' ') --j;
1013     dst[j] = 0;
1014     return TRUE;
1015 }
1016 
1017 //----------------------------------------------------------------
1018 // Property set API
1019 //----------------------------------------------------------------
1020 
// Sets ec to U_ILLEGAL_ARGUMENT_ERROR and returns *this from the enclosing
// UnicodeSet member function.  Wrapped in do { } while (0) so that
// "FAIL(ec);" is exactly one statement and is safe inside if/else.
#define FAIL(ec) do { (ec) = U_ILLEGAL_ARGUMENT_ERROR; return *this; } while (0)
1022 
1023 UnicodeSet&
applyIntPropertyValue(UProperty prop,int32_t value,UErrorCode & ec)1024 UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
1025     if (U_FAILURE(ec) || isFrozen()) return *this;
1026 
1027     if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
1028         applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec);
1029     } else {
1030         IntPropertyContext c = {prop, value};
1031         applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec);
1032     }
1033     return *this;
1034 }
1035 
1036 UnicodeSet&
applyPropertyAlias(const UnicodeString & prop,const UnicodeString & value,UErrorCode & ec)1037 UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
1038                                const UnicodeString& value,
1039                                UErrorCode& ec) {
1040     if (U_FAILURE(ec) || isFrozen()) return *this;
1041 
1042     // prop and value used to be converted to char * using the default
1043     // converter instead of the invariant conversion.
1044     // This should not be necessary because all Unicode property and value
1045     // names use only invariant characters.
1046     // If there are any variant characters, then we won't find them anyway.
1047     // Checking first avoids assertion failures in the conversion.
1048     if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
1049         !uprv_isInvariantUString(value.getBuffer(), value.length())
1050     ) {
1051         FAIL(ec);
1052     }
1053     CharString pname(prop);
1054     CharString vname(value);
1055 
1056     UProperty p;
1057     int32_t v;
1058     UBool mustNotBeEmpty = FALSE, invert = FALSE;
1059 
1060     if (value.length() > 0) {
1061         p = u_getPropertyEnum(pname);
1062         if (p == UCHAR_INVALID_CODE) FAIL(ec);
1063 
1064         // Treat gc as gcm
1065         if (p == UCHAR_GENERAL_CATEGORY) {
1066             p = UCHAR_GENERAL_CATEGORY_MASK;
1067         }
1068 
1069         if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
1070             (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
1071             (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
1072             v = u_getPropertyValueEnum(p, vname);
1073             if (v == UCHAR_INVALID_CODE) {
1074                 // Handle numeric CCC
1075                 if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
1076                     p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
1077                     p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
1078                     char* end;
1079                     double value = uprv_strtod(vname, &end);
1080                     v = (int32_t) value;
1081                     if (v != value || v < 0 || *end != 0) {
1082                         // non-integral or negative value, or trailing junk
1083                         FAIL(ec);
1084                     }
1085                     // If the resultant set is empty then the numeric value
1086                     // was invalid.
1087                     mustNotBeEmpty = TRUE;
1088                 } else {
1089                     FAIL(ec);
1090                 }
1091             }
1092         }
1093 
1094         else {
1095 
1096             switch (p) {
1097             case UCHAR_NUMERIC_VALUE:
1098                 {
1099                     char* end;
1100                     double value = uprv_strtod(vname, &end);
1101                     if (*end != 0) {
1102                         FAIL(ec);
1103                     }
1104                     applyFilter(numericValueFilter, &value, UPROPS_SRC_CHAR, ec);
1105                     return *this;
1106                 }
1107                 break;
1108             case UCHAR_NAME:
1109             case UCHAR_UNICODE_1_NAME:
1110                 {
1111                     // Must munge name, since u_charFromName() does not do
1112                     // 'loose' matching.
1113                     char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
1114                     if (!mungeCharName(buf, vname, sizeof(buf))) FAIL(ec);
1115                     UCharNameChoice choice = (p == UCHAR_NAME) ?
1116                         U_EXTENDED_CHAR_NAME : U_UNICODE_10_CHAR_NAME;
1117                     UChar32 ch = u_charFromName(choice, buf, &ec);
1118                     if (U_SUCCESS(ec)) {
1119                         clear();
1120                         add(ch);
1121                         return *this;
1122                     } else {
1123                         FAIL(ec);
1124                     }
1125                 }
1126                 break;
1127             case UCHAR_AGE:
1128                 {
1129                     // Must munge name, since u_versionFromString() does not do
1130                     // 'loose' matching.
1131                     char buf[128];
1132                     if (!mungeCharName(buf, vname, sizeof(buf))) FAIL(ec);
1133                     UVersionInfo version;
1134                     u_versionFromString(version, buf);
1135                     applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec);
1136                     return *this;
1137                 }
1138                 break;
1139             default:
1140                 // p is a non-binary, non-enumerated property that we
1141                 // don't support (yet).
1142                 FAIL(ec);
1143             }
1144         }
1145     }
1146 
1147     else {
1148         // value is empty.  Interpret as General Category, Script, or
1149         // Binary property.
1150         p = UCHAR_GENERAL_CATEGORY_MASK;
1151         v = u_getPropertyValueEnum(p, pname);
1152         if (v == UCHAR_INVALID_CODE) {
1153             p = UCHAR_SCRIPT;
1154             v = u_getPropertyValueEnum(p, pname);
1155             if (v == UCHAR_INVALID_CODE) {
1156                 p = u_getPropertyEnum(pname);
1157                 if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
1158                     v = 1;
1159                 } else if (0 == uprv_comparePropertyNames(ANY, pname)) {
1160                     set(MIN_VALUE, MAX_VALUE);
1161                     return *this;
1162                 } else if (0 == uprv_comparePropertyNames(ASCII, pname)) {
1163                     set(0, 0x7F);
1164                     return *this;
1165                 } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname)) {
1166                     // [:Assigned:]=[:^Cn:]
1167                     p = UCHAR_GENERAL_CATEGORY_MASK;
1168                     v = U_GC_CN_MASK;
1169                     invert = TRUE;
1170                 } else {
1171                     FAIL(ec);
1172                 }
1173             }
1174         }
1175     }
1176 
1177     applyIntPropertyValue(p, v, ec);
1178     if(invert) {
1179         complement();
1180     }
1181 
1182     if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) {
1183         // mustNotBeEmpty is set to true if an empty set indicates
1184         // invalid input.
1185         ec = U_ILLEGAL_ARGUMENT_ERROR;
1186     }
1187 
1188     if (isBogus() && U_SUCCESS(ec)) {
1189         // We likely ran out of memory. AHHH!
1190         ec = U_MEMORY_ALLOCATION_ERROR;
1191     }
1192     return *this;
1193 }
1194 
1195 //----------------------------------------------------------------
1196 // Property set patterns
1197 //----------------------------------------------------------------
1198 
1199 /**
1200  * Return true if the given position, in the given pattern, appears
1201  * to be the start of a property set pattern.
1202  */
resemblesPropertyPattern(const UnicodeString & pattern,int32_t pos)1203 UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
1204                                            int32_t pos) {
1205     // Patterns are at least 5 characters long
1206     if ((pos+5) > pattern.length()) {
1207         return FALSE;
1208     }
1209 
1210     // Look for an opening [:, [:^, \p, or \P
1211     return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
1212 }
1213 
1214 /**
1215  * Return true if the given iterator appears to point at a
1216  * property pattern.  Regardless of the result, return with the
1217  * iterator unchanged.
1218  * @param chars iterator over the pattern characters.  Upon return
1219  * it will be unchanged.
1220  * @param iterOpts RuleCharacterIterator options
1221  */
resemblesPropertyPattern(RuleCharacterIterator & chars,int32_t iterOpts)1222 UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
1223                                            int32_t iterOpts) {
1224     // NOTE: literal will always be FALSE, because we don't parse escapes.
1225     UBool result = FALSE, literal;
1226     UErrorCode ec = U_ZERO_ERROR;
1227     iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
1228     RuleCharacterIterator::Pos pos;
1229     chars.getPos(pos);
1230     UChar32 c = chars.next(iterOpts, literal, ec);
1231     if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) {
1232         UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
1233                                literal, ec);
1234         result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) :
1235                  (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/);
1236     }
1237     chars.setPos(pos);
1238     return result && U_SUCCESS(ec);
1239 }
1240 
1241 /**
1242  * Parse the given property pattern at the given parse position.
1243  */
applyPropertyPattern(const UnicodeString & pattern,ParsePosition & ppos,UErrorCode & ec)1244 UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
1245                                              ParsePosition& ppos,
1246                                              UErrorCode &ec) {
1247     int32_t pos = ppos.getIndex();
1248 
1249     UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
1250     UBool isName = FALSE; // true for \N{pat}, o/w false
1251     UBool invert = FALSE;
1252 
1253     if (U_FAILURE(ec)) return *this;
1254 
1255     // Minimum length is 5 characters, e.g. \p{L}
1256     if ((pos+5) > pattern.length()) {
1257         FAIL(ec);
1258     }
1259 
1260     // On entry, ppos should point to one of the following locations:
1261     // Look for an opening [:, [:^, \p, or \P
1262     if (isPOSIXOpen(pattern, pos)) {
1263         posix = TRUE;
1264         pos += 2;
1265         pos = ICU_Utility::skipWhitespace(pattern, pos);
1266         if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) {
1267             ++pos;
1268             invert = TRUE;
1269         }
1270     } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {
1271         UChar c = pattern.charAt(pos+1);
1272         invert = (c == UPPER_P);
1273         isName = (c == UPPER_N);
1274         pos += 2;
1275         pos = ICU_Utility::skipWhitespace(pattern, pos);
1276         if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) {
1277             // Syntax error; "\p" or "\P" not followed by "{"
1278             FAIL(ec);
1279         }
1280     } else {
1281         // Open delimiter not seen
1282         FAIL(ec);
1283     }
1284 
1285     // Look for the matching close delimiter, either :] or }
1286     int32_t close = pattern.indexOf(posix ? POSIX_CLOSE : PERL_CLOSE, pos);
1287     if (close < 0) {
1288         // Syntax error; close delimiter missing
1289         FAIL(ec);
1290     }
1291 
1292     // Look for an '=' sign.  If this is present, we will parse a
1293     // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
1294     // pattern.
1295     int32_t equals = pattern.indexOf(EQUALS, pos);
1296     UnicodeString propName, valueName;
1297     if (equals >= 0 && equals < close && !isName) {
1298         // Equals seen; parse medium/long pattern
1299         pattern.extractBetween(pos, equals, propName);
1300         pattern.extractBetween(equals+1, close, valueName);
1301     }
1302 
1303     else {
1304         // Handle case where no '=' is seen, and \N{}
1305         pattern.extractBetween(pos, close, propName);
1306 
1307         // Handle \N{name}
1308         if (isName) {
1309             // This is a little inefficient since it means we have to
1310             // parse NAME_PROP back to UCHAR_NAME even though we already
1311             // know it's UCHAR_NAME.  If we refactor the API to
1312             // support args of (UProperty, char*) then we can remove
1313             // NAME_PROP and make this a little more efficient.
1314             valueName = propName;
1315             propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);
1316         }
1317     }
1318 
1319     applyPropertyAlias(propName, valueName, ec);
1320 
1321     if (U_SUCCESS(ec)) {
1322         if (invert) {
1323             complement();
1324         }
1325 
1326         // Move to the limit position after the close delimiter if the
1327         // parse succeeded.
1328         ppos.setIndex(close + (posix ? 2 : 1));
1329     }
1330 
1331     return *this;
1332 }
1333 
1334 /**
1335  * Parse a property pattern.
1336  * @param chars iterator over the pattern characters.  Upon return
1337  * it will be advanced to the first character after the parsed
1338  * pattern, or the end of the iteration if all characters are
1339  * parsed.
1340  * @param rebuiltPat the pattern that was parsed, rebuilt or
1341  * copied from the input pattern, as appropriate.
1342  */
applyPropertyPattern(RuleCharacterIterator & chars,UnicodeString & rebuiltPat,UErrorCode & ec)1343 void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
1344                                       UnicodeString& rebuiltPat,
1345                                       UErrorCode& ec) {
1346     if (U_FAILURE(ec)) return;
1347     UnicodeString pattern;
1348     chars.lookahead(pattern);
1349     ParsePosition pos(0);
1350     applyPropertyPattern(pattern, pos, ec);
1351     if (U_FAILURE(ec)) return;
1352     if (pos.getIndex() == 0) {
1353         // syntaxError(chars, "Invalid property pattern");
1354         ec = U_MALFORMED_SET;
1355         return;
1356     }
1357     chars.jumpahead(pos.getIndex());
1358     rebuiltPat.append(pattern, 0, pos.getIndex());
1359 }
1360 
1361 //----------------------------------------------------------------
1362 // Case folding API
1363 //----------------------------------------------------------------
1364 
1365 // add the result of a full case mapping to the set
1366 // use str as a temporary string to avoid constructing one
1367 static inline void
addCaseMapping(UnicodeSet & set,int32_t result,const UChar * full,UnicodeString & str)1368 addCaseMapping(UnicodeSet &set, int32_t result, const UChar *full, UnicodeString &str) {
1369     if(result >= 0) {
1370         if(result > UCASE_MAX_STRING_LENGTH) {
1371             // add a single-code point case mapping
1372             set.add(result);
1373         } else {
1374             // add a string case mapping from full with length result
1375             str.setTo((UBool)FALSE, full, result);
1376             set.add(str);
1377         }
1378     }
1379     // result < 0: the code point mapped to itself, no need to add it
1380     // see ucase.h
1381 }
1382 
closeOver(int32_t attribute)1383 UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
1384     if (isFrozen() || isBogus()) {
1385         return *this;
1386     }
1387     if (attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) {
1388         UErrorCode status = U_ZERO_ERROR;
1389         const UCaseProps *csp = ucase_getSingleton(&status);
1390         if (U_SUCCESS(status)) {
1391             UnicodeSet foldSet(*this);
1392             UnicodeString str;
1393             USetAdder sa = {
1394                 (USet *)&foldSet,
1395                 _set_add,
1396                 _set_addRange,
1397                 _set_addString,
1398                 NULL, // don't need remove()
1399                 NULL // don't need removeRange()
1400             };
1401 
1402             // start with input set to guarantee inclusion
1403             // USET_CASE: remove strings because the strings will actually be reduced (folded);
1404             //            therefore, start with no strings and add only those needed
1405             if (attribute & USET_CASE_INSENSITIVE) {
1406                 foldSet.strings->removeAllElements();
1407             }
1408 
1409             int32_t n = getRangeCount();
1410             UChar32 result;
1411             const UChar *full;
1412             int32_t locCache = 0;
1413 
1414             for (int32_t i=0; i<n; ++i) {
1415                 UChar32 start = getRangeStart(i);
1416                 UChar32 end   = getRangeEnd(i);
1417 
1418                 if (attribute & USET_CASE_INSENSITIVE) {
1419                     // full case closure
1420                     for (UChar32 cp=start; cp<=end; ++cp) {
1421                         ucase_addCaseClosure(csp, cp, &sa);
1422                     }
1423                 } else {
1424                     // add case mappings
1425                     // (does not add long s for regular s, or Kelvin for k, for example)
1426                     for (UChar32 cp=start; cp<=end; ++cp) {
1427                         result = ucase_toFullLower(csp, cp, NULL, NULL, &full, "", &locCache);
1428                         addCaseMapping(foldSet, result, full, str);
1429 
1430                         result = ucase_toFullTitle(csp, cp, NULL, NULL, &full, "", &locCache);
1431                         addCaseMapping(foldSet, result, full, str);
1432 
1433                         result = ucase_toFullUpper(csp, cp, NULL, NULL, &full, "", &locCache);
1434                         addCaseMapping(foldSet, result, full, str);
1435 
1436                         result = ucase_toFullFolding(csp, cp, &full, 0);
1437                         addCaseMapping(foldSet, result, full, str);
1438                     }
1439                 }
1440             }
1441             if (strings != NULL && strings->size() > 0) {
1442                 if (attribute & USET_CASE_INSENSITIVE) {
1443                     for (int32_t j=0; j<strings->size(); ++j) {
1444                         str = *(const UnicodeString *) strings->elementAt(j);
1445                         str.foldCase();
1446                         if(!ucase_addStringCaseClosure(csp, str.getBuffer(), str.length(), &sa)) {
1447                             foldSet.add(str); // does not map to code points: add the folded string itself
1448                         }
1449                     }
1450                 } else {
1451                     Locale root("");
1452 #if !UCONFIG_NO_BREAK_ITERATION
1453                     BreakIterator *bi = BreakIterator::createWordInstance(root, status);
1454 #endif
1455                     if (U_SUCCESS(status)) {
1456                         const UnicodeString *pStr;
1457 
1458                         for (int32_t j=0; j<strings->size(); ++j) {
1459                             pStr = (const UnicodeString *) strings->elementAt(j);
1460                             (str = *pStr).toLower(root);
1461                             foldSet.add(str);
1462 #if !UCONFIG_NO_BREAK_ITERATION
1463                             (str = *pStr).toTitle(bi, root);
1464                             foldSet.add(str);
1465 #endif
1466                             (str = *pStr).toUpper(root);
1467                             foldSet.add(str);
1468                             (str = *pStr).foldCase();
1469                             foldSet.add(str);
1470                         }
1471                     }
1472 #if !UCONFIG_NO_BREAK_ITERATION
1473                     delete bi;
1474 #endif
1475                 }
1476             }
1477             *this = foldSet;
1478         }
1479     }
1480     return *this;
1481 }
1482 
1483 U_NAMESPACE_END
1484