1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 1999-2010, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: uniset_props.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2004aug25
14 * created by: Markus W. Scherer
15 *
16 * Character property dependent functions moved here from uniset.cpp
17 */
18
19 #include "unicode/utypes.h"
20 #include "unicode/uniset.h"
21 #include "unicode/parsepos.h"
22 #include "unicode/uchar.h"
23 #include "unicode/uscript.h"
24 #include "unicode/symtable.h"
25 #include "unicode/uset.h"
26 #include "unicode/locid.h"
27 #include "unicode/brkiter.h"
28 #include "uset_imp.h"
29 #include "ruleiter.h"
30 #include "cmemory.h"
31 #include "ucln_cmn.h"
32 #include "util.h"
33 #include "uvector.h"
34 #include "uprops.h"
35 #include "propname.h"
36 #include "normalizer2impl.h"
37 #include "unormimp.h"
38 #include "ucase.h"
39 #include "ubidi_props.h"
40 #include "uinvchar.h"
41 #include "uprops.h"
42 #include "charstr.h"
43 #include "cstring.h"
44 #include "mutex.h"
45 #include "umutex.h"
46 #include "uassert.h"
47 #include "hash.h"
48
49 U_NAMESPACE_USE
50
51 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
52
53 // initial storage. Must be >= 0
54 // *** same as in uniset.cpp ! ***
55 #define START_EXTRA 16
56
// UChar constants for the pattern syntax characters.
// All values are written as hexadecimal code points for EBCDIC compatibility.
// #define is used (rather than static constants) to reduce private static
// exports and memory access time.
#define SET_OPEN        ((UChar)0x005B) /*[*/
#define SET_CLOSE       ((UChar)0x005D) /*]*/
#define HYPHEN          ((UChar)0x002D) /*-*/
#define COMPLEMENT      ((UChar)0x005E) /*^*/
#define COLON           ((UChar)0x003A) /*:*/
#define BACKSLASH       ((UChar)0x005C) /*\*/
#define INTERSECTION    ((UChar)0x0026) /*&*/
#define UPPER_U         ((UChar)0x0055) /*U*/
#define LOWER_U         ((UChar)0x0075) /*u*/
#define OPEN_BRACE      ((UChar)0x007B) /*{*/
#define CLOSE_BRACE     ((UChar)0x007D) /*}*/
#define UPPER_P         ((UChar)0x0050) /*P*/
#define LOWER_P         ((UChar)0x0070) /*p*/
#define UPPER_N         ((UChar)0x004E) /*N*/
#define EQUALS          ((UChar)0x003D) /*=*/
74
75 //static const UChar POSIX_OPEN[] = { SET_OPEN,COLON,0 }; // "[:"
76 static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 }; // ":]"
77 //static const UChar PERL_OPEN[] = { BACKSLASH,LOWER_P,0 }; // "\\p"
78 static const UChar PERL_CLOSE[] = { CLOSE_BRACE,0 }; // "}"
79 //static const UChar NAME_OPEN[] = { BACKSLASH,UPPER_N,0 }; // "\\N"
80 static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/
81
82 // Special property set IDs
83 static const char ANY[] = "ANY"; // [\u0000-\U0010FFFF]
84 static const char ASCII[] = "ASCII"; // [\u0000-\u007F]
85 static const char ASSIGNED[] = "Assigned"; // [:^Cn:]
86
87 // Unicode name property alias
88 #define NAME_PROP "na"
89 #define NAME_PROP_LENGTH 2
90
91 /**
92 * Delimiter string used in patterns to close a category reference:
93 * ":]". Example: "[:Lu:]".
94 */
95 //static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */
96
97 // Cached sets ------------------------------------------------------------- ***
98
99 U_CDECL_BEGIN
100 static UBool U_CALLCONV uset_cleanup();
101 U_CDECL_END
102
103 // Not a TriStateSingletonWrapper because we think the UnicodeSet constructor
104 // can only fail with an out-of-memory error
105 // if we have a correct pattern and the properties data is hardcoded and always available.
106 class UnicodeSetSingleton : public SimpleSingletonWrapper<UnicodeSet> {
107 public:
UnicodeSetSingleton(SimpleSingleton & s,const char * pattern)108 UnicodeSetSingleton(SimpleSingleton &s, const char *pattern) :
109 SimpleSingletonWrapper<UnicodeSet>(s), fPattern(pattern) {}
getInstance(UErrorCode & errorCode)110 UnicodeSet *getInstance(UErrorCode &errorCode) {
111 return SimpleSingletonWrapper<UnicodeSet>::getInstance(createInstance, fPattern, errorCode);
112 }
113 private:
createInstance(const void * context,UErrorCode & errorCode)114 static void *createInstance(const void *context, UErrorCode &errorCode) {
115 UnicodeString pattern((const char *)context, -1, US_INV);
116 UnicodeSet *set=new UnicodeSet(pattern, errorCode);
117 if(set==NULL) {
118 errorCode=U_MEMORY_ALLOCATION_ERROR;
119 }
120 set->freeze();
121 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
122 return set;
123 }
124
125 const char *fPattern;
126 };
127
128 U_CDECL_BEGIN
129
130 static UnicodeSet *INCLUSIONS[UPROPS_SRC_COUNT] = { NULL }; // cached getInclusions()
131
132 STATIC_SIMPLE_SINGLETON(uni32Singleton);
133
134 //----------------------------------------------------------------
135 // Inclusions list
136 //----------------------------------------------------------------
137
138 // USetAdder implementation
139 // Does not use uset.h to reduce code dependencies
140 static void U_CALLCONV
_set_add(USet * set,UChar32 c)141 _set_add(USet *set, UChar32 c) {
142 ((UnicodeSet *)set)->add(c);
143 }
144
145 static void U_CALLCONV
_set_addRange(USet * set,UChar32 start,UChar32 end)146 _set_addRange(USet *set, UChar32 start, UChar32 end) {
147 ((UnicodeSet *)set)->add(start, end);
148 }
149
150 static void U_CALLCONV
_set_addString(USet * set,const UChar * str,int32_t length)151 _set_addString(USet *set, const UChar *str, int32_t length) {
152 ((UnicodeSet *)set)->add(UnicodeString((UBool)(length<0), str, length));
153 }
154
155 /**
156 * Cleanup function for UnicodeSet
157 */
uset_cleanup(void)158 static UBool U_CALLCONV uset_cleanup(void) {
159 int32_t i;
160
161 for(i = UPROPS_SRC_NONE; i < UPROPS_SRC_COUNT; ++i) {
162 if (INCLUSIONS[i] != NULL) {
163 delete INCLUSIONS[i];
164 INCLUSIONS[i] = NULL;
165 }
166 }
167 UnicodeSetSingleton(uni32Singleton, NULL).deleteInstance();
168 return TRUE;
169 }
170
171 U_CDECL_END
172
173 U_NAMESPACE_BEGIN
174
175 /*
176 Reduce excessive reallocation, and make it easier to detect initialization
177 problems.
178 Usually you don't see smaller sets than this for Unicode 5.0.
179 */
180 #define DEFAULT_INCLUSION_CAPACITY 3072
181
getInclusions(int32_t src,UErrorCode & status)182 const UnicodeSet* UnicodeSet::getInclusions(int32_t src, UErrorCode &status) {
183 UBool needInit;
184 UMTX_CHECK(NULL, (INCLUSIONS[src] == NULL), needInit);
185 if (needInit) {
186 UnicodeSet* incl = new UnicodeSet();
187 USetAdder sa = {
188 (USet *)incl,
189 _set_add,
190 _set_addRange,
191 _set_addString,
192 NULL, // don't need remove()
193 NULL // don't need removeRange()
194 };
195 incl->ensureCapacity(DEFAULT_INCLUSION_CAPACITY, status);
196 if (incl != NULL) {
197 switch(src) {
198 case UPROPS_SRC_CHAR:
199 uchar_addPropertyStarts(&sa, &status);
200 break;
201 case UPROPS_SRC_PROPSVEC:
202 upropsvec_addPropertyStarts(&sa, &status);
203 break;
204 case UPROPS_SRC_CHAR_AND_PROPSVEC:
205 uchar_addPropertyStarts(&sa, &status);
206 upropsvec_addPropertyStarts(&sa, &status);
207 break;
208 #if !UCONFIG_NO_NORMALIZATION
209 case UPROPS_SRC_NORM:
210 unorm_addPropertyStarts(&sa, &status);
211 break;
212 case UPROPS_SRC_CASE_AND_NORM:
213 ucase_addPropertyStarts(ucase_getSingleton(&status), &sa, &status);
214 unorm_addPropertyStarts(&sa, &status);
215 break;
216 case UPROPS_SRC_NFC: {
217 const Normalizer2Impl *impl=Normalizer2Factory::getNFCImpl(status);
218 if(U_SUCCESS(status)) {
219 impl->addPropertyStarts(&sa, status);
220 }
221 break;
222 }
223 case UPROPS_SRC_NFKC: {
224 const Normalizer2Impl *impl=Normalizer2Factory::getNFKCImpl(status);
225 if(U_SUCCESS(status)) {
226 impl->addPropertyStarts(&sa, status);
227 }
228 break;
229 }
230 case UPROPS_SRC_NFKC_CF: {
231 const Normalizer2Impl *impl=Normalizer2Factory::getNFKC_CFImpl(status);
232 if(U_SUCCESS(status)) {
233 impl->addPropertyStarts(&sa, status);
234 }
235 break;
236 }
237 #endif
238 case UPROPS_SRC_CASE:
239 ucase_addPropertyStarts(ucase_getSingleton(&status), &sa, &status);
240 break;
241 case UPROPS_SRC_BIDI:
242 ubidi_addPropertyStarts(ubidi_getSingleton(&status), &sa, &status);
243 break;
244 default:
245 status = U_INTERNAL_PROGRAM_ERROR;
246 break;
247 }
248 if (U_SUCCESS(status)) {
249 // Compact for caching
250 incl->compact();
251 umtx_lock(NULL);
252 if (INCLUSIONS[src] == NULL) {
253 INCLUSIONS[src] = incl;
254 incl = NULL;
255 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
256 }
257 umtx_unlock(NULL);
258 }
259 delete incl;
260 } else {
261 status = U_MEMORY_ALLOCATION_ERROR;
262 }
263 }
264 return INCLUSIONS[src];
265 }
266
267 // Cache some sets for other services -------------------------------------- ***
268
269 U_CFUNC UnicodeSet *
uniset_getUnicode32Instance(UErrorCode & errorCode)270 uniset_getUnicode32Instance(UErrorCode &errorCode) {
271 return UnicodeSetSingleton(uni32Singleton, "[:age=3.2:]").getInstance(errorCode);
272 }
273
274 // helper functions for matching of pattern syntax pieces ------------------ ***
275 // these functions are parallel to the PERL_OPEN etc. strings above
276
277 // using these functions is not only faster than UnicodeString::compare() and
278 // caseCompare(), but they also make UnicodeSet work for simple patterns when
279 // no Unicode properties data is available - when caseCompare() fails
280
281 static inline UBool
isPerlOpen(const UnicodeString & pattern,int32_t pos)282 isPerlOpen(const UnicodeString &pattern, int32_t pos) {
283 UChar c;
284 return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P);
285 }
286
287 /*static inline UBool
288 isPerlClose(const UnicodeString &pattern, int32_t pos) {
289 return pattern.charAt(pos)==CLOSE_BRACE;
290 }*/
291
292 static inline UBool
isNameOpen(const UnicodeString & pattern,int32_t pos)293 isNameOpen(const UnicodeString &pattern, int32_t pos) {
294 return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N;
295 }
296
297 static inline UBool
isPOSIXOpen(const UnicodeString & pattern,int32_t pos)298 isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
299 return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON;
300 }
301
302 /*static inline UBool
303 isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
304 return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE;
305 }*/
306
307 // TODO memory debugging provided inside uniset.cpp
308 // could be made available here but probably obsolete with use of modern
309 // memory leak checker tools
310 #define _dbgct(me)
311
312 //----------------------------------------------------------------
313 // Constructors &c
314 //----------------------------------------------------------------
315
316 /**
317 * Constructs a set from the given pattern, optionally ignoring
318 * white space. See the class description for the syntax of the
319 * pattern language.
320 * @param pattern a string specifying what characters are in the set
321 */
UnicodeSet(const UnicodeString & pattern,UErrorCode & status)322 UnicodeSet::UnicodeSet(const UnicodeString& pattern,
323 UErrorCode& status) :
324 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
325 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
326 fFlags(0)
327 {
328 if(U_SUCCESS(status)){
329 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
330 /* test for NULL */
331 if(list == NULL) {
332 status = U_MEMORY_ALLOCATION_ERROR;
333 }else{
334 allocateStrings(status);
335 applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
336 }
337 }
338 _dbgct(this);
339 }
340
341 /**
342 * Constructs a set from the given pattern, optionally ignoring
343 * white space. See the class description for the syntax of the
344 * pattern language.
345 * @param pattern a string specifying what characters are in the set
346 * @param options bitmask for options to apply to the pattern.
347 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
348 */
UnicodeSet(const UnicodeString & pattern,uint32_t options,const SymbolTable * symbols,UErrorCode & status)349 UnicodeSet::UnicodeSet(const UnicodeString& pattern,
350 uint32_t options,
351 const SymbolTable* symbols,
352 UErrorCode& status) :
353 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
354 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
355 fFlags(0)
356 {
357 if(U_SUCCESS(status)){
358 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
359 /* test for NULL */
360 if(list == NULL) {
361 status = U_MEMORY_ALLOCATION_ERROR;
362 }else{
363 allocateStrings(status);
364 applyPattern(pattern, options, symbols, status);
365 }
366 }
367 _dbgct(this);
368 }
369
UnicodeSet(const UnicodeString & pattern,ParsePosition & pos,uint32_t options,const SymbolTable * symbols,UErrorCode & status)370 UnicodeSet::UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
371 uint32_t options,
372 const SymbolTable* symbols,
373 UErrorCode& status) :
374 len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0),
375 bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL),
376 fFlags(0)
377 {
378 if(U_SUCCESS(status)){
379 list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity);
380 /* test for NULL */
381 if(list == NULL) {
382 status = U_MEMORY_ALLOCATION_ERROR;
383 }else{
384 allocateStrings(status);
385 applyPattern(pattern, pos, options, symbols, status);
386 }
387 }
388 _dbgct(this);
389 }
390
391 //----------------------------------------------------------------
392 // Public API
393 //----------------------------------------------------------------
394
395 /**
396 * Modifies this set to represent the set specified by the given
397 * pattern, optionally ignoring white space. See the class
398 * description for the syntax of the pattern language.
399 * @param pattern a string specifying what characters are in the set
400 * @param ignoreSpaces if <code>true</code>, all spaces in the
401 * pattern are ignored. Spaces are those characters for which
402 * <code>uprv_isRuleWhiteSpace()</code> is <code>true</code>.
403 * Characters preceded by '\\' are escaped, losing any special
404 * meaning they otherwise have. Spaces may be included by
405 * escaping them.
406 * @exception <code>IllegalArgumentException</code> if the pattern
407 * contains a syntax error.
408 */
applyPattern(const UnicodeString & pattern,UErrorCode & status)409 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
410 UErrorCode& status) {
411 return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
412 }
413
414
415 /**
416 * Modifies this set to represent the set specified by the given
417 * pattern, optionally ignoring white space. See the class
418 * description for the syntax of the pattern language.
419 * @param pattern a string specifying what characters are in the set
420 * @param options bitmask for options to apply to the pattern.
421 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
422 */
applyPattern(const UnicodeString & pattern,uint32_t options,const SymbolTable * symbols,UErrorCode & status)423 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
424 uint32_t options,
425 const SymbolTable* symbols,
426 UErrorCode& status) {
427 if (U_FAILURE(status) || isFrozen()) {
428 return *this;
429 }
430
431 ParsePosition pos(0);
432 applyPattern(pattern, pos, options, symbols, status);
433 if (U_FAILURE(status)) return *this;
434
435 int32_t i = pos.getIndex();
436
437 if (options & USET_IGNORE_SPACE) {
438 // Skip over trailing whitespace
439 ICU_Utility::skipWhitespace(pattern, i, TRUE);
440 }
441
442 if (i != pattern.length()) {
443 status = U_ILLEGAL_ARGUMENT_ERROR;
444 }
445 return *this;
446 }
447
applyPattern(const UnicodeString & pattern,ParsePosition & pos,uint32_t options,const SymbolTable * symbols,UErrorCode & status)448 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
449 ParsePosition& pos,
450 uint32_t options,
451 const SymbolTable* symbols,
452 UErrorCode& status) {
453 if (U_FAILURE(status) || isFrozen()) {
454 return *this;
455 }
456 // Need to build the pattern in a temporary string because
457 // _applyPattern calls add() etc., which set pat to empty.
458 UnicodeString rebuiltPat;
459 RuleCharacterIterator chars(pattern, symbols, pos);
460 applyPattern(chars, symbols, rebuiltPat, options, status);
461 if (U_FAILURE(status)) return *this;
462 if (chars.inVariable()) {
463 // syntaxError(chars, "Extra chars in variable value");
464 status = U_MALFORMED_SET;
465 return *this;
466 }
467 setPattern(rebuiltPat);
468 return *this;
469 }
470
471 /**
472 * Return true if the given position, in the given pattern, appears
473 * to be the start of a UnicodeSet pattern.
474 */
resemblesPattern(const UnicodeString & pattern,int32_t pos)475 UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
476 return ((pos+1) < pattern.length() &&
477 pattern.charAt(pos) == (UChar)91/*[*/) ||
478 resemblesPropertyPattern(pattern, pos);
479 }
480
481 //----------------------------------------------------------------
482 // Implementation: Pattern parsing
483 //----------------------------------------------------------------
484
485 /**
486 * A small all-inline class to manage a UnicodeSet pointer. Add
487 * operator->() etc. as needed.
488 */
489 class UnicodeSetPointer {
490 UnicodeSet* p;
491 public:
UnicodeSetPointer()492 inline UnicodeSetPointer() : p(0) {}
~UnicodeSetPointer()493 inline ~UnicodeSetPointer() { delete p; }
pointer()494 inline UnicodeSet* pointer() { return p; }
allocate()495 inline UBool allocate() {
496 if (p == 0) {
497 p = new UnicodeSet();
498 }
499 return p != 0;
500 }
501 };
502
503 /**
504 * Parse the pattern from the given RuleCharacterIterator. The
505 * iterator is advanced over the parsed pattern.
506 * @param chars iterator over the pattern characters. Upon return
507 * it will be advanced to the first character after the parsed
508 * pattern, or the end of the iteration if all characters are
509 * parsed.
510 * @param symbols symbol table to use to parse and dereference
511 * variables, or null if none.
512 * @param rebuiltPat the pattern that was parsed, rebuilt or
513 * copied from the input pattern, as appropriate.
514 * @param options a bit mask of zero or more of the following:
515 * IGNORE_SPACE, CASE.
516 */
applyPattern(RuleCharacterIterator & chars,const SymbolTable * symbols,UnicodeString & rebuiltPat,uint32_t options,UErrorCode & ec)517 void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
518 const SymbolTable* symbols,
519 UnicodeString& rebuiltPat,
520 uint32_t options,
521 UErrorCode& ec) {
522 if (U_FAILURE(ec)) return;
523
524 // Syntax characters: [ ] ^ - & { }
525
526 // Recognized special forms for chars, sets: c-c s-s s&s
527
528 int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
529 RuleCharacterIterator::PARSE_ESCAPES;
530 if ((options & USET_IGNORE_SPACE) != 0) {
531 opts |= RuleCharacterIterator::SKIP_WHITESPACE;
532 }
533
534 UnicodeString patLocal, buf;
535 UBool usePat = FALSE;
536 UnicodeSetPointer scratch;
537 RuleCharacterIterator::Pos backup;
538
539 // mode: 0=before [, 1=between [...], 2=after ]
540 // lastItem: 0=none, 1=char, 2=set
541 int8_t lastItem = 0, mode = 0;
542 UChar32 lastChar = 0;
543 UChar op = 0;
544
545 UBool invert = FALSE;
546
547 clear();
548
549 while (mode != 2 && !chars.atEnd()) {
550 U_ASSERT((lastItem == 0 && op == 0) ||
551 (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) ||
552 (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ ||
553 op == INTERSECTION /*'&'*/)));
554
555 UChar32 c = 0;
556 UBool literal = FALSE;
557 UnicodeSet* nested = 0; // alias - do not delete
558
559 // -------- Check for property pattern
560
561 // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
562 int8_t setMode = 0;
563 if (resemblesPropertyPattern(chars, opts)) {
564 setMode = 2;
565 }
566
567 // -------- Parse '[' of opening delimiter OR nested set.
568 // If there is a nested set, use `setMode' to define how
569 // the set should be parsed. If the '[' is part of the
570 // opening delimiter for this pattern, parse special
571 // strings "[", "[^", "[-", and "[^-". Check for stand-in
572 // characters representing a nested set in the symbol
573 // table.
574
575 else {
576 // Prepare to backup if necessary
577 chars.getPos(backup);
578 c = chars.next(opts, literal, ec);
579 if (U_FAILURE(ec)) return;
580
581 if (c == 0x5B /*'['*/ && !literal) {
582 if (mode == 1) {
583 chars.setPos(backup); // backup
584 setMode = 1;
585 } else {
586 // Handle opening '[' delimiter
587 mode = 1;
588 patLocal.append((UChar) 0x5B /*'['*/);
589 chars.getPos(backup); // prepare to backup
590 c = chars.next(opts, literal, ec);
591 if (U_FAILURE(ec)) return;
592 if (c == 0x5E /*'^'*/ && !literal) {
593 invert = TRUE;
594 patLocal.append((UChar) 0x5E /*'^'*/);
595 chars.getPos(backup); // prepare to backup
596 c = chars.next(opts, literal, ec);
597 if (U_FAILURE(ec)) return;
598 }
599 // Fall through to handle special leading '-';
600 // otherwise restart loop for nested [], \p{}, etc.
601 if (c == HYPHEN /*'-'*/) {
602 literal = TRUE;
603 // Fall through to handle literal '-' below
604 } else {
605 chars.setPos(backup); // backup
606 continue;
607 }
608 }
609 } else if (symbols != 0) {
610 const UnicodeFunctor *m = symbols->lookupMatcher(c);
611 if (m != 0) {
612 if (m->getDynamicClassID() != UnicodeSet::getStaticClassID()) {
613 ec = U_MALFORMED_SET;
614 return;
615 }
616 // casting away const, but `nested' won't be modified
617 // (important not to modify stored set)
618 nested = (UnicodeSet*) m;
619 setMode = 3;
620 }
621 }
622 }
623
624 // -------- Handle a nested set. This either is inline in
625 // the pattern or represented by a stand-in that has
626 // previously been parsed and was looked up in the symbol
627 // table.
628
629 if (setMode != 0) {
630 if (lastItem == 1) {
631 if (op != 0) {
632 // syntaxError(chars, "Char expected after operator");
633 ec = U_MALFORMED_SET;
634 return;
635 }
636 add(lastChar, lastChar);
637 _appendToPat(patLocal, lastChar, FALSE);
638 lastItem = 0;
639 op = 0;
640 }
641
642 if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) {
643 patLocal.append(op);
644 }
645
646 if (nested == 0) {
647 // lazy allocation
648 if (!scratch.allocate()) {
649 ec = U_MEMORY_ALLOCATION_ERROR;
650 return;
651 }
652 nested = scratch.pointer();
653 }
654 switch (setMode) {
655 case 1:
656 nested->applyPattern(chars, symbols, patLocal, options, ec);
657 break;
658 case 2:
659 chars.skipIgnored(opts);
660 nested->applyPropertyPattern(chars, patLocal, ec);
661 if (U_FAILURE(ec)) return;
662 break;
663 case 3: // `nested' already parsed
664 nested->_toPattern(patLocal, FALSE);
665 break;
666 }
667
668 usePat = TRUE;
669
670 if (mode == 0) {
671 // Entire pattern is a category; leave parse loop
672 *this = *nested;
673 mode = 2;
674 break;
675 }
676
677 switch (op) {
678 case HYPHEN: /*'-'*/
679 removeAll(*nested);
680 break;
681 case INTERSECTION: /*'&'*/
682 retainAll(*nested);
683 break;
684 case 0:
685 addAll(*nested);
686 break;
687 }
688
689 op = 0;
690 lastItem = 2;
691
692 continue;
693 }
694
695 if (mode == 0) {
696 // syntaxError(chars, "Missing '['");
697 ec = U_MALFORMED_SET;
698 return;
699 }
700
701 // -------- Parse special (syntax) characters. If the
702 // current character is not special, or if it is escaped,
703 // then fall through and handle it below.
704
705 if (!literal) {
706 switch (c) {
707 case 0x5D /*']'*/:
708 if (lastItem == 1) {
709 add(lastChar, lastChar);
710 _appendToPat(patLocal, lastChar, FALSE);
711 }
712 // Treat final trailing '-' as a literal
713 if (op == HYPHEN /*'-'*/) {
714 add(op, op);
715 patLocal.append(op);
716 } else if (op == INTERSECTION /*'&'*/) {
717 // syntaxError(chars, "Trailing '&'");
718 ec = U_MALFORMED_SET;
719 return;
720 }
721 patLocal.append((UChar) 0x5D /*']'*/);
722 mode = 2;
723 continue;
724 case HYPHEN /*'-'*/:
725 if (op == 0) {
726 if (lastItem != 0) {
727 op = (UChar) c;
728 continue;
729 } else {
730 // Treat final trailing '-' as a literal
731 add(c, c);
732 c = chars.next(opts, literal, ec);
733 if (U_FAILURE(ec)) return;
734 if (c == 0x5D /*']'*/ && !literal) {
735 patLocal.append(HYPHEN_RIGHT_BRACE);
736 mode = 2;
737 continue;
738 }
739 }
740 }
741 // syntaxError(chars, "'-' not after char or set");
742 ec = U_MALFORMED_SET;
743 return;
744 case INTERSECTION /*'&'*/:
745 if (lastItem == 2 && op == 0) {
746 op = (UChar) c;
747 continue;
748 }
749 // syntaxError(chars, "'&' not after set");
750 ec = U_MALFORMED_SET;
751 return;
752 case 0x5E /*'^'*/:
753 // syntaxError(chars, "'^' not after '['");
754 ec = U_MALFORMED_SET;
755 return;
756 case 0x7B /*'{'*/:
757 if (op != 0) {
758 // syntaxError(chars, "Missing operand after operator");
759 ec = U_MALFORMED_SET;
760 return;
761 }
762 if (lastItem == 1) {
763 add(lastChar, lastChar);
764 _appendToPat(patLocal, lastChar, FALSE);
765 }
766 lastItem = 0;
767 buf.truncate(0);
768 {
769 UBool ok = FALSE;
770 while (!chars.atEnd()) {
771 c = chars.next(opts, literal, ec);
772 if (U_FAILURE(ec)) return;
773 if (c == 0x7D /*'}'*/ && !literal) {
774 ok = TRUE;
775 break;
776 }
777 buf.append(c);
778 }
779 if (buf.length() < 1 || !ok) {
780 // syntaxError(chars, "Invalid multicharacter string");
781 ec = U_MALFORMED_SET;
782 return;
783 }
784 }
785 // We have new string. Add it to set and continue;
786 // we don't need to drop through to the further
787 // processing
788 add(buf);
789 patLocal.append((UChar) 0x7B /*'{'*/);
790 _appendToPat(patLocal, buf, FALSE);
791 patLocal.append((UChar) 0x7D /*'}'*/);
792 continue;
793 case SymbolTable::SYMBOL_REF:
794 // symbols nosymbols
795 // [a-$] error error (ambiguous)
796 // [a$] anchor anchor
797 // [a-$x] var "x"* literal '$'
798 // [a-$.] error literal '$'
799 // *We won't get here in the case of var "x"
800 {
801 chars.getPos(backup);
802 c = chars.next(opts, literal, ec);
803 if (U_FAILURE(ec)) return;
804 UBool anchor = (c == 0x5D /*']'*/ && !literal);
805 if (symbols == 0 && !anchor) {
806 c = SymbolTable::SYMBOL_REF;
807 chars.setPos(backup);
808 break; // literal '$'
809 }
810 if (anchor && op == 0) {
811 if (lastItem == 1) {
812 add(lastChar, lastChar);
813 _appendToPat(patLocal, lastChar, FALSE);
814 }
815 add(U_ETHER);
816 usePat = TRUE;
817 patLocal.append((UChar) SymbolTable::SYMBOL_REF);
818 patLocal.append((UChar) 0x5D /*']'*/);
819 mode = 2;
820 continue;
821 }
822 // syntaxError(chars, "Unquoted '$'");
823 ec = U_MALFORMED_SET;
824 return;
825 }
826 default:
827 break;
828 }
829 }
830
831 // -------- Parse literal characters. This includes both
832 // escaped chars ("\u4E01") and non-syntax characters
833 // ("a").
834
835 switch (lastItem) {
836 case 0:
837 lastItem = 1;
838 lastChar = c;
839 break;
840 case 1:
841 if (op == HYPHEN /*'-'*/) {
842 if (lastChar >= c) {
843 // Don't allow redundant (a-a) or empty (b-a) ranges;
844 // these are most likely typos.
845 // syntaxError(chars, "Invalid range");
846 ec = U_MALFORMED_SET;
847 return;
848 }
849 add(lastChar, c);
850 _appendToPat(patLocal, lastChar, FALSE);
851 patLocal.append(op);
852 _appendToPat(patLocal, c, FALSE);
853 lastItem = 0;
854 op = 0;
855 } else {
856 add(lastChar, lastChar);
857 _appendToPat(patLocal, lastChar, FALSE);
858 lastChar = c;
859 }
860 break;
861 case 2:
862 if (op != 0) {
863 // syntaxError(chars, "Set expected after operator");
864 ec = U_MALFORMED_SET;
865 return;
866 }
867 lastChar = c;
868 lastItem = 1;
869 break;
870 }
871 }
872
873 if (mode != 2) {
874 // syntaxError(chars, "Missing ']'");
875 ec = U_MALFORMED_SET;
876 return;
877 }
878
879 chars.skipIgnored(opts);
880
881 /**
882 * Handle global flags (invert, case insensitivity). If this
883 * pattern should be compiled case-insensitive, then we need
884 * to close over case BEFORE COMPLEMENTING. This makes
885 * patterns like /[^abc]/i work.
886 */
887 if ((options & USET_CASE_INSENSITIVE) != 0) {
888 closeOver(USET_CASE_INSENSITIVE);
889 }
890 else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
891 closeOver(USET_ADD_CASE_MAPPINGS);
892 }
893 if (invert) {
894 complement();
895 }
896
897 // Use the rebuilt pattern (patLocal) only if necessary. Prefer the
898 // generated pattern.
899 if (usePat) {
900 rebuiltPat.append(patLocal);
901 } else {
902 _generatePattern(rebuiltPat, FALSE);
903 }
904 if (isBogus() && U_SUCCESS(ec)) {
905 // We likely ran out of memory. AHHH!
906 ec = U_MEMORY_ALLOCATION_ERROR;
907 }
908 }
909
910 //----------------------------------------------------------------
911 // Property set implementation
912 //----------------------------------------------------------------
913
numericValueFilter(UChar32 ch,void * context)914 static UBool numericValueFilter(UChar32 ch, void* context) {
915 return u_getNumericValue(ch) == *(double*)context;
916 }
917
generalCategoryMaskFilter(UChar32 ch,void * context)918 static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
919 int32_t value = *(int32_t*)context;
920 return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
921 }
922
versionFilter(UChar32 ch,void * context)923 static UBool versionFilter(UChar32 ch, void* context) {
924 UVersionInfo v, none = { 0, 0, 0, 0};
925 UVersionInfo* version = (UVersionInfo*)context;
926 u_charAge(ch, v);
927 return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
928 }
929
930 typedef struct {
931 UProperty prop;
932 int32_t value;
933 } IntPropertyContext;
934
intPropertyFilter(UChar32 ch,void * context)935 static UBool intPropertyFilter(UChar32 ch, void* context) {
936 IntPropertyContext* c = (IntPropertyContext*)context;
937 return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
938 }
939
940
941 /**
942 * Generic filter-based scanning code for UCD property UnicodeSets.
943 */
applyFilter(UnicodeSet::Filter filter,void * context,int32_t src,UErrorCode & status)944 void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
945 void* context,
946 int32_t src,
947 UErrorCode &status) {
948 // Walk through all Unicode characters, noting the start
949 // and end of each range for which filter.contain(c) is
950 // true. Add each range to a set.
951 //
952 // To improve performance, use the INCLUSIONS set, which
953 // encodes information about character ranges that are known
954 // to have identical properties. INCLUSIONS contains
955 // only the first characters of such ranges.
956 //
957 // TODO Where possible, instead of scanning over code points,
958 // use internal property data to initialize UnicodeSets for
959 // those properties. Scanning code points is slow.
960 if (U_FAILURE(status)) return;
961
962 const UnicodeSet* inclusions = getInclusions(src, status);
963 if (U_FAILURE(status)) {
964 return;
965 }
966
967 clear();
968
969 UChar32 startHasProperty = -1;
970 int32_t limitRange = inclusions->getRangeCount();
971
972 for (int j=0; j<limitRange; ++j) {
973 // get current range
974 UChar32 start = inclusions->getRangeStart(j);
975 UChar32 end = inclusions->getRangeEnd(j);
976
977 // for all the code points in the range, process
978 for (UChar32 ch = start; ch <= end; ++ch) {
979 // only add to this UnicodeSet on inflection points --
980 // where the hasProperty value changes to false
981 if ((*filter)(ch, context)) {
982 if (startHasProperty < 0) {
983 startHasProperty = ch;
984 }
985 } else if (startHasProperty >= 0) {
986 add(startHasProperty, ch-1);
987 startHasProperty = -1;
988 }
989 }
990 }
991 if (startHasProperty >= 0) {
992 add((UChar32)startHasProperty, (UChar32)0x10FFFF);
993 }
994 if (isBogus() && U_SUCCESS(status)) {
995 // We likely ran out of memory. AHHH!
996 status = U_MEMORY_ALLOCATION_ERROR;
997 }
998 }
999
mungeCharName(char * dst,const char * src,int32_t dstCapacity)1000 static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
1001 /* Note: we use ' ' in compiler code page */
1002 int32_t j = 0;
1003 char ch;
1004 --dstCapacity; /* make room for term. zero */
1005 while ((ch = *src++) != 0) {
1006 if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
1007 continue;
1008 }
1009 if (j >= dstCapacity) return FALSE;
1010 dst[j++] = ch;
1011 }
1012 if (j > 0 && dst[j-1] == ' ') --j;
1013 dst[j] = 0;
1014 return TRUE;
1015 }
1016
1017 //----------------------------------------------------------------
1018 // Property set API
1019 //----------------------------------------------------------------
1020
// Sets ec to U_ILLEGAL_ARGUMENT_ERROR and returns *this from the enclosing
// member function.  The body is wrapped in do { } while (0) so that
// "FAIL(ec);" expands to exactly one statement and therefore stays safe
// inside an unbraced if/else; the argument is parenthesized as well.
#define FAIL(ec) do { (ec) = U_ILLEGAL_ARGUMENT_ERROR; return *this; } while (0)
1022
1023 UnicodeSet&
applyIntPropertyValue(UProperty prop,int32_t value,UErrorCode & ec)1024 UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
1025 if (U_FAILURE(ec) || isFrozen()) return *this;
1026
1027 if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
1028 applyFilter(generalCategoryMaskFilter, &value, UPROPS_SRC_CHAR, ec);
1029 } else {
1030 IntPropertyContext c = {prop, value};
1031 applyFilter(intPropertyFilter, &c, uprops_getSource(prop), ec);
1032 }
1033 return *this;
1034 }
1035
1036 UnicodeSet&
applyPropertyAlias(const UnicodeString & prop,const UnicodeString & value,UErrorCode & ec)1037 UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
1038 const UnicodeString& value,
1039 UErrorCode& ec) {
1040 if (U_FAILURE(ec) || isFrozen()) return *this;
1041
1042 // prop and value used to be converted to char * using the default
1043 // converter instead of the invariant conversion.
1044 // This should not be necessary because all Unicode property and value
1045 // names use only invariant characters.
1046 // If there are any variant characters, then we won't find them anyway.
1047 // Checking first avoids assertion failures in the conversion.
1048 if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
1049 !uprv_isInvariantUString(value.getBuffer(), value.length())
1050 ) {
1051 FAIL(ec);
1052 }
1053 CharString pname(prop);
1054 CharString vname(value);
1055
1056 UProperty p;
1057 int32_t v;
1058 UBool mustNotBeEmpty = FALSE, invert = FALSE;
1059
1060 if (value.length() > 0) {
1061 p = u_getPropertyEnum(pname);
1062 if (p == UCHAR_INVALID_CODE) FAIL(ec);
1063
1064 // Treat gc as gcm
1065 if (p == UCHAR_GENERAL_CATEGORY) {
1066 p = UCHAR_GENERAL_CATEGORY_MASK;
1067 }
1068
1069 if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
1070 (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
1071 (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
1072 v = u_getPropertyValueEnum(p, vname);
1073 if (v == UCHAR_INVALID_CODE) {
1074 // Handle numeric CCC
1075 if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
1076 p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
1077 p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
1078 char* end;
1079 double value = uprv_strtod(vname, &end);
1080 v = (int32_t) value;
1081 if (v != value || v < 0 || *end != 0) {
1082 // non-integral or negative value, or trailing junk
1083 FAIL(ec);
1084 }
1085 // If the resultant set is empty then the numeric value
1086 // was invalid.
1087 mustNotBeEmpty = TRUE;
1088 } else {
1089 FAIL(ec);
1090 }
1091 }
1092 }
1093
1094 else {
1095
1096 switch (p) {
1097 case UCHAR_NUMERIC_VALUE:
1098 {
1099 char* end;
1100 double value = uprv_strtod(vname, &end);
1101 if (*end != 0) {
1102 FAIL(ec);
1103 }
1104 applyFilter(numericValueFilter, &value, UPROPS_SRC_CHAR, ec);
1105 return *this;
1106 }
1107 break;
1108 case UCHAR_NAME:
1109 case UCHAR_UNICODE_1_NAME:
1110 {
1111 // Must munge name, since u_charFromName() does not do
1112 // 'loose' matching.
1113 char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
1114 if (!mungeCharName(buf, vname, sizeof(buf))) FAIL(ec);
1115 UCharNameChoice choice = (p == UCHAR_NAME) ?
1116 U_EXTENDED_CHAR_NAME : U_UNICODE_10_CHAR_NAME;
1117 UChar32 ch = u_charFromName(choice, buf, &ec);
1118 if (U_SUCCESS(ec)) {
1119 clear();
1120 add(ch);
1121 return *this;
1122 } else {
1123 FAIL(ec);
1124 }
1125 }
1126 break;
1127 case UCHAR_AGE:
1128 {
1129 // Must munge name, since u_versionFromString() does not do
1130 // 'loose' matching.
1131 char buf[128];
1132 if (!mungeCharName(buf, vname, sizeof(buf))) FAIL(ec);
1133 UVersionInfo version;
1134 u_versionFromString(version, buf);
1135 applyFilter(versionFilter, &version, UPROPS_SRC_PROPSVEC, ec);
1136 return *this;
1137 }
1138 break;
1139 default:
1140 // p is a non-binary, non-enumerated property that we
1141 // don't support (yet).
1142 FAIL(ec);
1143 }
1144 }
1145 }
1146
1147 else {
1148 // value is empty. Interpret as General Category, Script, or
1149 // Binary property.
1150 p = UCHAR_GENERAL_CATEGORY_MASK;
1151 v = u_getPropertyValueEnum(p, pname);
1152 if (v == UCHAR_INVALID_CODE) {
1153 p = UCHAR_SCRIPT;
1154 v = u_getPropertyValueEnum(p, pname);
1155 if (v == UCHAR_INVALID_CODE) {
1156 p = u_getPropertyEnum(pname);
1157 if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
1158 v = 1;
1159 } else if (0 == uprv_comparePropertyNames(ANY, pname)) {
1160 set(MIN_VALUE, MAX_VALUE);
1161 return *this;
1162 } else if (0 == uprv_comparePropertyNames(ASCII, pname)) {
1163 set(0, 0x7F);
1164 return *this;
1165 } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname)) {
1166 // [:Assigned:]=[:^Cn:]
1167 p = UCHAR_GENERAL_CATEGORY_MASK;
1168 v = U_GC_CN_MASK;
1169 invert = TRUE;
1170 } else {
1171 FAIL(ec);
1172 }
1173 }
1174 }
1175 }
1176
1177 applyIntPropertyValue(p, v, ec);
1178 if(invert) {
1179 complement();
1180 }
1181
1182 if (U_SUCCESS(ec) && (mustNotBeEmpty && isEmpty())) {
1183 // mustNotBeEmpty is set to true if an empty set indicates
1184 // invalid input.
1185 ec = U_ILLEGAL_ARGUMENT_ERROR;
1186 }
1187
1188 if (isBogus() && U_SUCCESS(ec)) {
1189 // We likely ran out of memory. AHHH!
1190 ec = U_MEMORY_ALLOCATION_ERROR;
1191 }
1192 return *this;
1193 }
1194
1195 //----------------------------------------------------------------
1196 // Property set patterns
1197 //----------------------------------------------------------------
1198
1199 /**
1200 * Return true if the given position, in the given pattern, appears
1201 * to be the start of a property set pattern.
1202 */
resemblesPropertyPattern(const UnicodeString & pattern,int32_t pos)1203 UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
1204 int32_t pos) {
1205 // Patterns are at least 5 characters long
1206 if ((pos+5) > pattern.length()) {
1207 return FALSE;
1208 }
1209
1210 // Look for an opening [:, [:^, \p, or \P
1211 return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
1212 }
1213
1214 /**
1215 * Return true if the given iterator appears to point at a
1216 * property pattern. Regardless of the result, return with the
1217 * iterator unchanged.
1218 * @param chars iterator over the pattern characters. Upon return
1219 * it will be unchanged.
1220 * @param iterOpts RuleCharacterIterator options
1221 */
resemblesPropertyPattern(RuleCharacterIterator & chars,int32_t iterOpts)1222 UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
1223 int32_t iterOpts) {
1224 // NOTE: literal will always be FALSE, because we don't parse escapes.
1225 UBool result = FALSE, literal;
1226 UErrorCode ec = U_ZERO_ERROR;
1227 iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
1228 RuleCharacterIterator::Pos pos;
1229 chars.getPos(pos);
1230 UChar32 c = chars.next(iterOpts, literal, ec);
1231 if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) {
1232 UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
1233 literal, ec);
1234 result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) :
1235 (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/);
1236 }
1237 chars.setPos(pos);
1238 return result && U_SUCCESS(ec);
1239 }
1240
1241 /**
1242 * Parse the given property pattern at the given parse position.
1243 */
applyPropertyPattern(const UnicodeString & pattern,ParsePosition & ppos,UErrorCode & ec)1244 UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
1245 ParsePosition& ppos,
1246 UErrorCode &ec) {
1247 int32_t pos = ppos.getIndex();
1248
1249 UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
1250 UBool isName = FALSE; // true for \N{pat}, o/w false
1251 UBool invert = FALSE;
1252
1253 if (U_FAILURE(ec)) return *this;
1254
1255 // Minimum length is 5 characters, e.g. \p{L}
1256 if ((pos+5) > pattern.length()) {
1257 FAIL(ec);
1258 }
1259
1260 // On entry, ppos should point to one of the following locations:
1261 // Look for an opening [:, [:^, \p, or \P
1262 if (isPOSIXOpen(pattern, pos)) {
1263 posix = TRUE;
1264 pos += 2;
1265 pos = ICU_Utility::skipWhitespace(pattern, pos);
1266 if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) {
1267 ++pos;
1268 invert = TRUE;
1269 }
1270 } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {
1271 UChar c = pattern.charAt(pos+1);
1272 invert = (c == UPPER_P);
1273 isName = (c == UPPER_N);
1274 pos += 2;
1275 pos = ICU_Utility::skipWhitespace(pattern, pos);
1276 if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) {
1277 // Syntax error; "\p" or "\P" not followed by "{"
1278 FAIL(ec);
1279 }
1280 } else {
1281 // Open delimiter not seen
1282 FAIL(ec);
1283 }
1284
1285 // Look for the matching close delimiter, either :] or }
1286 int32_t close = pattern.indexOf(posix ? POSIX_CLOSE : PERL_CLOSE, pos);
1287 if (close < 0) {
1288 // Syntax error; close delimiter missing
1289 FAIL(ec);
1290 }
1291
1292 // Look for an '=' sign. If this is present, we will parse a
1293 // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
1294 // pattern.
1295 int32_t equals = pattern.indexOf(EQUALS, pos);
1296 UnicodeString propName, valueName;
1297 if (equals >= 0 && equals < close && !isName) {
1298 // Equals seen; parse medium/long pattern
1299 pattern.extractBetween(pos, equals, propName);
1300 pattern.extractBetween(equals+1, close, valueName);
1301 }
1302
1303 else {
1304 // Handle case where no '=' is seen, and \N{}
1305 pattern.extractBetween(pos, close, propName);
1306
1307 // Handle \N{name}
1308 if (isName) {
1309 // This is a little inefficient since it means we have to
1310 // parse NAME_PROP back to UCHAR_NAME even though we already
1311 // know it's UCHAR_NAME. If we refactor the API to
1312 // support args of (UProperty, char*) then we can remove
1313 // NAME_PROP and make this a little more efficient.
1314 valueName = propName;
1315 propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);
1316 }
1317 }
1318
1319 applyPropertyAlias(propName, valueName, ec);
1320
1321 if (U_SUCCESS(ec)) {
1322 if (invert) {
1323 complement();
1324 }
1325
1326 // Move to the limit position after the close delimiter if the
1327 // parse succeeded.
1328 ppos.setIndex(close + (posix ? 2 : 1));
1329 }
1330
1331 return *this;
1332 }
1333
1334 /**
1335 * Parse a property pattern.
1336 * @param chars iterator over the pattern characters. Upon return
1337 * it will be advanced to the first character after the parsed
1338 * pattern, or the end of the iteration if all characters are
1339 * parsed.
1340 * @param rebuiltPat the pattern that was parsed, rebuilt or
1341 * copied from the input pattern, as appropriate.
1342 */
applyPropertyPattern(RuleCharacterIterator & chars,UnicodeString & rebuiltPat,UErrorCode & ec)1343 void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
1344 UnicodeString& rebuiltPat,
1345 UErrorCode& ec) {
1346 if (U_FAILURE(ec)) return;
1347 UnicodeString pattern;
1348 chars.lookahead(pattern);
1349 ParsePosition pos(0);
1350 applyPropertyPattern(pattern, pos, ec);
1351 if (U_FAILURE(ec)) return;
1352 if (pos.getIndex() == 0) {
1353 // syntaxError(chars, "Invalid property pattern");
1354 ec = U_MALFORMED_SET;
1355 return;
1356 }
1357 chars.jumpahead(pos.getIndex());
1358 rebuiltPat.append(pattern, 0, pos.getIndex());
1359 }
1360
1361 //----------------------------------------------------------------
1362 // Case folding API
1363 //----------------------------------------------------------------
1364
1365 // add the result of a full case mapping to the set
1366 // use str as a temporary string to avoid constructing one
1367 static inline void
addCaseMapping(UnicodeSet & set,int32_t result,const UChar * full,UnicodeString & str)1368 addCaseMapping(UnicodeSet &set, int32_t result, const UChar *full, UnicodeString &str) {
1369 if(result >= 0) {
1370 if(result > UCASE_MAX_STRING_LENGTH) {
1371 // add a single-code point case mapping
1372 set.add(result);
1373 } else {
1374 // add a string case mapping from full with length result
1375 str.setTo((UBool)FALSE, full, result);
1376 set.add(str);
1377 }
1378 }
1379 // result < 0: the code point mapped to itself, no need to add it
1380 // see ucase.h
1381 }
1382
closeOver(int32_t attribute)1383 UnicodeSet& UnicodeSet::closeOver(int32_t attribute) {
1384 if (isFrozen() || isBogus()) {
1385 return *this;
1386 }
1387 if (attribute & (USET_CASE_INSENSITIVE | USET_ADD_CASE_MAPPINGS)) {
1388 UErrorCode status = U_ZERO_ERROR;
1389 const UCaseProps *csp = ucase_getSingleton(&status);
1390 if (U_SUCCESS(status)) {
1391 UnicodeSet foldSet(*this);
1392 UnicodeString str;
1393 USetAdder sa = {
1394 (USet *)&foldSet,
1395 _set_add,
1396 _set_addRange,
1397 _set_addString,
1398 NULL, // don't need remove()
1399 NULL // don't need removeRange()
1400 };
1401
1402 // start with input set to guarantee inclusion
1403 // USET_CASE: remove strings because the strings will actually be reduced (folded);
1404 // therefore, start with no strings and add only those needed
1405 if (attribute & USET_CASE_INSENSITIVE) {
1406 foldSet.strings->removeAllElements();
1407 }
1408
1409 int32_t n = getRangeCount();
1410 UChar32 result;
1411 const UChar *full;
1412 int32_t locCache = 0;
1413
1414 for (int32_t i=0; i<n; ++i) {
1415 UChar32 start = getRangeStart(i);
1416 UChar32 end = getRangeEnd(i);
1417
1418 if (attribute & USET_CASE_INSENSITIVE) {
1419 // full case closure
1420 for (UChar32 cp=start; cp<=end; ++cp) {
1421 ucase_addCaseClosure(csp, cp, &sa);
1422 }
1423 } else {
1424 // add case mappings
1425 // (does not add long s for regular s, or Kelvin for k, for example)
1426 for (UChar32 cp=start; cp<=end; ++cp) {
1427 result = ucase_toFullLower(csp, cp, NULL, NULL, &full, "", &locCache);
1428 addCaseMapping(foldSet, result, full, str);
1429
1430 result = ucase_toFullTitle(csp, cp, NULL, NULL, &full, "", &locCache);
1431 addCaseMapping(foldSet, result, full, str);
1432
1433 result = ucase_toFullUpper(csp, cp, NULL, NULL, &full, "", &locCache);
1434 addCaseMapping(foldSet, result, full, str);
1435
1436 result = ucase_toFullFolding(csp, cp, &full, 0);
1437 addCaseMapping(foldSet, result, full, str);
1438 }
1439 }
1440 }
1441 if (strings != NULL && strings->size() > 0) {
1442 if (attribute & USET_CASE_INSENSITIVE) {
1443 for (int32_t j=0; j<strings->size(); ++j) {
1444 str = *(const UnicodeString *) strings->elementAt(j);
1445 str.foldCase();
1446 if(!ucase_addStringCaseClosure(csp, str.getBuffer(), str.length(), &sa)) {
1447 foldSet.add(str); // does not map to code points: add the folded string itself
1448 }
1449 }
1450 } else {
1451 Locale root("");
1452 #if !UCONFIG_NO_BREAK_ITERATION
1453 BreakIterator *bi = BreakIterator::createWordInstance(root, status);
1454 #endif
1455 if (U_SUCCESS(status)) {
1456 const UnicodeString *pStr;
1457
1458 for (int32_t j=0; j<strings->size(); ++j) {
1459 pStr = (const UnicodeString *) strings->elementAt(j);
1460 (str = *pStr).toLower(root);
1461 foldSet.add(str);
1462 #if !UCONFIG_NO_BREAK_ITERATION
1463 (str = *pStr).toTitle(bi, root);
1464 foldSet.add(str);
1465 #endif
1466 (str = *pStr).toUpper(root);
1467 foldSet.add(str);
1468 (str = *pStr).foldCase();
1469 foldSet.add(str);
1470 }
1471 }
1472 #if !UCONFIG_NO_BREAK_ITERATION
1473 delete bi;
1474 #endif
1475 }
1476 }
1477 *this = foldSet;
1478 }
1479 }
1480 return *this;
1481 }
1482
1483 U_NAMESPACE_END
1484