1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 1999-2014, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: uniset_props.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2004aug25
16 * created by: Markus W. Scherer
17 *
18 * Character property dependent functions moved here from uniset.cpp
19 */
20
21 #include "unicode/utypes.h"
22 #include "unicode/uniset.h"
23 #include "unicode/parsepos.h"
24 #include "unicode/uchar.h"
25 #include "unicode/uscript.h"
26 #include "unicode/symtable.h"
27 #include "unicode/uset.h"
28 #include "unicode/locid.h"
29 #include "unicode/brkiter.h"
30 #include "uset_imp.h"
31 #include "ruleiter.h"
32 #include "cmemory.h"
33 #include "ucln_cmn.h"
34 #include "util.h"
35 #include "uvector.h"
36 #include "uprops.h"
37 #include "propname.h"
38 #include "normalizer2impl.h"
39 #include "uinvchar.h"
40 #include "uprops.h"
41 #include "charstr.h"
42 #include "cstring.h"
43 #include "mutex.h"
44 #include "umutex.h"
45 #include "uassert.h"
46 #include "hash.h"
47
48 U_NAMESPACE_USE
49
// UChar constants for the pattern syntax characters. They are spelled as
// hexadecimal code points (not character literals) for EBCDIC compatibility.
// #define is used to reduce private static exports and memory access time.
#define SET_OPEN        ((UChar)0x005B) /*[*/
#define SET_CLOSE       ((UChar)0x005D) /*]*/
#define HYPHEN          ((UChar)0x002D) /*-*/
#define COMPLEMENT      ((UChar)0x005E) /*^*/
#define COLON           ((UChar)0x003A) /*:*/
#define BACKSLASH       ((UChar)0x005C) /*\*/
#define INTERSECTION    ((UChar)0x0026) /*&*/
#define UPPER_U         ((UChar)0x0055) /*U*/
#define LOWER_U         ((UChar)0x0075) /*u*/
#define OPEN_BRACE      ((UChar)0x007B) /*{*/
#define CLOSE_BRACE     ((UChar)0x007D) /*}*/
#define UPPER_P         ((UChar)0x0050) /*P*/
#define LOWER_P         ((UChar)0x0070) /*p*/
#define UPPER_N         ((UChar)0x004E) /*N*/
#define EQUALS          ((UChar)0x003D) /*=*/
67
68 //static const UChar POSIX_OPEN[] = { SET_OPEN,COLON,0 }; // "[:"
69 static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 }; // ":]"
70 //static const UChar PERL_OPEN[] = { BACKSLASH,LOWER_P,0 }; // "\\p"
71 //static const UChar PERL_CLOSE[] = { CLOSE_BRACE,0 }; // "}"
72 //static const UChar NAME_OPEN[] = { BACKSLASH,UPPER_N,0 }; // "\\N"
73 static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/
74
75 // Special property set IDs
76 static const char ANY[] = "ANY"; // [\u0000-\U0010FFFF]
77 static const char ASCII[] = "ASCII"; // [\u0000-\u007F]
78 static const char ASSIGNED[] = "Assigned"; // [:^Cn:]
79
80 // Unicode name property alias
81 #define NAME_PROP "na"
82 #define NAME_PROP_LENGTH 2
83
84 /**
85 * Delimiter string used in patterns to close a category reference:
86 * ":]". Example: "[:Lu:]".
87 */
88 //static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */
89
90 // Cached sets ------------------------------------------------------------- ***
91
92 U_CDECL_BEGIN
93 static UBool U_CALLCONV uset_cleanup();
94
95 static UnicodeSet *uni32Singleton;
96 static icu::UInitOnce uni32InitOnce = U_INITONCE_INITIALIZER;
97
98 /**
99 * Cleanup function for UnicodeSet
100 */
uset_cleanup(void)101 static UBool U_CALLCONV uset_cleanup(void) {
102 delete uni32Singleton;
103 uni32Singleton = NULL;
104 uni32InitOnce.reset();
105 return TRUE;
106 }
107
108 U_CDECL_END
109
110 U_NAMESPACE_BEGIN
111
112 namespace {
113
114 // Cache some sets for other services -------------------------------------- ***
createUni32Set(UErrorCode & errorCode)115 void U_CALLCONV createUni32Set(UErrorCode &errorCode) {
116 U_ASSERT(uni32Singleton == NULL);
117 uni32Singleton = new UnicodeSet(UNICODE_STRING_SIMPLE("[:age=3.2:]"), errorCode);
118 if(uni32Singleton==NULL) {
119 errorCode=U_MEMORY_ALLOCATION_ERROR;
120 } else {
121 uni32Singleton->freeze();
122 }
123 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
124 }
125
126
127 U_CFUNC UnicodeSet *
uniset_getUnicode32Instance(UErrorCode & errorCode)128 uniset_getUnicode32Instance(UErrorCode &errorCode) {
129 umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode);
130 return uni32Singleton;
131 }
132
133 // helper functions for matching of pattern syntax pieces ------------------ ***
134 // these functions are parallel to the PERL_OPEN etc. strings above
135
136 // using these functions is not only faster than UnicodeString::compare() and
137 // caseCompare(), but they also make UnicodeSet work for simple patterns when
138 // no Unicode properties data is available - when caseCompare() fails
139
140 static inline UBool
isPerlOpen(const UnicodeString & pattern,int32_t pos)141 isPerlOpen(const UnicodeString &pattern, int32_t pos) {
142 UChar c;
143 return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P);
144 }
145
146 /*static inline UBool
147 isPerlClose(const UnicodeString &pattern, int32_t pos) {
148 return pattern.charAt(pos)==CLOSE_BRACE;
149 }*/
150
151 static inline UBool
isNameOpen(const UnicodeString & pattern,int32_t pos)152 isNameOpen(const UnicodeString &pattern, int32_t pos) {
153 return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N;
154 }
155
156 static inline UBool
isPOSIXOpen(const UnicodeString & pattern,int32_t pos)157 isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
158 return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON;
159 }
160
161 /*static inline UBool
162 isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
163 return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE;
164 }*/
165
166 // TODO memory debugging provided inside uniset.cpp
167 // could be made available here but probably obsolete with use of modern
168 // memory leak checker tools
169 #define _dbgct(me)
170
171 } // namespace
172
173 //----------------------------------------------------------------
174 // Constructors &c
175 //----------------------------------------------------------------
176
177 /**
178 * Constructs a set from the given pattern, optionally ignoring
179 * white space. See the class description for the syntax of the
180 * pattern language.
181 * @param pattern a string specifying what characters are in the set
182 */
UnicodeSet(const UnicodeString & pattern,UErrorCode & status)183 UnicodeSet::UnicodeSet(const UnicodeString& pattern,
184 UErrorCode& status) {
185 applyPattern(pattern, status);
186 _dbgct(this);
187 }
188
189 //----------------------------------------------------------------
190 // Public API
191 //----------------------------------------------------------------
192
applyPattern(const UnicodeString & pattern,UErrorCode & status)193 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
194 UErrorCode& status) {
195 // Equivalent to
196 // return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
197 // but without dependency on closeOver().
198 ParsePosition pos(0);
199 applyPatternIgnoreSpace(pattern, pos, NULL, status);
200 if (U_FAILURE(status)) return *this;
201
202 int32_t i = pos.getIndex();
203 // Skip over trailing whitespace
204 ICU_Utility::skipWhitespace(pattern, i, TRUE);
205 if (i != pattern.length()) {
206 status = U_ILLEGAL_ARGUMENT_ERROR;
207 }
208 return *this;
209 }
210
211 void
applyPatternIgnoreSpace(const UnicodeString & pattern,ParsePosition & pos,const SymbolTable * symbols,UErrorCode & status)212 UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,
213 ParsePosition& pos,
214 const SymbolTable* symbols,
215 UErrorCode& status) {
216 if (U_FAILURE(status)) {
217 return;
218 }
219 if (isFrozen()) {
220 status = U_NO_WRITE_PERMISSION;
221 return;
222 }
223 // Need to build the pattern in a temporary string because
224 // _applyPattern calls add() etc., which set pat to empty.
225 UnicodeString rebuiltPat;
226 RuleCharacterIterator chars(pattern, symbols, pos);
227 applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, 0, status);
228 if (U_FAILURE(status)) return;
229 if (chars.inVariable()) {
230 // syntaxError(chars, "Extra chars in variable value");
231 status = U_MALFORMED_SET;
232 return;
233 }
234 setPattern(rebuiltPat);
235 }
236
237 /**
238 * Return true if the given position, in the given pattern, appears
239 * to be the start of a UnicodeSet pattern.
240 */
resemblesPattern(const UnicodeString & pattern,int32_t pos)241 UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
242 return ((pos+1) < pattern.length() &&
243 pattern.charAt(pos) == (UChar)91/*[*/) ||
244 resemblesPropertyPattern(pattern, pos);
245 }
246
247 //----------------------------------------------------------------
248 // Implementation: Pattern parsing
249 //----------------------------------------------------------------
250
251 namespace {
252
253 /**
254 * A small all-inline class to manage a UnicodeSet pointer. Add
255 * operator->() etc. as needed.
256 */
257 class UnicodeSetPointer {
258 UnicodeSet* p;
259 public:
UnicodeSetPointer()260 inline UnicodeSetPointer() : p(0) {}
~UnicodeSetPointer()261 inline ~UnicodeSetPointer() { delete p; }
pointer()262 inline UnicodeSet* pointer() { return p; }
allocate()263 inline UBool allocate() {
264 if (p == 0) {
265 p = new UnicodeSet();
266 }
267 return p != 0;
268 }
269 };
270
271 constexpr int32_t MAX_DEPTH = 100;
272
273 } // namespace
274
275 /**
276 * Parse the pattern from the given RuleCharacterIterator. The
277 * iterator is advanced over the parsed pattern.
278 * @param chars iterator over the pattern characters. Upon return
279 * it will be advanced to the first character after the parsed
280 * pattern, or the end of the iteration if all characters are
281 * parsed.
282 * @param symbols symbol table to use to parse and dereference
283 * variables, or null if none.
284 * @param rebuiltPat the pattern that was parsed, rebuilt or
285 * copied from the input pattern, as appropriate.
286 * @param options a bit mask of zero or more of the following:
287 * IGNORE_SPACE, CASE.
288 */
applyPattern(RuleCharacterIterator & chars,const SymbolTable * symbols,UnicodeString & rebuiltPat,uint32_t options,UnicodeSet & (UnicodeSet::* caseClosure)(int32_t attribute),int32_t depth,UErrorCode & ec)289 void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
290 const SymbolTable* symbols,
291 UnicodeString& rebuiltPat,
292 uint32_t options,
293 UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
294 int32_t depth,
295 UErrorCode& ec) {
296 if (U_FAILURE(ec)) return;
297 if (depth > MAX_DEPTH) {
298 ec = U_ILLEGAL_ARGUMENT_ERROR;
299 return;
300 }
301
302 // Syntax characters: [ ] ^ - & { }
303
304 // Recognized special forms for chars, sets: c-c s-s s&s
305
306 int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
307 RuleCharacterIterator::PARSE_ESCAPES;
308 if ((options & USET_IGNORE_SPACE) != 0) {
309 opts |= RuleCharacterIterator::SKIP_WHITESPACE;
310 }
311
312 UnicodeString patLocal, buf;
313 UBool usePat = FALSE;
314 UnicodeSetPointer scratch;
315 RuleCharacterIterator::Pos backup;
316
317 // mode: 0=before [, 1=between [...], 2=after ]
318 // lastItem: 0=none, 1=char, 2=set
319 int8_t lastItem = 0, mode = 0;
320 UChar32 lastChar = 0;
321 UChar op = 0;
322
323 UBool invert = FALSE;
324
325 clear();
326
327 while (mode != 2 && !chars.atEnd()) {
328 U_ASSERT((lastItem == 0 && op == 0) ||
329 (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) ||
330 (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ ||
331 op == INTERSECTION /*'&'*/)));
332
333 UChar32 c = 0;
334 UBool literal = FALSE;
335 UnicodeSet* nested = 0; // alias - do not delete
336
337 // -------- Check for property pattern
338
339 // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
340 int8_t setMode = 0;
341 if (resemblesPropertyPattern(chars, opts)) {
342 setMode = 2;
343 }
344
345 // -------- Parse '[' of opening delimiter OR nested set.
346 // If there is a nested set, use `setMode' to define how
347 // the set should be parsed. If the '[' is part of the
348 // opening delimiter for this pattern, parse special
349 // strings "[", "[^", "[-", and "[^-". Check for stand-in
350 // characters representing a nested set in the symbol
351 // table.
352
353 else {
354 // Prepare to backup if necessary
355 chars.getPos(backup);
356 c = chars.next(opts, literal, ec);
357 if (U_FAILURE(ec)) return;
358
359 if (c == 0x5B /*'['*/ && !literal) {
360 if (mode == 1) {
361 chars.setPos(backup); // backup
362 setMode = 1;
363 } else {
364 // Handle opening '[' delimiter
365 mode = 1;
366 patLocal.append((UChar) 0x5B /*'['*/);
367 chars.getPos(backup); // prepare to backup
368 c = chars.next(opts, literal, ec);
369 if (U_FAILURE(ec)) return;
370 if (c == 0x5E /*'^'*/ && !literal) {
371 invert = TRUE;
372 patLocal.append((UChar) 0x5E /*'^'*/);
373 chars.getPos(backup); // prepare to backup
374 c = chars.next(opts, literal, ec);
375 if (U_FAILURE(ec)) return;
376 }
377 // Fall through to handle special leading '-';
378 // otherwise restart loop for nested [], \p{}, etc.
379 if (c == HYPHEN /*'-'*/) {
380 literal = TRUE;
381 // Fall through to handle literal '-' below
382 } else {
383 chars.setPos(backup); // backup
384 continue;
385 }
386 }
387 } else if (symbols != 0) {
388 const UnicodeFunctor *m = symbols->lookupMatcher(c);
389 if (m != 0) {
390 const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);
391 if (ms == NULL) {
392 ec = U_MALFORMED_SET;
393 return;
394 }
395 // casting away const, but `nested' won't be modified
396 // (important not to modify stored set)
397 nested = const_cast<UnicodeSet*>(ms);
398 setMode = 3;
399 }
400 }
401 }
402
403 // -------- Handle a nested set. This either is inline in
404 // the pattern or represented by a stand-in that has
405 // previously been parsed and was looked up in the symbol
406 // table.
407
408 if (setMode != 0) {
409 if (lastItem == 1) {
410 if (op != 0) {
411 // syntaxError(chars, "Char expected after operator");
412 ec = U_MALFORMED_SET;
413 return;
414 }
415 add(lastChar, lastChar);
416 _appendToPat(patLocal, lastChar, FALSE);
417 lastItem = 0;
418 op = 0;
419 }
420
421 if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) {
422 patLocal.append(op);
423 }
424
425 if (nested == 0) {
426 // lazy allocation
427 if (!scratch.allocate()) {
428 ec = U_MEMORY_ALLOCATION_ERROR;
429 return;
430 }
431 nested = scratch.pointer();
432 }
433 switch (setMode) {
434 case 1:
435 nested->applyPattern(chars, symbols, patLocal, options, caseClosure, depth + 1, ec);
436 break;
437 case 2:
438 chars.skipIgnored(opts);
439 nested->applyPropertyPattern(chars, patLocal, ec);
440 if (U_FAILURE(ec)) return;
441 break;
442 case 3: // `nested' already parsed
443 nested->_toPattern(patLocal, FALSE);
444 break;
445 }
446
447 usePat = TRUE;
448
449 if (mode == 0) {
450 // Entire pattern is a category; leave parse loop
451 *this = *nested;
452 mode = 2;
453 break;
454 }
455
456 switch (op) {
457 case HYPHEN: /*'-'*/
458 removeAll(*nested);
459 break;
460 case INTERSECTION: /*'&'*/
461 retainAll(*nested);
462 break;
463 case 0:
464 addAll(*nested);
465 break;
466 }
467
468 op = 0;
469 lastItem = 2;
470
471 continue;
472 }
473
474 if (mode == 0) {
475 // syntaxError(chars, "Missing '['");
476 ec = U_MALFORMED_SET;
477 return;
478 }
479
480 // -------- Parse special (syntax) characters. If the
481 // current character is not special, or if it is escaped,
482 // then fall through and handle it below.
483
484 if (!literal) {
485 switch (c) {
486 case 0x5D /*']'*/:
487 if (lastItem == 1) {
488 add(lastChar, lastChar);
489 _appendToPat(patLocal, lastChar, FALSE);
490 }
491 // Treat final trailing '-' as a literal
492 if (op == HYPHEN /*'-'*/) {
493 add(op, op);
494 patLocal.append(op);
495 } else if (op == INTERSECTION /*'&'*/) {
496 // syntaxError(chars, "Trailing '&'");
497 ec = U_MALFORMED_SET;
498 return;
499 }
500 patLocal.append((UChar) 0x5D /*']'*/);
501 mode = 2;
502 continue;
503 case HYPHEN /*'-'*/:
504 if (op == 0) {
505 if (lastItem != 0) {
506 op = (UChar) c;
507 continue;
508 } else {
509 // Treat final trailing '-' as a literal
510 add(c, c);
511 c = chars.next(opts, literal, ec);
512 if (U_FAILURE(ec)) return;
513 if (c == 0x5D /*']'*/ && !literal) {
514 patLocal.append(HYPHEN_RIGHT_BRACE, 2);
515 mode = 2;
516 continue;
517 }
518 }
519 }
520 // syntaxError(chars, "'-' not after char or set");
521 ec = U_MALFORMED_SET;
522 return;
523 case INTERSECTION /*'&'*/:
524 if (lastItem == 2 && op == 0) {
525 op = (UChar) c;
526 continue;
527 }
528 // syntaxError(chars, "'&' not after set");
529 ec = U_MALFORMED_SET;
530 return;
531 case 0x5E /*'^'*/:
532 // syntaxError(chars, "'^' not after '['");
533 ec = U_MALFORMED_SET;
534 return;
535 case 0x7B /*'{'*/:
536 if (op != 0) {
537 // syntaxError(chars, "Missing operand after operator");
538 ec = U_MALFORMED_SET;
539 return;
540 }
541 if (lastItem == 1) {
542 add(lastChar, lastChar);
543 _appendToPat(patLocal, lastChar, FALSE);
544 }
545 lastItem = 0;
546 buf.truncate(0);
547 {
548 UBool ok = FALSE;
549 while (!chars.atEnd()) {
550 c = chars.next(opts, literal, ec);
551 if (U_FAILURE(ec)) return;
552 if (c == 0x7D /*'}'*/ && !literal) {
553 ok = TRUE;
554 break;
555 }
556 buf.append(c);
557 }
558 if (buf.length() < 1 || !ok) {
559 // syntaxError(chars, "Invalid multicharacter string");
560 ec = U_MALFORMED_SET;
561 return;
562 }
563 }
564 // We have new string. Add it to set and continue;
565 // we don't need to drop through to the further
566 // processing
567 add(buf);
568 patLocal.append((UChar) 0x7B /*'{'*/);
569 _appendToPat(patLocal, buf, FALSE);
570 patLocal.append((UChar) 0x7D /*'}'*/);
571 continue;
572 case SymbolTable::SYMBOL_REF:
573 // symbols nosymbols
574 // [a-$] error error (ambiguous)
575 // [a$] anchor anchor
576 // [a-$x] var "x"* literal '$'
577 // [a-$.] error literal '$'
578 // *We won't get here in the case of var "x"
579 {
580 chars.getPos(backup);
581 c = chars.next(opts, literal, ec);
582 if (U_FAILURE(ec)) return;
583 UBool anchor = (c == 0x5D /*']'*/ && !literal);
584 if (symbols == 0 && !anchor) {
585 c = SymbolTable::SYMBOL_REF;
586 chars.setPos(backup);
587 break; // literal '$'
588 }
589 if (anchor && op == 0) {
590 if (lastItem == 1) {
591 add(lastChar, lastChar);
592 _appendToPat(patLocal, lastChar, FALSE);
593 }
594 add(U_ETHER);
595 usePat = TRUE;
596 patLocal.append((UChar) SymbolTable::SYMBOL_REF);
597 patLocal.append((UChar) 0x5D /*']'*/);
598 mode = 2;
599 continue;
600 }
601 // syntaxError(chars, "Unquoted '$'");
602 ec = U_MALFORMED_SET;
603 return;
604 }
605 default:
606 break;
607 }
608 }
609
610 // -------- Parse literal characters. This includes both
611 // escaped chars ("\u4E01") and non-syntax characters
612 // ("a").
613
614 switch (lastItem) {
615 case 0:
616 lastItem = 1;
617 lastChar = c;
618 break;
619 case 1:
620 if (op == HYPHEN /*'-'*/) {
621 if (lastChar >= c) {
622 // Don't allow redundant (a-a) or empty (b-a) ranges;
623 // these are most likely typos.
624 // syntaxError(chars, "Invalid range");
625 ec = U_MALFORMED_SET;
626 return;
627 }
628 add(lastChar, c);
629 _appendToPat(patLocal, lastChar, FALSE);
630 patLocal.append(op);
631 _appendToPat(patLocal, c, FALSE);
632 lastItem = 0;
633 op = 0;
634 } else {
635 add(lastChar, lastChar);
636 _appendToPat(patLocal, lastChar, FALSE);
637 lastChar = c;
638 }
639 break;
640 case 2:
641 if (op != 0) {
642 // syntaxError(chars, "Set expected after operator");
643 ec = U_MALFORMED_SET;
644 return;
645 }
646 lastChar = c;
647 lastItem = 1;
648 break;
649 }
650 }
651
652 if (mode != 2) {
653 // syntaxError(chars, "Missing ']'");
654 ec = U_MALFORMED_SET;
655 return;
656 }
657
658 chars.skipIgnored(opts);
659
660 /**
661 * Handle global flags (invert, case insensitivity). If this
662 * pattern should be compiled case-insensitive, then we need
663 * to close over case BEFORE COMPLEMENTING. This makes
664 * patterns like /[^abc]/i work.
665 */
666 if ((options & USET_CASE_INSENSITIVE) != 0) {
667 (this->*caseClosure)(USET_CASE_INSENSITIVE);
668 }
669 else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
670 (this->*caseClosure)(USET_ADD_CASE_MAPPINGS);
671 }
672 if (invert) {
673 complement();
674 }
675
676 // Use the rebuilt pattern (patLocal) only if necessary. Prefer the
677 // generated pattern.
678 if (usePat) {
679 rebuiltPat.append(patLocal);
680 } else {
681 _generatePattern(rebuiltPat, FALSE);
682 }
683 if (isBogus() && U_SUCCESS(ec)) {
684 // We likely ran out of memory. AHHH!
685 ec = U_MEMORY_ALLOCATION_ERROR;
686 }
687 }
688
689 //----------------------------------------------------------------
690 // Property set implementation
691 //----------------------------------------------------------------
692
693 namespace {
694
numericValueFilter(UChar32 ch,void * context)695 static UBool numericValueFilter(UChar32 ch, void* context) {
696 return u_getNumericValue(ch) == *(double*)context;
697 }
698
generalCategoryMaskFilter(UChar32 ch,void * context)699 static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
700 int32_t value = *(int32_t*)context;
701 return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
702 }
703
versionFilter(UChar32 ch,void * context)704 static UBool versionFilter(UChar32 ch, void* context) {
705 static const UVersionInfo none = { 0, 0, 0, 0 };
706 UVersionInfo v;
707 u_charAge(ch, v);
708 UVersionInfo* version = (UVersionInfo*)context;
709 return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
710 }
711
712 typedef struct {
713 UProperty prop;
714 int32_t value;
715 } IntPropertyContext;
716
intPropertyFilter(UChar32 ch,void * context)717 static UBool intPropertyFilter(UChar32 ch, void* context) {
718 IntPropertyContext* c = (IntPropertyContext*)context;
719 return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
720 }
721
scriptExtensionsFilter(UChar32 ch,void * context)722 static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
723 return uscript_hasScript(ch, *(UScriptCode*)context);
724 }
725
726 } // namespace
727
728 /**
729 * Generic filter-based scanning code for UCD property UnicodeSets.
730 */
applyFilter(UnicodeSet::Filter filter,void * context,const UnicodeSet * inclusions,UErrorCode & status)731 void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
732 void* context,
733 const UnicodeSet* inclusions,
734 UErrorCode &status) {
735 if (U_FAILURE(status)) return;
736
737 // Logically, walk through all Unicode characters, noting the start
738 // and end of each range for which filter.contain(c) is
739 // true. Add each range to a set.
740 //
741 // To improve performance, use an inclusions set which
742 // encodes information about character ranges that are known
743 // to have identical properties.
744 // inclusions contains the first characters of
745 // same-value ranges for the given property.
746
747 clear();
748
749 UChar32 startHasProperty = -1;
750 int32_t limitRange = inclusions->getRangeCount();
751
752 for (int j=0; j<limitRange; ++j) {
753 // get current range
754 UChar32 start = inclusions->getRangeStart(j);
755 UChar32 end = inclusions->getRangeEnd(j);
756
757 // for all the code points in the range, process
758 for (UChar32 ch = start; ch <= end; ++ch) {
759 // only add to this UnicodeSet on inflection points --
760 // where the hasProperty value changes to false
761 if ((*filter)(ch, context)) {
762 if (startHasProperty < 0) {
763 startHasProperty = ch;
764 }
765 } else if (startHasProperty >= 0) {
766 add(startHasProperty, ch-1);
767 startHasProperty = -1;
768 }
769 }
770 }
771 if (startHasProperty >= 0) {
772 add((UChar32)startHasProperty, (UChar32)0x10FFFF);
773 }
774 if (isBogus() && U_SUCCESS(status)) {
775 // We likely ran out of memory. AHHH!
776 status = U_MEMORY_ALLOCATION_ERROR;
777 }
778 }
779
780 namespace {
781
mungeCharName(char * dst,const char * src,int32_t dstCapacity)782 static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
783 /* Note: we use ' ' in compiler code page */
784 int32_t j = 0;
785 char ch;
786 --dstCapacity; /* make room for term. zero */
787 while ((ch = *src++) != 0) {
788 if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
789 continue;
790 }
791 if (j >= dstCapacity) return FALSE;
792 dst[j++] = ch;
793 }
794 if (j > 0 && dst[j-1] == ' ') --j;
795 dst[j] = 0;
796 return TRUE;
797 }
798
799 } // namespace
800
801 //----------------------------------------------------------------
802 // Property set API
803 //----------------------------------------------------------------
804
805 #define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;}
806
807 UnicodeSet&
applyIntPropertyValue(UProperty prop,int32_t value,UErrorCode & ec)808 UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
809 if (U_FAILURE(ec) || isFrozen()) { return *this; }
810 if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
811 const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
812 applyFilter(generalCategoryMaskFilter, &value, inclusions, ec);
813 } else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
814 const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
815 UScriptCode script = (UScriptCode)value;
816 applyFilter(scriptExtensionsFilter, &script, inclusions, ec);
817 } else if (0 <= prop && prop < UCHAR_BINARY_LIMIT) {
818 if (value == 0 || value == 1) {
819 const USet *set = u_getBinaryPropertySet(prop, &ec);
820 if (U_FAILURE(ec)) { return *this; }
821 copyFrom(*UnicodeSet::fromUSet(set), TRUE);
822 if (value == 0) {
823 complement();
824 }
825 } else {
826 clear();
827 }
828 } else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
829 const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
830 IntPropertyContext c = {prop, value};
831 applyFilter(intPropertyFilter, &c, inclusions, ec);
832 } else {
833 ec = U_ILLEGAL_ARGUMENT_ERROR;
834 }
835 return *this;
836 }
837
838 UnicodeSet&
applyPropertyAlias(const UnicodeString & prop,const UnicodeString & value,UErrorCode & ec)839 UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
840 const UnicodeString& value,
841 UErrorCode& ec) {
842 if (U_FAILURE(ec) || isFrozen()) return *this;
843
844 // prop and value used to be converted to char * using the default
845 // converter instead of the invariant conversion.
846 // This should not be necessary because all Unicode property and value
847 // names use only invariant characters.
848 // If there are any variant characters, then we won't find them anyway.
849 // Checking first avoids assertion failures in the conversion.
850 if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
851 !uprv_isInvariantUString(value.getBuffer(), value.length())
852 ) {
853 FAIL(ec);
854 }
855 CharString pname, vname;
856 pname.appendInvariantChars(prop, ec);
857 vname.appendInvariantChars(value, ec);
858 if (U_FAILURE(ec)) return *this;
859
860 UProperty p;
861 int32_t v;
862 UBool invert = FALSE;
863
864 if (value.length() > 0) {
865 p = u_getPropertyEnum(pname.data());
866 if (p == UCHAR_INVALID_CODE) FAIL(ec);
867
868 // Treat gc as gcm
869 if (p == UCHAR_GENERAL_CATEGORY) {
870 p = UCHAR_GENERAL_CATEGORY_MASK;
871 }
872
873 if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
874 (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
875 (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
876 v = u_getPropertyValueEnum(p, vname.data());
877 if (v == UCHAR_INVALID_CODE) {
878 // Handle numeric CCC
879 if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
880 p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
881 p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
882 char* end;
883 double val = uprv_strtod(vname.data(), &end);
884 // Anything between 0 and 255 is valid even if unused.
885 // Cast double->int only after range check.
886 // We catch NaN here because comparing it with both 0 and 255 will be false
887 // (as are all comparisons with NaN).
888 if (*end != 0 || !(0 <= val && val <= 255) ||
889 (v = (int32_t)val) != val) {
890 // non-integral value or outside 0..255, or trailing junk
891 FAIL(ec);
892 }
893 } else {
894 FAIL(ec);
895 }
896 }
897 }
898
899 else {
900
901 switch (p) {
902 case UCHAR_NUMERIC_VALUE:
903 {
904 char* end;
905 double val = uprv_strtod(vname.data(), &end);
906 if (*end != 0) {
907 FAIL(ec);
908 }
909 applyFilter(numericValueFilter, &val,
910 CharacterProperties::getInclusionsForProperty(p, ec), ec);
911 return *this;
912 }
913 case UCHAR_NAME:
914 {
915 // Must munge name, since u_charFromName() does not do
916 // 'loose' matching.
917 char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
918 if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
919 UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);
920 if (U_SUCCESS(ec)) {
921 clear();
922 add(ch);
923 return *this;
924 } else {
925 FAIL(ec);
926 }
927 }
928 case UCHAR_UNICODE_1_NAME:
929 // ICU 49 deprecates the Unicode_1_Name property APIs.
930 FAIL(ec);
931 case UCHAR_AGE:
932 {
933 // Must munge name, since u_versionFromString() does not do
934 // 'loose' matching.
935 char buf[128];
936 if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
937 UVersionInfo version;
938 u_versionFromString(version, buf);
939 applyFilter(versionFilter, &version,
940 CharacterProperties::getInclusionsForProperty(p, ec), ec);
941 return *this;
942 }
943 case UCHAR_SCRIPT_EXTENSIONS:
944 v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());
945 if (v == UCHAR_INVALID_CODE) {
946 FAIL(ec);
947 }
948 // fall through to calling applyIntPropertyValue()
949 break;
950 default:
951 // p is a non-binary, non-enumerated property that we
952 // don't support (yet).
953 FAIL(ec);
954 }
955 }
956 }
957
958 else {
959 // value is empty. Interpret as General Category, Script, or
960 // Binary property.
961 p = UCHAR_GENERAL_CATEGORY_MASK;
962 v = u_getPropertyValueEnum(p, pname.data());
963 if (v == UCHAR_INVALID_CODE) {
964 p = UCHAR_SCRIPT;
965 v = u_getPropertyValueEnum(p, pname.data());
966 if (v == UCHAR_INVALID_CODE) {
967 p = u_getPropertyEnum(pname.data());
968 if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
969 v = 1;
970 } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) {
971 set(MIN_VALUE, MAX_VALUE);
972 return *this;
973 } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) {
974 set(0, 0x7F);
975 return *this;
976 } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) {
977 // [:Assigned:]=[:^Cn:]
978 p = UCHAR_GENERAL_CATEGORY_MASK;
979 v = U_GC_CN_MASK;
980 invert = TRUE;
981 } else {
982 FAIL(ec);
983 }
984 }
985 }
986 }
987
988 applyIntPropertyValue(p, v, ec);
989 if(invert) {
990 complement();
991 }
992
993 if (isBogus() && U_SUCCESS(ec)) {
994 // We likely ran out of memory. AHHH!
995 ec = U_MEMORY_ALLOCATION_ERROR;
996 }
997 return *this;
998 }
999
1000 //----------------------------------------------------------------
1001 // Property set patterns
1002 //----------------------------------------------------------------
1003
1004 /**
1005 * Return true if the given position, in the given pattern, appears
1006 * to be the start of a property set pattern.
1007 */
resemblesPropertyPattern(const UnicodeString & pattern,int32_t pos)1008 UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
1009 int32_t pos) {
1010 // Patterns are at least 5 characters long
1011 if ((pos+5) > pattern.length()) {
1012 return FALSE;
1013 }
1014
1015 // Look for an opening [:, [:^, \p, or \P
1016 return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
1017 }
1018
1019 /**
1020 * Return true if the given iterator appears to point at a
1021 * property pattern. Regardless of the result, return with the
1022 * iterator unchanged.
1023 * @param chars iterator over the pattern characters. Upon return
1024 * it will be unchanged.
1025 * @param iterOpts RuleCharacterIterator options
1026 */
resemblesPropertyPattern(RuleCharacterIterator & chars,int32_t iterOpts)1027 UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
1028 int32_t iterOpts) {
1029 // NOTE: literal will always be FALSE, because we don't parse escapes.
1030 UBool result = FALSE, literal;
1031 UErrorCode ec = U_ZERO_ERROR;
1032 iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
1033 RuleCharacterIterator::Pos pos;
1034 chars.getPos(pos);
1035 UChar32 c = chars.next(iterOpts, literal, ec);
1036 if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) {
1037 UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
1038 literal, ec);
1039 result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) :
1040 (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/);
1041 }
1042 chars.setPos(pos);
1043 return result && U_SUCCESS(ec);
1044 }
1045
1046 /**
1047 * Parse the given property pattern at the given parse position.
1048 */
applyPropertyPattern(const UnicodeString & pattern,ParsePosition & ppos,UErrorCode & ec)1049 UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
1050 ParsePosition& ppos,
1051 UErrorCode &ec) {
1052 int32_t pos = ppos.getIndex();
1053
1054 UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
1055 UBool isName = FALSE; // true for \N{pat}, o/w false
1056 UBool invert = FALSE;
1057
1058 if (U_FAILURE(ec)) return *this;
1059
1060 // Minimum length is 5 characters, e.g. \p{L}
1061 if ((pos+5) > pattern.length()) {
1062 FAIL(ec);
1063 }
1064
1065 // On entry, ppos should point to one of the following locations:
1066 // Look for an opening [:, [:^, \p, or \P
1067 if (isPOSIXOpen(pattern, pos)) {
1068 posix = TRUE;
1069 pos += 2;
1070 pos = ICU_Utility::skipWhitespace(pattern, pos);
1071 if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) {
1072 ++pos;
1073 invert = TRUE;
1074 }
1075 } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {
1076 UChar c = pattern.charAt(pos+1);
1077 invert = (c == UPPER_P);
1078 isName = (c == UPPER_N);
1079 pos += 2;
1080 pos = ICU_Utility::skipWhitespace(pattern, pos);
1081 if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) {
1082 // Syntax error; "\p" or "\P" not followed by "{"
1083 FAIL(ec);
1084 }
1085 } else {
1086 // Open delimiter not seen
1087 FAIL(ec);
1088 }
1089
1090 // Look for the matching close delimiter, either :] or }
1091 int32_t close;
1092 if (posix) {
1093 close = pattern.indexOf(POSIX_CLOSE, 2, pos);
1094 } else {
1095 close = pattern.indexOf(CLOSE_BRACE, pos);
1096 }
1097 if (close < 0) {
1098 // Syntax error; close delimiter missing
1099 FAIL(ec);
1100 }
1101
1102 // Look for an '=' sign. If this is present, we will parse a
1103 // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
1104 // pattern.
1105 int32_t equals = pattern.indexOf(EQUALS, pos);
1106 UnicodeString propName, valueName;
1107 if (equals >= 0 && equals < close && !isName) {
1108 // Equals seen; parse medium/long pattern
1109 pattern.extractBetween(pos, equals, propName);
1110 pattern.extractBetween(equals+1, close, valueName);
1111 }
1112
1113 else {
1114 // Handle case where no '=' is seen, and \N{}
1115 pattern.extractBetween(pos, close, propName);
1116
1117 // Handle \N{name}
1118 if (isName) {
1119 // This is a little inefficient since it means we have to
1120 // parse NAME_PROP back to UCHAR_NAME even though we already
1121 // know it's UCHAR_NAME. If we refactor the API to
1122 // support args of (UProperty, char*) then we can remove
1123 // NAME_PROP and make this a little more efficient.
1124 valueName = propName;
1125 propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);
1126 }
1127 }
1128
1129 applyPropertyAlias(propName, valueName, ec);
1130
1131 if (U_SUCCESS(ec)) {
1132 if (invert) {
1133 complement();
1134 }
1135
1136 // Move to the limit position after the close delimiter if the
1137 // parse succeeded.
1138 ppos.setIndex(close + (posix ? 2 : 1));
1139 }
1140
1141 return *this;
1142 }
1143
1144 /**
1145 * Parse a property pattern.
1146 * @param chars iterator over the pattern characters. Upon return
1147 * it will be advanced to the first character after the parsed
1148 * pattern, or the end of the iteration if all characters are
1149 * parsed.
1150 * @param rebuiltPat the pattern that was parsed, rebuilt or
1151 * copied from the input pattern, as appropriate.
1152 */
applyPropertyPattern(RuleCharacterIterator & chars,UnicodeString & rebuiltPat,UErrorCode & ec)1153 void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
1154 UnicodeString& rebuiltPat,
1155 UErrorCode& ec) {
1156 if (U_FAILURE(ec)) return;
1157 UnicodeString pattern;
1158 chars.lookahead(pattern);
1159 ParsePosition pos(0);
1160 applyPropertyPattern(pattern, pos, ec);
1161 if (U_FAILURE(ec)) return;
1162 if (pos.getIndex() == 0) {
1163 // syntaxError(chars, "Invalid property pattern");
1164 ec = U_MALFORMED_SET;
1165 return;
1166 }
1167 chars.jumpahead(pos.getIndex());
1168 rebuiltPat.append(pattern, 0, pos.getIndex());
1169 }
1170
1171 U_NAMESPACE_END
1172