1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 1999-2014, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: uniset_props.cpp
11 * encoding: UTF-8
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2004aug25
16 * created by: Markus W. Scherer
17 *
18 * Character property dependent functions moved here from uniset.cpp
19 */
20
21 #include "unicode/utypes.h"
22 #include "unicode/uniset.h"
23 #include "unicode/parsepos.h"
24 #include "unicode/uchar.h"
25 #include "unicode/uscript.h"
26 #include "unicode/symtable.h"
27 #include "unicode/uset.h"
28 #include "unicode/locid.h"
29 #include "unicode/brkiter.h"
30 #include "uset_imp.h"
31 #include "ruleiter.h"
32 #include "cmemory.h"
33 #include "ucln_cmn.h"
34 #include "util.h"
35 #include "uvector.h"
36 #include "uprops.h"
37 #include "propname.h"
38 #include "normalizer2impl.h"
39 #include "uinvchar.h"
40 #include "uprops.h"
41 #include "charstr.h"
42 #include "cstring.h"
43 #include "mutex.h"
44 #include "umutex.h"
45 #include "uassert.h"
46 #include "hash.h"
47
48 U_NAMESPACE_USE
49
// Special property set IDs.
// applyPropertyAlias() compares a value-less property name against these
// (using uprv_comparePropertyNames()) after the general category, script and
// binary property lookups have failed.
static const char ANY[] = "ANY"; // [\u0000-\U0010FFFF]
static const char ASCII[] = "ASCII"; // [\u0000-\u007F]
static const char ASSIGNED[] = "Assigned"; // [:^Cn:]

// Unicode name property alias.
// Used to turn \N{name} into the property/value pair (NAME_PROP, name);
// NAME_PROP_LENGTH is the length of the NAME_PROP string.
#define NAME_PROP "na"
#define NAME_PROP_LENGTH 2
58
59 // Cached sets ------------------------------------------------------------- ***
60
61 U_CDECL_BEGIN
62 static UBool U_CALLCONV uset_cleanup();
63
64 static UnicodeSet *uni32Singleton;
65 static icu::UInitOnce uni32InitOnce = U_INITONCE_INITIALIZER;
66
67 /**
68 * Cleanup function for UnicodeSet
69 */
uset_cleanup(void)70 static UBool U_CALLCONV uset_cleanup(void) {
71 delete uni32Singleton;
72 uni32Singleton = NULL;
73 uni32InitOnce.reset();
74 return TRUE;
75 }
76
77 U_CDECL_END
78
79 U_NAMESPACE_BEGIN
80
81 namespace {
82
83 // Cache some sets for other services -------------------------------------- ***
createUni32Set(UErrorCode & errorCode)84 void U_CALLCONV createUni32Set(UErrorCode &errorCode) {
85 U_ASSERT(uni32Singleton == NULL);
86 uni32Singleton = new UnicodeSet(UNICODE_STRING_SIMPLE("[:age=3.2:]"), errorCode);
87 if(uni32Singleton==NULL) {
88 errorCode=U_MEMORY_ALLOCATION_ERROR;
89 } else {
90 uni32Singleton->freeze();
91 }
92 ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
93 }
94
95
96 U_CFUNC UnicodeSet *
uniset_getUnicode32Instance(UErrorCode & errorCode)97 uniset_getUnicode32Instance(UErrorCode &errorCode) {
98 umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode);
99 return uni32Singleton;
100 }
101
// helper functions for matching of pattern syntax pieces ------------------ ***
// these functions recognize the opening delimiters of property patterns
// (\p and \P, \N, and [:); the PERL_OPEN etc. strings they were originally
// parallel to are not defined in this file

// using these functions is not only faster than UnicodeString::compare() and
// caseCompare(), but it also makes UnicodeSet work for simple patterns when
// no Unicode properties data is available - when caseCompare() fails
108
109 static inline UBool
isPerlOpen(const UnicodeString & pattern,int32_t pos)110 isPerlOpen(const UnicodeString &pattern, int32_t pos) {
111 UChar c;
112 return pattern.charAt(pos)==u'\\' && ((c=pattern.charAt(pos+1))==u'p' || c==u'P');
113 }
114
115 /*static inline UBool
116 isPerlClose(const UnicodeString &pattern, int32_t pos) {
117 return pattern.charAt(pos)==u'}';
118 }*/
119
120 static inline UBool
isNameOpen(const UnicodeString & pattern,int32_t pos)121 isNameOpen(const UnicodeString &pattern, int32_t pos) {
122 return pattern.charAt(pos)==u'\\' && pattern.charAt(pos+1)==u'N';
123 }
124
125 static inline UBool
isPOSIXOpen(const UnicodeString & pattern,int32_t pos)126 isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
127 return pattern.charAt(pos)==u'[' && pattern.charAt(pos+1)==u':';
128 }
129
130 /*static inline UBool
131 isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
132 return pattern.charAt(pos)==u':' && pattern.charAt(pos+1)==u']';
133 }*/
134
135 // TODO memory debugging provided inside uniset.cpp
136 // could be made available here but probably obsolete with use of modern
137 // memory leak checker tools
138 #define _dbgct(me)
139
140 } // namespace
141
142 //----------------------------------------------------------------
143 // Constructors &c
144 //----------------------------------------------------------------
145
146 /**
147 * Constructs a set from the given pattern, optionally ignoring
148 * white space. See the class description for the syntax of the
149 * pattern language.
150 * @param pattern a string specifying what characters are in the set
151 */
UnicodeSet(const UnicodeString & pattern,UErrorCode & status)152 UnicodeSet::UnicodeSet(const UnicodeString& pattern,
153 UErrorCode& status) {
154 applyPattern(pattern, status);
155 _dbgct(this);
156 }
157
158 //----------------------------------------------------------------
159 // Public API
160 //----------------------------------------------------------------
161
applyPattern(const UnicodeString & pattern,UErrorCode & status)162 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
163 UErrorCode& status) {
164 // Equivalent to
165 // return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status);
166 // but without dependency on closeOver().
167 ParsePosition pos(0);
168 applyPatternIgnoreSpace(pattern, pos, NULL, status);
169 if (U_FAILURE(status)) return *this;
170
171 int32_t i = pos.getIndex();
172 // Skip over trailing whitespace
173 ICU_Utility::skipWhitespace(pattern, i, TRUE);
174 if (i != pattern.length()) {
175 status = U_ILLEGAL_ARGUMENT_ERROR;
176 }
177 return *this;
178 }
179
180 void
applyPatternIgnoreSpace(const UnicodeString & pattern,ParsePosition & pos,const SymbolTable * symbols,UErrorCode & status)181 UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,
182 ParsePosition& pos,
183 const SymbolTable* symbols,
184 UErrorCode& status) {
185 if (U_FAILURE(status)) {
186 return;
187 }
188 if (isFrozen()) {
189 status = U_NO_WRITE_PERMISSION;
190 return;
191 }
192 // Need to build the pattern in a temporary string because
193 // _applyPattern calls add() etc., which set pat to empty.
194 UnicodeString rebuiltPat;
195 RuleCharacterIterator chars(pattern, symbols, pos);
196 applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, 0, status);
197 if (U_FAILURE(status)) return;
198 if (chars.inVariable()) {
199 // syntaxError(chars, "Extra chars in variable value");
200 status = U_MALFORMED_SET;
201 return;
202 }
203 setPattern(rebuiltPat);
204 }
205
206 /**
207 * Return true if the given position, in the given pattern, appears
208 * to be the start of a UnicodeSet pattern.
209 */
resemblesPattern(const UnicodeString & pattern,int32_t pos)210 UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
211 return ((pos+1) < pattern.length() &&
212 pattern.charAt(pos) == (UChar)91/*[*/) ||
213 resemblesPropertyPattern(pattern, pos);
214 }
215
216 //----------------------------------------------------------------
217 // Implementation: Pattern parsing
218 //----------------------------------------------------------------
219
220 namespace {
221
222 /**
223 * A small all-inline class to manage a UnicodeSet pointer. Add
224 * operator->() etc. as needed.
225 */
226 class UnicodeSetPointer {
227 UnicodeSet* p;
228 public:
UnicodeSetPointer()229 inline UnicodeSetPointer() : p(0) {}
~UnicodeSetPointer()230 inline ~UnicodeSetPointer() { delete p; }
pointer()231 inline UnicodeSet* pointer() { return p; }
allocate()232 inline UBool allocate() {
233 if (p == 0) {
234 p = new UnicodeSet();
235 }
236 return p != 0;
237 }
238 };
239
// Maximum nesting depth accepted by the recursive applyPattern() parser;
// a larger depth is rejected with U_ILLEGAL_ARGUMENT_ERROR.
constexpr int32_t MAX_DEPTH = 100;
241
242 } // namespace
243
/**
 * Parse the pattern from the given RuleCharacterIterator. The
 * iterator is advanced over the parsed pattern.
 *
 * The set is cleared before parsing starts. On any error, ec is set and
 * the function returns immediately; the set then holds whatever had been
 * added up to that point.
 *
 * @param chars iterator over the pattern characters. Upon return
 * it will be advanced to the first character after the parsed
 * pattern, or the end of the iteration if all characters are
 * parsed.
 * @param symbols symbol table to use to parse and dereference
 * variables, or null if none.
 * @param rebuiltPat the pattern that was parsed, rebuilt or
 * copied from the input pattern, as appropriate. The result is
 * appended to this string.
 * @param options a bit mask of zero or more of the following:
 * USET_IGNORE_SPACE, USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS.
 * @param caseClosure member function called on this set when one of the
 * case options is set; it is invoked without a null check.
 * @param depth current nesting depth; nested sets are parsed with
 * depth + 1, and a depth above MAX_DEPTH yields U_ILLEGAL_ARGUMENT_ERROR.
 * @param ec in/out error code. Syntax errors are reported as
 * U_MALFORMED_SET.
 */
void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
                              const SymbolTable* symbols,
                              UnicodeString& rebuiltPat,
                              uint32_t options,
                              UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
                              int32_t depth,
                              UErrorCode& ec) {
    if (U_FAILURE(ec)) return;
    if (depth > MAX_DEPTH) {
        ec = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    // Syntax characters: [ ] ^ - & { }

    // Recognized special forms for chars, sets: c-c s-s s&s

    int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
                   RuleCharacterIterator::PARSE_ESCAPES;
    if ((options & USET_IGNORE_SPACE) != 0) {
        opts |= RuleCharacterIterator::SKIP_WHITESPACE;
    }

    // patLocal accumulates the pattern as it is parsed; buf collects the
    // contents of a {multi-character string}.
    UnicodeString patLocal, buf;
    // usePat: TRUE once patLocal must be used instead of a generated pattern
    UBool usePat = FALSE;
    // scratch owns the set used for nested sets that are parsed inline
    UnicodeSetPointer scratch;
    RuleCharacterIterator::Pos backup;

    // mode: 0=before [, 1=between [...], 2=after ]
    // lastItem: 0=none, 1=char, 2=set
    int8_t lastItem = 0, mode = 0;
    UChar32 lastChar = 0;
    // op: pending operator, 0 (none), '-' or '&'
    UChar op = 0;

    UBool invert = FALSE;

    clear();

    while (mode != 2 && !chars.atEnd()) {
        U_ASSERT((lastItem == 0 && op == 0) ||
                 (lastItem == 1 && (op == 0 || op == u'-')) ||
                 (lastItem == 2 && (op == 0 || op == u'-' || op == u'&')));

        UChar32 c = 0;
        UBool literal = FALSE;
        UnicodeSet* nested = 0; // alias - do not delete

        // -------- Check for property pattern

        // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
        int8_t setMode = 0;
        if (resemblesPropertyPattern(chars, opts)) {
            setMode = 2;
        }

        // -------- Parse '[' of opening delimiter OR nested set.
        // If there is a nested set, use `setMode' to define how
        // the set should be parsed. If the '[' is part of the
        // opening delimiter for this pattern, parse special
        // strings "[", "[^", "[-", and "[^-". Check for stand-in
        // characters representing a nested set in the symbol
        // table.

        else {
            // Prepare to backup if necessary
            chars.getPos(backup);
            c = chars.next(opts, literal, ec);
            if (U_FAILURE(ec)) return;

            if (c == u'[' && !literal) {
                if (mode == 1) {
                    // '[' inside the brackets starts a nested set; rewind
                    // so that the recursive call sees the '['.
                    chars.setPos(backup); // backup
                    setMode = 1;
                } else {
                    // Handle opening '[' delimiter
                    mode = 1;
                    patLocal.append(u'[');
                    chars.getPos(backup); // prepare to backup
                    c = chars.next(opts, literal, ec);
                    if (U_FAILURE(ec)) return;
                    if (c == u'^' && !literal) {
                        invert = TRUE;
                        patLocal.append(u'^');
                        chars.getPos(backup); // prepare to backup
                        c = chars.next(opts, literal, ec);
                        if (U_FAILURE(ec)) return;
                    }
                    // Fall through to handle special leading '-';
                    // otherwise restart loop for nested [], \p{}, etc.
                    if (c == u'-') {
                        literal = TRUE;
                        // Fall through to handle literal '-' below
                    } else {
                        chars.setPos(backup); // backup
                        continue;
                    }
                }
            } else if (symbols != 0) {
                const UnicodeFunctor *m = symbols->lookupMatcher(c);
                if (m != 0) {
                    // Only matchers that are UnicodeSets are accepted.
                    const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);
                    if (ms == NULL) {
                        ec = U_MALFORMED_SET;
                        return;
                    }
                    // casting away const, but `nested' won't be modified
                    // (important not to modify stored set)
                    nested = const_cast<UnicodeSet*>(ms);
                    setMode = 3;
                }
            }
        }

        // -------- Handle a nested set. This either is inline in
        // the pattern or represented by a stand-in that has
        // previously been parsed and was looked up in the symbol
        // table.

        if (setMode != 0) {
            if (lastItem == 1) {
                if (op != 0) {
                    // syntaxError(chars, "Char expected after operator");
                    ec = U_MALFORMED_SET;
                    return;
                }
                // Flush the pending single character before the set.
                add(lastChar, lastChar);
                _appendToPat(patLocal, lastChar, FALSE);
                lastItem = 0;
                op = 0;
            }

            if (op == u'-' || op == u'&') {
                patLocal.append(op);
            }

            if (nested == 0) {
                // lazy allocation
                if (!scratch.allocate()) {
                    ec = U_MEMORY_ALLOCATION_ERROR;
                    return;
                }
                nested = scratch.pointer();
            }
            switch (setMode) {
            case 1:
                nested->applyPattern(chars, symbols, patLocal, options, caseClosure, depth + 1, ec);
                break;
            case 2:
                chars.skipIgnored(opts);
                nested->applyPropertyPattern(chars, patLocal, ec);
                if (U_FAILURE(ec)) return;
                break;
            case 3: // `nested' already parsed
                nested->_toPattern(patLocal, FALSE);
                break;
            }

            usePat = TRUE;

            if (mode == 0) {
                // Entire pattern is a category; leave parse loop
                *this = *nested;
                mode = 2;
                break;
            }

            // Combine the nested set with this set according to the
            // pending operator.
            switch (op) {
            case u'-':
                removeAll(*nested);
                break;
            case u'&':
                retainAll(*nested);
                break;
            case 0:
                addAll(*nested);
                break;
            }

            op = 0;
            lastItem = 2;

            continue;
        }

        if (mode == 0) {
            // syntaxError(chars, "Missing '['");
            ec = U_MALFORMED_SET;
            return;
        }

        // -------- Parse special (syntax) characters. If the
        // current character is not special, or if it is escaped,
        // then fall through and handle it below.

        if (!literal) {
            switch (c) {
            case u']':
                if (lastItem == 1) {
                    add(lastChar, lastChar);
                    _appendToPat(patLocal, lastChar, FALSE);
                }
                // Treat final trailing '-' as a literal
                if (op == u'-') {
                    add(op, op);
                    patLocal.append(op);
                } else if (op == u'&') {
                    // syntaxError(chars, "Trailing '&'");
                    ec = U_MALFORMED_SET;
                    return;
                }
                patLocal.append(u']');
                mode = 2;
                continue;
            case u'-':
                if (op == 0) {
                    if (lastItem != 0) {
                        op = (UChar) c;
                        continue;
                    } else {
                        // No pending item: accept '-' as a literal only
                        // if it is immediately followed by the closing ']'
                        add(c, c);
                        c = chars.next(opts, literal, ec);
                        if (U_FAILURE(ec)) return;
                        if (c == u']' && !literal) {
                            patLocal.append(u"-]", 2);
                            mode = 2;
                            continue;
                        }
                    }
                }
                // syntaxError(chars, "'-' not after char or set");
                ec = U_MALFORMED_SET;
                return;
            case u'&':
                if (lastItem == 2 && op == 0) {
                    op = (UChar) c;
                    continue;
                }
                // syntaxError(chars, "'&' not after set");
                ec = U_MALFORMED_SET;
                return;
            case u'^':
                // syntaxError(chars, "'^' not after '['");
                ec = U_MALFORMED_SET;
                return;
            case u'{':
                if (op != 0) {
                    // syntaxError(chars, "Missing operand after operator");
                    ec = U_MALFORMED_SET;
                    return;
                }
                if (lastItem == 1) {
                    add(lastChar, lastChar);
                    _appendToPat(patLocal, lastChar, FALSE);
                }
                lastItem = 0;
                buf.truncate(0);
                {
                    // Collect characters up to the unescaped '}'.
                    UBool ok = FALSE;
                    while (!chars.atEnd()) {
                        c = chars.next(opts, literal, ec);
                        if (U_FAILURE(ec)) return;
                        if (c == u'}' && !literal) {
                            ok = TRUE;
                            break;
                        }
                        buf.append(c);
                    }
                    if (!ok) {
                        // syntaxError(chars, "Invalid multicharacter string");
                        ec = U_MALFORMED_SET;
                        return;
                    }
                }
                // We have new string. Add it to set and continue;
                // we don't need to drop through to the further
                // processing
                add(buf);
                patLocal.append(u'{');
                _appendToPat(patLocal, buf, FALSE);
                patLocal.append(u'}');
                continue;
            case SymbolTable::SYMBOL_REF:
                //         symbols  nosymbols
                // [a-$]   error    error (ambiguous)
                // [a$]    anchor   anchor
                // [a-$x]  var "x"* literal '$'
                // [a-$.]  error    literal '$'
                // *We won't get here in the case of var "x"
                {
                    chars.getPos(backup);
                    c = chars.next(opts, literal, ec);
                    if (U_FAILURE(ec)) return;
                    UBool anchor = (c == u']' && !literal);
                    if (symbols == 0 && !anchor) {
                        c = SymbolTable::SYMBOL_REF;
                        chars.setPos(backup);
                        break; // literal '$'
                    }
                    if (anchor && op == 0) {
                        if (lastItem == 1) {
                            add(lastChar, lastChar);
                            _appendToPat(patLocal, lastChar, FALSE);
                        }
                        add(U_ETHER);
                        usePat = TRUE;
                        patLocal.append((UChar) SymbolTable::SYMBOL_REF);
                        patLocal.append(u']');
                        mode = 2;
                        continue;
                    }
                    // syntaxError(chars, "Unquoted '$'");
                    ec = U_MALFORMED_SET;
                    return;
                }
            default:
                break;
            }
        }

        // -------- Parse literal characters. This includes both
        // escaped chars ("\u4E01") and non-syntax characters
        // ("a").

        switch (lastItem) {
        case 0:
            lastItem = 1;
            lastChar = c;
            break;
        case 1:
            if (op == u'-') {
                if (lastChar >= c) {
                    // Don't allow redundant (a-a) or empty (b-a) ranges;
                    // these are most likely typos.
                    // syntaxError(chars, "Invalid range");
                    ec = U_MALFORMED_SET;
                    return;
                }
                add(lastChar, c);
                _appendToPat(patLocal, lastChar, FALSE);
                patLocal.append(op);
                _appendToPat(patLocal, c, FALSE);
                lastItem = 0;
                op = 0;
            } else {
                // Two characters in a row: flush the first one and keep
                // the new one pending.
                add(lastChar, lastChar);
                _appendToPat(patLocal, lastChar, FALSE);
                lastChar = c;
            }
            break;
        case 2:
            if (op != 0) {
                // syntaxError(chars, "Set expected after operator");
                ec = U_MALFORMED_SET;
                return;
            }
            lastChar = c;
            lastItem = 1;
            break;
        }
    }

    if (mode != 2) {
        // syntaxError(chars, "Missing ']'");
        ec = U_MALFORMED_SET;
        return;
    }

    chars.skipIgnored(opts);

    /**
     * Handle global flags (invert, case insensitivity). If this
     * pattern should be compiled case-insensitive, then we need
     * to close over case BEFORE COMPLEMENTING. This makes
     * patterns like /[^abc]/i work.
     */
    if ((options & USET_CASE_INSENSITIVE) != 0) {
        (this->*caseClosure)(USET_CASE_INSENSITIVE);
    }
    else if ((options & USET_ADD_CASE_MAPPINGS) != 0) {
        (this->*caseClosure)(USET_ADD_CASE_MAPPINGS);
    }
    if (invert) {
        complement().removeAllStrings(); // code point complement
    }

    // Use the rebuilt pattern (patLocal) only if necessary. Prefer the
    // generated pattern.
    if (usePat) {
        rebuiltPat.append(patLocal);
    } else {
        _generatePattern(rebuiltPat, FALSE);
    }
    if (isBogus() && U_SUCCESS(ec)) {
        // We likely ran out of memory. AHHH!
        ec = U_MEMORY_ALLOCATION_ERROR;
    }
}
656
657 //----------------------------------------------------------------
658 // Property set implementation
659 //----------------------------------------------------------------
660
661 namespace {
662
numericValueFilter(UChar32 ch,void * context)663 static UBool numericValueFilter(UChar32 ch, void* context) {
664 return u_getNumericValue(ch) == *(double*)context;
665 }
666
generalCategoryMaskFilter(UChar32 ch,void * context)667 static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
668 int32_t value = *(int32_t*)context;
669 return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
670 }
671
versionFilter(UChar32 ch,void * context)672 static UBool versionFilter(UChar32 ch, void* context) {
673 static const UVersionInfo none = { 0, 0, 0, 0 };
674 UVersionInfo v;
675 u_charAge(ch, v);
676 UVersionInfo* version = (UVersionInfo*)context;
677 return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
678 }
679
680 typedef struct {
681 UProperty prop;
682 int32_t value;
683 } IntPropertyContext;
684
intPropertyFilter(UChar32 ch,void * context)685 static UBool intPropertyFilter(UChar32 ch, void* context) {
686 IntPropertyContext* c = (IntPropertyContext*)context;
687 return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
688 }
689
scriptExtensionsFilter(UChar32 ch,void * context)690 static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
691 return uscript_hasScript(ch, *(UScriptCode*)context);
692 }
693
694 } // namespace
695
696 /**
697 * Generic filter-based scanning code for UCD property UnicodeSets.
698 */
applyFilter(UnicodeSet::Filter filter,void * context,const UnicodeSet * inclusions,UErrorCode & status)699 void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
700 void* context,
701 const UnicodeSet* inclusions,
702 UErrorCode &status) {
703 if (U_FAILURE(status)) return;
704
705 // Logically, walk through all Unicode characters, noting the start
706 // and end of each range for which filter.contain(c) is
707 // true. Add each range to a set.
708 //
709 // To improve performance, use an inclusions set which
710 // encodes information about character ranges that are known
711 // to have identical properties.
712 // inclusions contains the first characters of
713 // same-value ranges for the given property.
714
715 clear();
716
717 UChar32 startHasProperty = -1;
718 int32_t limitRange = inclusions->getRangeCount();
719
720 for (int j=0; j<limitRange; ++j) {
721 // get current range
722 UChar32 start = inclusions->getRangeStart(j);
723 UChar32 end = inclusions->getRangeEnd(j);
724
725 // for all the code points in the range, process
726 for (UChar32 ch = start; ch <= end; ++ch) {
727 // only add to this UnicodeSet on inflection points --
728 // where the hasProperty value changes to false
729 if ((*filter)(ch, context)) {
730 if (startHasProperty < 0) {
731 startHasProperty = ch;
732 }
733 } else if (startHasProperty >= 0) {
734 add(startHasProperty, ch-1);
735 startHasProperty = -1;
736 }
737 }
738 }
739 if (startHasProperty >= 0) {
740 add((UChar32)startHasProperty, (UChar32)0x10FFFF);
741 }
742 if (isBogus() && U_SUCCESS(status)) {
743 // We likely ran out of memory. AHHH!
744 status = U_MEMORY_ALLOCATION_ERROR;
745 }
746 }
747
748 namespace {
749
mungeCharName(char * dst,const char * src,int32_t dstCapacity)750 static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
751 /* Note: we use ' ' in compiler code page */
752 int32_t j = 0;
753 char ch;
754 --dstCapacity; /* make room for term. zero */
755 while ((ch = *src++) != 0) {
756 if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
757 continue;
758 }
759 if (j >= dstCapacity) return FALSE;
760 dst[j++] = ch;
761 }
762 if (j > 0 && dst[j-1] == ' ') --j;
763 dst[j] = 0;
764 return TRUE;
765 }
766
767 } // namespace
768
769 //----------------------------------------------------------------
770 // Property set API
771 //----------------------------------------------------------------
772
// Sets ec to U_ILLEGAL_ARGUMENT_ERROR and returns *this from the
// enclosing function. It can therefore only be used inside member
// functions that return UnicodeSet&.
#define FAIL(ec) UPRV_BLOCK_MACRO_BEGIN { \
    ec=U_ILLEGAL_ARGUMENT_ERROR; \
    return *this; \
} UPRV_BLOCK_MACRO_END
777
778 UnicodeSet&
applyIntPropertyValue(UProperty prop,int32_t value,UErrorCode & ec)779 UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
780 if (U_FAILURE(ec) || isFrozen()) { return *this; }
781 if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
782 const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
783 applyFilter(generalCategoryMaskFilter, &value, inclusions, ec);
784 } else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
785 const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
786 UScriptCode script = (UScriptCode)value;
787 applyFilter(scriptExtensionsFilter, &script, inclusions, ec);
788 } else if (0 <= prop && prop < UCHAR_BINARY_LIMIT) {
789 if (value == 0 || value == 1) {
790 const USet *set = u_getBinaryPropertySet(prop, &ec);
791 if (U_FAILURE(ec)) { return *this; }
792 copyFrom(*UnicodeSet::fromUSet(set), TRUE);
793 if (value == 0) {
794 complement().removeAllStrings(); // code point complement
795 }
796 } else {
797 clear();
798 }
799 } else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
800 const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
801 IntPropertyContext c = {prop, value};
802 applyFilter(intPropertyFilter, &c, inclusions, ec);
803 } else {
804 ec = U_ILLEGAL_ARGUMENT_ERROR;
805 }
806 return *this;
807 }
808
/**
 * Replaces the contents of this set with the set selected by a property
 * name and a property value name.
 *
 * With a non-empty value, prop is looked up as a property name and value
 * as one of its value names (or as a number / character name / version
 * string for the properties handled specially below). With an empty
 * value, prop is tried in turn as a general category value, a script
 * value, a binary property name, and one of the special IDs ANY, ASCII
 * and Assigned.
 *
 * Does nothing if ec already indicates a failure or if the set is frozen.
 *
 * @param prop property name, or a stand-alone value name if value is empty
 * @param value property value name, may be empty
 * @param ec in/out error code; U_ILLEGAL_ARGUMENT_ERROR for names that
 *           are not recognized or contain non-invariant characters
 * @return *this
 */
UnicodeSet&
UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
                               const UnicodeString& value,
                               UErrorCode& ec) {
    if (U_FAILURE(ec) || isFrozen()) return *this;

    // prop and value used to be converted to char * using the default
    // converter instead of the invariant conversion.
    // This should not be necessary because all Unicode property and value
    // names use only invariant characters.
    // If there are any variant characters, then we won't find them anyway.
    // Checking first avoids assertion failures in the conversion.
    if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
        !uprv_isInvariantUString(value.getBuffer(), value.length())
    ) {
        FAIL(ec);
    }
    CharString pname, vname;
    pname.appendInvariantChars(prop, ec);
    vname.appendInvariantChars(value, ec);
    if (U_FAILURE(ec)) return *this;

    // p and v are filled in by the branches below; every path that reaches
    // applyIntPropertyValue() at the end has assigned both.
    UProperty p;
    int32_t v;
    UBool invert = FALSE;

    if (value.length() > 0) {
        p = u_getPropertyEnum(pname.data());
        if (p == UCHAR_INVALID_CODE) FAIL(ec);

        // Treat gc as gcm
        if (p == UCHAR_GENERAL_CATEGORY) {
            p = UCHAR_GENERAL_CATEGORY_MASK;
        }

        if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
            (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
            (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
            v = u_getPropertyValueEnum(p, vname.data());
            if (v == UCHAR_INVALID_CODE) {
                // Handle numeric CCC
                if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
                    p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
                    p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
                    char* end;
                    double val = uprv_strtod(vname.data(), &end);
                    // Anything between 0 and 255 is valid even if unused.
                    // Cast double->int only after range check.
                    // We catch NaN here because comparing it with both 0 and 255 will be false
                    // (as are all comparisons with NaN).
                    if (*end != 0 || !(0 <= val && val <= 255) ||
                        (v = (int32_t)val) != val) {
                        // non-integral value or outside 0..255, or trailing junk
                        FAIL(ec);
                    }
                } else {
                    FAIL(ec);
                }
            }
        }

        else {

            switch (p) {
            case UCHAR_NUMERIC_VALUE:
                {
                    // The whole value must parse as a number.
                    char* end;
                    double val = uprv_strtod(vname.data(), &end);
                    if (*end != 0) {
                        FAIL(ec);
                    }
                    applyFilter(numericValueFilter, &val,
                                CharacterProperties::getInclusionsForProperty(p, ec), ec);
                    return *this;
                }
            case UCHAR_NAME:
                {
                    // Must munge name, since u_charFromName() does not do
                    // 'loose' matching.
                    char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
                    if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
                    UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);
                    if (U_SUCCESS(ec)) {
                        clear();
                        add(ch);
                        return *this;
                    } else {
                        // Any lookup error is reported as an illegal argument.
                        FAIL(ec);
                    }
                }
            case UCHAR_UNICODE_1_NAME:
                // ICU 49 deprecates the Unicode_1_Name property APIs.
                FAIL(ec);
            case UCHAR_AGE:
                {
                    // Must munge name, since u_versionFromString() does not do
                    // 'loose' matching.
                    char buf[128];
                    if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
                    UVersionInfo version;
                    u_versionFromString(version, buf);
                    applyFilter(versionFilter, &version,
                                CharacterProperties::getInclusionsForProperty(p, ec), ec);
                    return *this;
                }
            case UCHAR_SCRIPT_EXTENSIONS:
                // The value names are those of the Script property.
                v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());
                if (v == UCHAR_INVALID_CODE) {
                    FAIL(ec);
                }
                // fall through to calling applyIntPropertyValue()
                break;
            default:
                // p is a non-binary, non-enumerated property that we
                // don't support (yet).
                FAIL(ec);
            }
        }
    }

    else {
        // value is empty. Interpret as General Category, Script, or
        // Binary property.
        p = UCHAR_GENERAL_CATEGORY_MASK;
        v = u_getPropertyValueEnum(p, pname.data());
        if (v == UCHAR_INVALID_CODE) {
            p = UCHAR_SCRIPT;
            v = u_getPropertyValueEnum(p, pname.data());
            if (v == UCHAR_INVALID_CODE) {
                p = u_getPropertyEnum(pname.data());
                if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
                    v = 1;
                } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) {
                    set(MIN_VALUE, MAX_VALUE);
                    return *this;
                } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) {
                    set(0, 0x7F);
                    return *this;
                } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) {
                    // [:Assigned:]=[:^Cn:]
                    p = UCHAR_GENERAL_CATEGORY_MASK;
                    v = U_GC_CN_MASK;
                    invert = TRUE;
                } else {
                    FAIL(ec);
                }
            }
        }
    }

    applyIntPropertyValue(p, v, ec);
    if(invert) {
        complement().removeAllStrings(); // code point complement
    }

    if (isBogus() && U_SUCCESS(ec)) {
        // We likely ran out of memory. AHHH!
        ec = U_MEMORY_ALLOCATION_ERROR;
    }
    return *this;
}
970
971 //----------------------------------------------------------------
972 // Property set patterns
973 //----------------------------------------------------------------
974
975 /**
976 * Return true if the given position, in the given pattern, appears
977 * to be the start of a property set pattern.
978 */
resemblesPropertyPattern(const UnicodeString & pattern,int32_t pos)979 UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
980 int32_t pos) {
981 // Patterns are at least 5 characters long
982 if ((pos+5) > pattern.length()) {
983 return FALSE;
984 }
985
986 // Look for an opening [:, [:^, \p, or \P
987 return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
988 }
989
990 /**
991 * Return true if the given iterator appears to point at a
992 * property pattern. Regardless of the result, return with the
993 * iterator unchanged.
994 * @param chars iterator over the pattern characters. Upon return
995 * it will be unchanged.
996 * @param iterOpts RuleCharacterIterator options
997 */
resemblesPropertyPattern(RuleCharacterIterator & chars,int32_t iterOpts)998 UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
999 int32_t iterOpts) {
1000 // NOTE: literal will always be FALSE, because we don't parse escapes.
1001 UBool result = FALSE, literal;
1002 UErrorCode ec = U_ZERO_ERROR;
1003 iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
1004 RuleCharacterIterator::Pos pos;
1005 chars.getPos(pos);
1006 UChar32 c = chars.next(iterOpts, literal, ec);
1007 if (c == u'[' || c == u'\\') {
1008 UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
1009 literal, ec);
1010 result = (c == u'[') ? (d == u':') :
1011 (d == u'N' || d == u'p' || d == u'P');
1012 }
1013 chars.setPos(pos);
1014 return result && U_SUCCESS(ec);
1015 }
1016
1017 /**
1018 * Parse the given property pattern at the given parse position.
1019 */
applyPropertyPattern(const UnicodeString & pattern,ParsePosition & ppos,UErrorCode & ec)1020 UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
1021 ParsePosition& ppos,
1022 UErrorCode &ec) {
1023 int32_t pos = ppos.getIndex();
1024
1025 UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
1026 UBool isName = FALSE; // true for \N{pat}, o/w false
1027 UBool invert = FALSE;
1028
1029 if (U_FAILURE(ec)) return *this;
1030
1031 // Minimum length is 5 characters, e.g. \p{L}
1032 if ((pos+5) > pattern.length()) {
1033 FAIL(ec);
1034 }
1035
1036 // On entry, ppos should point to one of the following locations:
1037 // Look for an opening [:, [:^, \p, or \P
1038 if (isPOSIXOpen(pattern, pos)) {
1039 posix = TRUE;
1040 pos += 2;
1041 pos = ICU_Utility::skipWhitespace(pattern, pos);
1042 if (pos < pattern.length() && pattern.charAt(pos) == u'^') {
1043 ++pos;
1044 invert = TRUE;
1045 }
1046 } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {
1047 UChar c = pattern.charAt(pos+1);
1048 invert = (c == u'P');
1049 isName = (c == u'N');
1050 pos += 2;
1051 pos = ICU_Utility::skipWhitespace(pattern, pos);
1052 if (pos == pattern.length() || pattern.charAt(pos++) != u'{') {
1053 // Syntax error; "\p" or "\P" not followed by "{"
1054 FAIL(ec);
1055 }
1056 } else {
1057 // Open delimiter not seen
1058 FAIL(ec);
1059 }
1060
1061 // Look for the matching close delimiter, either :] or }
1062 int32_t close;
1063 if (posix) {
1064 close = pattern.indexOf(u":]", 2, pos);
1065 } else {
1066 close = pattern.indexOf(u'}', pos);
1067 }
1068 if (close < 0) {
1069 // Syntax error; close delimiter missing
1070 FAIL(ec);
1071 }
1072
1073 // Look for an '=' sign. If this is present, we will parse a
1074 // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
1075 // pattern.
1076 int32_t equals = pattern.indexOf(u'=', pos);
1077 UnicodeString propName, valueName;
1078 if (equals >= 0 && equals < close && !isName) {
1079 // Equals seen; parse medium/long pattern
1080 pattern.extractBetween(pos, equals, propName);
1081 pattern.extractBetween(equals+1, close, valueName);
1082 }
1083
1084 else {
1085 // Handle case where no '=' is seen, and \N{}
1086 pattern.extractBetween(pos, close, propName);
1087
1088 // Handle \N{name}
1089 if (isName) {
1090 // This is a little inefficient since it means we have to
1091 // parse NAME_PROP back to UCHAR_NAME even though we already
1092 // know it's UCHAR_NAME. If we refactor the API to
1093 // support args of (UProperty, char*) then we can remove
1094 // NAME_PROP and make this a little more efficient.
1095 valueName = propName;
1096 propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);
1097 }
1098 }
1099
1100 applyPropertyAlias(propName, valueName, ec);
1101
1102 if (U_SUCCESS(ec)) {
1103 if (invert) {
1104 complement().removeAllStrings(); // code point complement
1105 }
1106
1107 // Move to the limit position after the close delimiter if the
1108 // parse succeeded.
1109 ppos.setIndex(close + (posix ? 2 : 1));
1110 }
1111
1112 return *this;
1113 }
1114
1115 /**
1116 * Parse a property pattern.
1117 * @param chars iterator over the pattern characters. Upon return
1118 * it will be advanced to the first character after the parsed
1119 * pattern, or the end of the iteration if all characters are
1120 * parsed.
1121 * @param rebuiltPat the pattern that was parsed, rebuilt or
1122 * copied from the input pattern, as appropriate.
1123 */
applyPropertyPattern(RuleCharacterIterator & chars,UnicodeString & rebuiltPat,UErrorCode & ec)1124 void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
1125 UnicodeString& rebuiltPat,
1126 UErrorCode& ec) {
1127 if (U_FAILURE(ec)) return;
1128 UnicodeString pattern;
1129 chars.lookahead(pattern);
1130 ParsePosition pos(0);
1131 applyPropertyPattern(pattern, pos, ec);
1132 if (U_FAILURE(ec)) return;
1133 if (pos.getIndex() == 0) {
1134 // syntaxError(chars, "Invalid property pattern");
1135 ec = U_MALFORMED_SET;
1136 return;
1137 }
1138 chars.jumpahead(pos.getIndex());
1139 rebuiltPat.append(pattern, 0, pos.getIndex());
1140 }
1141
1142 U_NAMESPACE_END
1143