• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 1999-2014, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  uniset_props.cpp
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2004aug25
16 *   created by: Markus W. Scherer
17 *
18 *   Character property dependent functions moved here from uniset.cpp
19 */
20 
21 #include "unicode/utypes.h"
22 #include "unicode/uniset.h"
23 #include "unicode/parsepos.h"
24 #include "unicode/uchar.h"
25 #include "unicode/uscript.h"
26 #include "unicode/symtable.h"
27 #include "unicode/uset.h"
28 #include "unicode/locid.h"
29 #include "unicode/brkiter.h"
30 #include "uset_imp.h"
31 #include "ruleiter.h"
32 #include "cmemory.h"
33 #include "ucln_cmn.h"
34 #include "util.h"
35 #include "uvector.h"
36 #include "uprops.h"
37 #include "propname.h"
38 #include "normalizer2impl.h"
39 #include "uinvchar.h"
40 #include "uprops.h"
41 #include "charstr.h"
42 #include "cstring.h"
43 #include "mutex.h"
44 #include "umutex.h"
45 #include "uassert.h"
46 #include "hash.h"
47 
48 U_NAMESPACE_USE
49 
50 // Special property set IDs
51 static const char ANY[]   = "ANY";   // [\u0000-\U0010FFFF]
52 static const char ASCII[] = "ASCII"; // [\u0000-\u007F]
53 static const char ASSIGNED[] = "Assigned"; // [:^Cn:]
54 
55 // Unicode name property alias
56 #define NAME_PROP "na"
57 #define NAME_PROP_LENGTH 2
58 
59 // Cached sets ------------------------------------------------------------- ***
60 
61 U_CDECL_BEGIN
62 static UBool U_CALLCONV uset_cleanup();
63 
64 static UnicodeSet *uni32Singleton;
65 static icu::UInitOnce uni32InitOnce {};
66 
67 /**
68  * Cleanup function for UnicodeSet
69  */
uset_cleanup()70 static UBool U_CALLCONV uset_cleanup() {
71     delete uni32Singleton;
72     uni32Singleton = nullptr;
73     uni32InitOnce.reset();
74     return true;
75 }
76 
77 U_CDECL_END
78 
79 U_NAMESPACE_BEGIN
80 
81 namespace {
82 
83 // Cache some sets for other services -------------------------------------- ***
createUni32Set(UErrorCode & errorCode)84 void U_CALLCONV createUni32Set(UErrorCode &errorCode) {
85     U_ASSERT(uni32Singleton == nullptr);
86     uni32Singleton = new UnicodeSet(UNICODE_STRING_SIMPLE("[:age=3.2:]"), errorCode);
87     if(uni32Singleton==nullptr) {
88         errorCode=U_MEMORY_ALLOCATION_ERROR;
89     } else {
90         uni32Singleton->freeze();
91     }
92     ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup);
93 }
94 
95 
96 U_CFUNC UnicodeSet *
uniset_getUnicode32Instance(UErrorCode & errorCode)97 uniset_getUnicode32Instance(UErrorCode &errorCode) {
98     umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode);
99     return uni32Singleton;
100 }
101 
102 // helper functions for matching of pattern syntax pieces ------------------ ***
103 // these functions are parallel to the PERL_OPEN etc. strings above
104 
105 // using these functions is not only faster than UnicodeString::compare() and
106 // caseCompare(), but they also make UnicodeSet work for simple patterns when
107 // no Unicode properties data is available - when caseCompare() fails
108 
109 static inline UBool
isPerlOpen(const UnicodeString & pattern,int32_t pos)110 isPerlOpen(const UnicodeString &pattern, int32_t pos) {
111     char16_t c;
112     return pattern.charAt(pos)==u'\\' && ((c=pattern.charAt(pos+1))==u'p' || c==u'P');
113 }
114 
115 /*static inline UBool
116 isPerlClose(const UnicodeString &pattern, int32_t pos) {
117     return pattern.charAt(pos)==u'}';
118 }*/
119 
120 static inline UBool
isNameOpen(const UnicodeString & pattern,int32_t pos)121 isNameOpen(const UnicodeString &pattern, int32_t pos) {
122     return pattern.charAt(pos)==u'\\' && pattern.charAt(pos+1)==u'N';
123 }
124 
125 static inline UBool
isPOSIXOpen(const UnicodeString & pattern,int32_t pos)126 isPOSIXOpen(const UnicodeString &pattern, int32_t pos) {
127     return pattern.charAt(pos)==u'[' && pattern.charAt(pos+1)==u':';
128 }
129 
130 /*static inline UBool
131 isPOSIXClose(const UnicodeString &pattern, int32_t pos) {
132     return pattern.charAt(pos)==u':' && pattern.charAt(pos+1)==u']';
133 }*/
134 
135 // TODO memory debugging provided inside uniset.cpp
136 // could be made available here but probably obsolete with use of modern
137 // memory leak checker tools
138 #define _dbgct(me)
139 
140 }  // namespace
141 
142 //----------------------------------------------------------------
143 // Constructors &c
144 //----------------------------------------------------------------
145 
146 /**
147  * Constructs a set from the given pattern, optionally ignoring
148  * white space.  See the class description for the syntax of the
149  * pattern language.
150  * @param pattern a string specifying what characters are in the set
151  */
UnicodeSet(const UnicodeString & pattern,UErrorCode & status)152 UnicodeSet::UnicodeSet(const UnicodeString& pattern,
153                        UErrorCode& status) {
154     applyPattern(pattern, status);
155     _dbgct(this);
156 }
157 
158 //----------------------------------------------------------------
159 // Public API
160 //----------------------------------------------------------------
161 
applyPattern(const UnicodeString & pattern,UErrorCode & status)162 UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern,
163                                      UErrorCode& status) {
164     // Equivalent to
165     //   return applyPattern(pattern, USET_IGNORE_SPACE, nullptr, status);
166     // but without dependency on closeOver().
167     ParsePosition pos(0);
168     applyPatternIgnoreSpace(pattern, pos, nullptr, status);
169     if (U_FAILURE(status)) return *this;
170 
171     int32_t i = pos.getIndex();
172     // Skip over trailing whitespace
173     ICU_Utility::skipWhitespace(pattern, i, true);
174     if (i != pattern.length()) {
175         status = U_ILLEGAL_ARGUMENT_ERROR;
176     }
177     return *this;
178 }
179 
180 void
applyPatternIgnoreSpace(const UnicodeString & pattern,ParsePosition & pos,const SymbolTable * symbols,UErrorCode & status)181 UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern,
182                                     ParsePosition& pos,
183                                     const SymbolTable* symbols,
184                                     UErrorCode& status) {
185     if (U_FAILURE(status)) {
186         return;
187     }
188     if (isFrozen()) {
189         status = U_NO_WRITE_PERMISSION;
190         return;
191     }
192     // Need to build the pattern in a temporary string because
193     // _applyPattern calls add() etc., which set pat to empty.
194     UnicodeString rebuiltPat;
195     RuleCharacterIterator chars(pattern, symbols, pos);
196     applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, nullptr, 0, status);
197     if (U_FAILURE(status)) return;
198     if (chars.inVariable()) {
199         // syntaxError(chars, "Extra chars in variable value");
200         status = U_MALFORMED_SET;
201         return;
202     }
203     setPattern(rebuiltPat);
204 }
205 
206 /**
207  * Return true if the given position, in the given pattern, appears
208  * to be the start of a UnicodeSet pattern.
209  */
resemblesPattern(const UnicodeString & pattern,int32_t pos)210 UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) {
211     return ((pos+1) < pattern.length() &&
212             pattern.charAt(pos) == (char16_t)91/*[*/) ||
213         resemblesPropertyPattern(pattern, pos);
214 }
215 
216 //----------------------------------------------------------------
217 // Implementation: Pattern parsing
218 //----------------------------------------------------------------
219 
220 namespace {
221 
222 /**
223  * A small all-inline class to manage a UnicodeSet pointer.  Add
224  * operator->() etc. as needed.
225  */
226 class UnicodeSetPointer {
227     UnicodeSet* p;
228 public:
UnicodeSetPointer()229     inline UnicodeSetPointer() : p(0) {}
~UnicodeSetPointer()230     inline ~UnicodeSetPointer() { delete p; }
pointer()231     inline UnicodeSet* pointer() { return p; }
allocate()232     inline UBool allocate() {
233         if (p == 0) {
234             p = new UnicodeSet();
235         }
236         return p != 0;
237     }
238 };
239 
240 constexpr int32_t MAX_DEPTH = 100;
241 
242 }  // namespace
243 
244 /**
245  * Parse the pattern from the given RuleCharacterIterator.  The
246  * iterator is advanced over the parsed pattern.
247  * @param chars iterator over the pattern characters.  Upon return
248  * it will be advanced to the first character after the parsed
249  * pattern, or the end of the iteration if all characters are
250  * parsed.
251  * @param symbols symbol table to use to parse and dereference
252  * variables, or null if none.
253  * @param rebuiltPat the pattern that was parsed, rebuilt or
254  * copied from the input pattern, as appropriate.
255  * @param options a bit mask of zero or more of the following:
256  * IGNORE_SPACE, CASE.
257  */
applyPattern(RuleCharacterIterator & chars,const SymbolTable * symbols,UnicodeString & rebuiltPat,uint32_t options,UnicodeSet & (UnicodeSet::* caseClosure)(int32_t attribute),int32_t depth,UErrorCode & ec)258 void UnicodeSet::applyPattern(RuleCharacterIterator& chars,
259                               const SymbolTable* symbols,
260                               UnicodeString& rebuiltPat,
261                               uint32_t options,
262                               UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
263                               int32_t depth,
264                               UErrorCode& ec) {
265     if (U_FAILURE(ec)) return;
266     if (depth > MAX_DEPTH) {
267         ec = U_ILLEGAL_ARGUMENT_ERROR;
268         return;
269     }
270 
271     // Syntax characters: [ ] ^ - & { }
272 
273     // Recognized special forms for chars, sets: c-c s-s s&s
274 
275     int32_t opts = RuleCharacterIterator::PARSE_VARIABLES |
276                    RuleCharacterIterator::PARSE_ESCAPES;
277     if ((options & USET_IGNORE_SPACE) != 0) {
278         opts |= RuleCharacterIterator::SKIP_WHITESPACE;
279     }
280 
281     UnicodeString patLocal, buf;
282     UBool usePat = false;
283     UnicodeSetPointer scratch;
284     RuleCharacterIterator::Pos backup;
285 
286     // mode: 0=before [, 1=between [...], 2=after ]
287     // lastItem: 0=none, 1=char, 2=set
288     int8_t lastItem = 0, mode = 0;
289     UChar32 lastChar = 0;
290     char16_t op = 0;
291 
292     UBool invert = false;
293 
294     clear();
295 
296     while (mode != 2 && !chars.atEnd()) {
297         U_ASSERT((lastItem == 0 && op == 0) ||
298                  (lastItem == 1 && (op == 0 || op == u'-')) ||
299                  (lastItem == 2 && (op == 0 || op == u'-' || op == u'&')));
300 
301         UChar32 c = 0;
302         UBool literal = false;
303         UnicodeSet* nested = 0; // alias - do not delete
304 
305         // -------- Check for property pattern
306 
307         // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed
308         int8_t setMode = 0;
309         if (resemblesPropertyPattern(chars, opts)) {
310             setMode = 2;
311         }
312 
313         // -------- Parse '[' of opening delimiter OR nested set.
314         // If there is a nested set, use `setMode' to define how
315         // the set should be parsed.  If the '[' is part of the
316         // opening delimiter for this pattern, parse special
317         // strings "[", "[^", "[-", and "[^-".  Check for stand-in
318         // characters representing a nested set in the symbol
319         // table.
320 
321         else {
322             // Prepare to backup if necessary
323             chars.getPos(backup);
324             c = chars.next(opts, literal, ec);
325             if (U_FAILURE(ec)) return;
326 
327             if (c == u'[' && !literal) {
328                 if (mode == 1) {
329                     chars.setPos(backup); // backup
330                     setMode = 1;
331                 } else {
332                     // Handle opening '[' delimiter
333                     mode = 1;
334                     patLocal.append(u'[');
335                     chars.getPos(backup); // prepare to backup
336                     c = chars.next(opts, literal, ec);
337                     if (U_FAILURE(ec)) return;
338                     if (c == u'^' && !literal) {
339                         invert = true;
340                         patLocal.append(u'^');
341                         chars.getPos(backup); // prepare to backup
342                         c = chars.next(opts, literal, ec);
343                         if (U_FAILURE(ec)) return;
344                     }
345                     // Fall through to handle special leading '-';
346                     // otherwise restart loop for nested [], \p{}, etc.
347                     if (c == u'-') {
348                         literal = true;
349                         // Fall through to handle literal '-' below
350                     } else {
351                         chars.setPos(backup); // backup
352                         continue;
353                     }
354                 }
355             } else if (symbols != 0) {
356                 const UnicodeFunctor *m = symbols->lookupMatcher(c);
357                 if (m != 0) {
358                     const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m);
359                     if (ms == nullptr) {
360                         ec = U_MALFORMED_SET;
361                         return;
362                     }
363                     // casting away const, but `nested' won't be modified
364                     // (important not to modify stored set)
365                     nested = const_cast<UnicodeSet*>(ms);
366                     setMode = 3;
367                 }
368             }
369         }
370 
371         // -------- Handle a nested set.  This either is inline in
372         // the pattern or represented by a stand-in that has
373         // previously been parsed and was looked up in the symbol
374         // table.
375 
376         if (setMode != 0) {
377             if (lastItem == 1) {
378                 if (op != 0) {
379                     // syntaxError(chars, "Char expected after operator");
380                     ec = U_MALFORMED_SET;
381                     return;
382                 }
383                 add(lastChar, lastChar);
384                 _appendToPat(patLocal, lastChar, false);
385                 lastItem = 0;
386                 op = 0;
387             }
388 
389             if (op == u'-' || op == u'&') {
390                 patLocal.append(op);
391             }
392 
393             if (nested == 0) {
394                 // lazy allocation
395                 if (!scratch.allocate()) {
396                     ec = U_MEMORY_ALLOCATION_ERROR;
397                     return;
398                 }
399                 nested = scratch.pointer();
400             }
401             switch (setMode) {
402             case 1:
403                 nested->applyPattern(chars, symbols, patLocal, options, caseClosure, depth + 1, ec);
404                 break;
405             case 2:
406                 chars.skipIgnored(opts);
407                 nested->applyPropertyPattern(chars, patLocal, ec);
408                 if (U_FAILURE(ec)) return;
409                 break;
410             case 3: // `nested' already parsed
411                 nested->_toPattern(patLocal, false);
412                 break;
413             }
414 
415             usePat = true;
416 
417             if (mode == 0) {
418                 // Entire pattern is a category; leave parse loop
419                 *this = *nested;
420                 mode = 2;
421                 break;
422             }
423 
424             switch (op) {
425             case u'-':
426                 removeAll(*nested);
427                 break;
428             case u'&':
429                 retainAll(*nested);
430                 break;
431             case 0:
432                 addAll(*nested);
433                 break;
434             }
435 
436             op = 0;
437             lastItem = 2;
438 
439             continue;
440         }
441 
442         if (mode == 0) {
443             // syntaxError(chars, "Missing '['");
444             ec = U_MALFORMED_SET;
445             return;
446         }
447 
448         // -------- Parse special (syntax) characters.  If the
449         // current character is not special, or if it is escaped,
450         // then fall through and handle it below.
451 
452         if (!literal) {
453             switch (c) {
454             case u']':
455                 if (lastItem == 1) {
456                     add(lastChar, lastChar);
457                     _appendToPat(patLocal, lastChar, false);
458                 }
459                 // Treat final trailing '-' as a literal
460                 if (op == u'-') {
461                     add(op, op);
462                     patLocal.append(op);
463                 } else if (op == u'&') {
464                     // syntaxError(chars, "Trailing '&'");
465                     ec = U_MALFORMED_SET;
466                     return;
467                 }
468                 patLocal.append(u']');
469                 mode = 2;
470                 continue;
471             case u'-':
472                 if (op == 0) {
473                     if (lastItem != 0) {
474                         op = (char16_t) c;
475                         continue;
476                     } else {
477                         // Treat final trailing '-' as a literal
478                         add(c, c);
479                         c = chars.next(opts, literal, ec);
480                         if (U_FAILURE(ec)) return;
481                         if (c == u']' && !literal) {
482                             patLocal.append(u"-]", 2);
483                             mode = 2;
484                             continue;
485                         }
486                     }
487                 }
488                 // syntaxError(chars, "'-' not after char or set");
489                 ec = U_MALFORMED_SET;
490                 return;
491             case u'&':
492                 if (lastItem == 2 && op == 0) {
493                     op = (char16_t) c;
494                     continue;
495                 }
496                 // syntaxError(chars, "'&' not after set");
497                 ec = U_MALFORMED_SET;
498                 return;
499             case u'^':
500                 // syntaxError(chars, "'^' not after '['");
501                 ec = U_MALFORMED_SET;
502                 return;
503             case u'{':
504                 if (op != 0) {
505                     // syntaxError(chars, "Missing operand after operator");
506                     ec = U_MALFORMED_SET;
507                     return;
508                 }
509                 if (lastItem == 1) {
510                     add(lastChar, lastChar);
511                     _appendToPat(patLocal, lastChar, false);
512                 }
513                 lastItem = 0;
514                 buf.truncate(0);
515                 {
516                     UBool ok = false;
517                     while (!chars.atEnd()) {
518                         c = chars.next(opts, literal, ec);
519                         if (U_FAILURE(ec)) return;
520                         if (c == u'}' && !literal) {
521                             ok = true;
522                             break;
523                         }
524                         buf.append(c);
525                     }
526                     if (!ok) {
527                         // syntaxError(chars, "Invalid multicharacter string");
528                         ec = U_MALFORMED_SET;
529                         return;
530                     }
531                 }
532                 // We have new string. Add it to set and continue;
533                 // we don't need to drop through to the further
534                 // processing
535                 add(buf);
536                 patLocal.append(u'{');
537                 _appendToPat(patLocal, buf, false);
538                 patLocal.append(u'}');
539                 continue;
540             case SymbolTable::SYMBOL_REF:
541                 //         symbols  nosymbols
542                 // [a-$]   error    error (ambiguous)
543                 // [a$]    anchor   anchor
544                 // [a-$x]  var "x"* literal '$'
545                 // [a-$.]  error    literal '$'
546                 // *We won't get here in the case of var "x"
547                 {
548                     chars.getPos(backup);
549                     c = chars.next(opts, literal, ec);
550                     if (U_FAILURE(ec)) return;
551                     UBool anchor = (c == u']' && !literal);
552                     if (symbols == 0 && !anchor) {
553                         c = SymbolTable::SYMBOL_REF;
554                         chars.setPos(backup);
555                         break; // literal '$'
556                     }
557                     if (anchor && op == 0) {
558                         if (lastItem == 1) {
559                             add(lastChar, lastChar);
560                             _appendToPat(patLocal, lastChar, false);
561                         }
562                         add(U_ETHER);
563                         usePat = true;
564                         patLocal.append((char16_t) SymbolTable::SYMBOL_REF);
565                         patLocal.append(u']');
566                         mode = 2;
567                         continue;
568                     }
569                     // syntaxError(chars, "Unquoted '$'");
570                     ec = U_MALFORMED_SET;
571                     return;
572                 }
573             default:
574                 break;
575             }
576         }
577 
578         // -------- Parse literal characters.  This includes both
579         // escaped chars ("\u4E01") and non-syntax characters
580         // ("a").
581 
582         switch (lastItem) {
583         case 0:
584             lastItem = 1;
585             lastChar = c;
586             break;
587         case 1:
588             if (op == u'-') {
589                 if (lastChar >= c) {
590                     // Don't allow redundant (a-a) or empty (b-a) ranges;
591                     // these are most likely typos.
592                     // syntaxError(chars, "Invalid range");
593                     ec = U_MALFORMED_SET;
594                     return;
595                 }
596                 add(lastChar, c);
597                 _appendToPat(patLocal, lastChar, false);
598                 patLocal.append(op);
599                 _appendToPat(patLocal, c, false);
600                 lastItem = 0;
601                 op = 0;
602             } else {
603                 add(lastChar, lastChar);
604                 _appendToPat(patLocal, lastChar, false);
605                 lastChar = c;
606             }
607             break;
608         case 2:
609             if (op != 0) {
610                 // syntaxError(chars, "Set expected after operator");
611                 ec = U_MALFORMED_SET;
612                 return;
613             }
614             lastChar = c;
615             lastItem = 1;
616             break;
617         }
618     }
619 
620     if (mode != 2) {
621         // syntaxError(chars, "Missing ']'");
622         ec = U_MALFORMED_SET;
623         return;
624     }
625 
626     chars.skipIgnored(opts);
627 
628     /**
629      * Handle global flags (invert, case insensitivity).  If this
630      * pattern should be compiled case-insensitive, then we need
631      * to close over case BEFORE COMPLEMENTING.  This makes
632      * patterns like /[^abc]/i work.
633      */
634     if ((options & USET_CASE_MASK) != 0) {
635         (this->*caseClosure)(options);
636     }
637     if (invert) {
638         complement().removeAllStrings();  // code point complement
639     }
640 
641     // Use the rebuilt pattern (patLocal) only if necessary.  Prefer the
642     // generated pattern.
643     if (usePat) {
644         rebuiltPat.append(patLocal);
645     } else {
646         _generatePattern(rebuiltPat, false);
647     }
648     if (isBogus() && U_SUCCESS(ec)) {
649         // We likely ran out of memory. AHHH!
650         ec = U_MEMORY_ALLOCATION_ERROR;
651     }
652 }
653 
654 //----------------------------------------------------------------
655 // Property set implementation
656 //----------------------------------------------------------------
657 
658 namespace {
659 
numericValueFilter(UChar32 ch,void * context)660 static UBool numericValueFilter(UChar32 ch, void* context) {
661     return u_getNumericValue(ch) == *(double*)context;
662 }
663 
generalCategoryMaskFilter(UChar32 ch,void * context)664 static UBool generalCategoryMaskFilter(UChar32 ch, void* context) {
665     int32_t value = *(int32_t*)context;
666     return (U_GET_GC_MASK((UChar32) ch) & value) != 0;
667 }
668 
versionFilter(UChar32 ch,void * context)669 static UBool versionFilter(UChar32 ch, void* context) {
670     static const UVersionInfo none = { 0, 0, 0, 0 };
671     UVersionInfo v;
672     u_charAge(ch, v);
673     UVersionInfo* version = (UVersionInfo*)context;
674     return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0;
675 }
676 
677 typedef struct {
678     UProperty prop;
679     int32_t value;
680 } IntPropertyContext;
681 
intPropertyFilter(UChar32 ch,void * context)682 static UBool intPropertyFilter(UChar32 ch, void* context) {
683     IntPropertyContext* c = (IntPropertyContext*)context;
684     return u_getIntPropertyValue((UChar32) ch, c->prop) == c->value;
685 }
686 
scriptExtensionsFilter(UChar32 ch,void * context)687 static UBool scriptExtensionsFilter(UChar32 ch, void* context) {
688     return uscript_hasScript(ch, *(UScriptCode*)context);
689 }
690 
691 }  // namespace
692 
693 /**
694  * Generic filter-based scanning code for UCD property UnicodeSets.
695  */
applyFilter(UnicodeSet::Filter filter,void * context,const UnicodeSet * inclusions,UErrorCode & status)696 void UnicodeSet::applyFilter(UnicodeSet::Filter filter,
697                              void* context,
698                              const UnicodeSet* inclusions,
699                              UErrorCode &status) {
700     if (U_FAILURE(status)) return;
701 
702     // Logically, walk through all Unicode characters, noting the start
703     // and end of each range for which filter.contain(c) is
704     // true.  Add each range to a set.
705     //
706     // To improve performance, use an inclusions set which
707     // encodes information about character ranges that are known
708     // to have identical properties.
709     // inclusions contains the first characters of
710     // same-value ranges for the given property.
711 
712     clear();
713 
714     UChar32 startHasProperty = -1;
715     int32_t limitRange = inclusions->getRangeCount();
716 
717     for (int j=0; j<limitRange; ++j) {
718         // get current range
719         UChar32 start = inclusions->getRangeStart(j);
720         UChar32 end = inclusions->getRangeEnd(j);
721 
722         // for all the code points in the range, process
723         for (UChar32 ch = start; ch <= end; ++ch) {
724             // only add to this UnicodeSet on inflection points --
725             // where the hasProperty value changes to false
726             if ((*filter)(ch, context)) {
727                 if (startHasProperty < 0) {
728                     startHasProperty = ch;
729                 }
730             } else if (startHasProperty >= 0) {
731                 add(startHasProperty, ch-1);
732                 startHasProperty = -1;
733             }
734         }
735     }
736     if (startHasProperty >= 0) {
737         add((UChar32)startHasProperty, (UChar32)0x10FFFF);
738     }
739     if (isBogus() && U_SUCCESS(status)) {
740         // We likely ran out of memory. AHHH!
741         status = U_MEMORY_ALLOCATION_ERROR;
742     }
743 }
744 
745 namespace {
746 
mungeCharName(char * dst,const char * src,int32_t dstCapacity)747 static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) {
748     /* Note: we use ' ' in compiler code page */
749     int32_t j = 0;
750     char ch;
751     --dstCapacity; /* make room for term. zero */
752     while ((ch = *src++) != 0) {
753         if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) {
754             continue;
755         }
756         if (j >= dstCapacity) return false;
757         dst[j++] = ch;
758     }
759     if (j > 0 && dst[j-1] == ' ') --j;
760     dst[j] = 0;
761     return true;
762 }
763 
764 }  // namespace
765 
766 //----------------------------------------------------------------
767 // Property set API
768 //----------------------------------------------------------------
769 
770 #define FAIL(ec) UPRV_BLOCK_MACRO_BEGIN { \
771     ec=U_ILLEGAL_ARGUMENT_ERROR; \
772     return *this; \
773 } UPRV_BLOCK_MACRO_END
774 
775 UnicodeSet&
applyIntPropertyValue(UProperty prop,int32_t value,UErrorCode & ec)776 UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) {
777     if (U_FAILURE(ec) || isFrozen()) { return *this; }
778     if (prop == UCHAR_GENERAL_CATEGORY_MASK) {
779         const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
780         applyFilter(generalCategoryMaskFilter, &value, inclusions, ec);
781     } else if (prop == UCHAR_SCRIPT_EXTENSIONS) {
782         const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
783         UScriptCode script = (UScriptCode)value;
784         applyFilter(scriptExtensionsFilter, &script, inclusions, ec);
785     } else if (0 <= prop && prop < UCHAR_BINARY_LIMIT) {
786         if (value == 0 || value == 1) {
787             const USet *set = u_getBinaryPropertySet(prop, &ec);
788             if (U_FAILURE(ec)) { return *this; }
789             copyFrom(*UnicodeSet::fromUSet(set), true);
790             if (value == 0) {
791                 complement().removeAllStrings();  // code point complement
792             }
793         } else {
794             clear();
795         }
796     } else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) {
797         const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec);
798         IntPropertyContext c = {prop, value};
799         applyFilter(intPropertyFilter, &c, inclusions, ec);
800     } else {
801         ec = U_ILLEGAL_ARGUMENT_ERROR;
802     }
803     return *this;
804 }
805 
806 UnicodeSet&
applyPropertyAlias(const UnicodeString & prop,const UnicodeString & value,UErrorCode & ec)807 UnicodeSet::applyPropertyAlias(const UnicodeString& prop,
808                                const UnicodeString& value,
809                                UErrorCode& ec) {
810     if (U_FAILURE(ec) || isFrozen()) return *this;
811 
812     // prop and value used to be converted to char * using the default
813     // converter instead of the invariant conversion.
814     // This should not be necessary because all Unicode property and value
815     // names use only invariant characters.
816     // If there are any variant characters, then we won't find them anyway.
817     // Checking first avoids assertion failures in the conversion.
818     if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) ||
819         !uprv_isInvariantUString(value.getBuffer(), value.length())
820     ) {
821         FAIL(ec);
822     }
823     CharString pname, vname;
824     pname.appendInvariantChars(prop, ec);
825     vname.appendInvariantChars(value, ec);
826     if (U_FAILURE(ec)) return *this;
827 
828     UProperty p;
829     int32_t v;
830     UBool invert = false;
831 
832     if (value.length() > 0) {
833         p = u_getPropertyEnum(pname.data());
834         if (p == UCHAR_INVALID_CODE) FAIL(ec);
835 
836         // Treat gc as gcm
837         if (p == UCHAR_GENERAL_CATEGORY) {
838             p = UCHAR_GENERAL_CATEGORY_MASK;
839         }
840 
841         if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) ||
842             (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) ||
843             (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) {
844             v = u_getPropertyValueEnum(p, vname.data());
845             if (v == UCHAR_INVALID_CODE) {
846                 // Handle numeric CCC
847                 if (p == UCHAR_CANONICAL_COMBINING_CLASS ||
848                     p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS ||
849                     p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) {
850                     char* end;
851                     double val = uprv_strtod(vname.data(), &end);
852                     // Anything between 0 and 255 is valid even if unused.
853                     // Cast double->int only after range check.
854                     // We catch NaN here because comparing it with both 0 and 255 will be false
855                     // (as are all comparisons with NaN).
856                     if (*end != 0 || !(0 <= val && val <= 255) ||
857                             (v = (int32_t)val) != val) {
858                         // non-integral value or outside 0..255, or trailing junk
859                         FAIL(ec);
860                     }
861                 } else {
862                     FAIL(ec);
863                 }
864             }
865         }
866 
867         else {
868 
869             switch (p) {
870             case UCHAR_NUMERIC_VALUE:
871                 {
872                     char* end;
873                     double val = uprv_strtod(vname.data(), &end);
874                     if (*end != 0) {
875                         FAIL(ec);
876                     }
877                     applyFilter(numericValueFilter, &val,
878                                 CharacterProperties::getInclusionsForProperty(p, ec), ec);
879                     return *this;
880                 }
881             case UCHAR_NAME:
882                 {
883                     // Must munge name, since u_charFromName() does not do
884                     // 'loose' matching.
885                     char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength
886                     if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
887                     UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec);
888                     if (U_SUCCESS(ec)) {
889                         clear();
890                         add(ch);
891                         return *this;
892                     } else {
893                         FAIL(ec);
894                     }
895                 }
896             case UCHAR_UNICODE_1_NAME:
897                 // ICU 49 deprecates the Unicode_1_Name property APIs.
898                 FAIL(ec);
899             case UCHAR_AGE:
900                 {
901                     // Must munge name, since u_versionFromString() does not do
902                     // 'loose' matching.
903                     char buf[128];
904                     if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec);
905                     UVersionInfo version;
906                     u_versionFromString(version, buf);
907                     applyFilter(versionFilter, &version,
908                                 CharacterProperties::getInclusionsForProperty(p, ec), ec);
909                     return *this;
910                 }
911             case UCHAR_SCRIPT_EXTENSIONS:
912                 v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data());
913                 if (v == UCHAR_INVALID_CODE) {
914                     FAIL(ec);
915                 }
916                 // fall through to calling applyIntPropertyValue()
917                 break;
918             default:
919                 // p is a non-binary, non-enumerated property that we
920                 // don't support (yet).
921                 FAIL(ec);
922             }
923         }
924     }
925 
926     else {
927         // value is empty.  Interpret as General Category, Script, or
928         // Binary property.
929         p = UCHAR_GENERAL_CATEGORY_MASK;
930         v = u_getPropertyValueEnum(p, pname.data());
931         if (v == UCHAR_INVALID_CODE) {
932             p = UCHAR_SCRIPT;
933             v = u_getPropertyValueEnum(p, pname.data());
934             if (v == UCHAR_INVALID_CODE) {
935                 p = u_getPropertyEnum(pname.data());
936                 if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) {
937                     v = 1;
938                 } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) {
939                     set(MIN_VALUE, MAX_VALUE);
940                     return *this;
941                 } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) {
942                     set(0, 0x7F);
943                     return *this;
944                 } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) {
945                     // [:Assigned:]=[:^Cn:]
946                     p = UCHAR_GENERAL_CATEGORY_MASK;
947                     v = U_GC_CN_MASK;
948                     invert = true;
949                 } else {
950                     FAIL(ec);
951                 }
952             }
953         }
954     }
955 
956     applyIntPropertyValue(p, v, ec);
957     if(invert) {
958         complement().removeAllStrings();  // code point complement
959     }
960 
961     if (isBogus() && U_SUCCESS(ec)) {
962         // We likely ran out of memory. AHHH!
963         ec = U_MEMORY_ALLOCATION_ERROR;
964     }
965     return *this;
966 }
967 
968 //----------------------------------------------------------------
969 // Property set patterns
970 //----------------------------------------------------------------
971 
972 /**
973  * Return true if the given position, in the given pattern, appears
974  * to be the start of a property set pattern.
975  */
resemblesPropertyPattern(const UnicodeString & pattern,int32_t pos)976 UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern,
977                                            int32_t pos) {
978     // Patterns are at least 5 characters long
979     if ((pos+5) > pattern.length()) {
980         return false;
981     }
982 
983     // Look for an opening [:, [:^, \p, or \P
984     return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos);
985 }
986 
987 /**
988  * Return true if the given iterator appears to point at a
989  * property pattern.  Regardless of the result, return with the
990  * iterator unchanged.
991  * @param chars iterator over the pattern characters.  Upon return
992  * it will be unchanged.
993  * @param iterOpts RuleCharacterIterator options
994  */
resemblesPropertyPattern(RuleCharacterIterator & chars,int32_t iterOpts)995 UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars,
996                                            int32_t iterOpts) {
997     // NOTE: literal will always be false, because we don't parse escapes.
998     UBool result = false, literal;
999     UErrorCode ec = U_ZERO_ERROR;
1000     iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES;
1001     RuleCharacterIterator::Pos pos;
1002     chars.getPos(pos);
1003     UChar32 c = chars.next(iterOpts, literal, ec);
1004     if (c == u'[' || c == u'\\') {
1005         UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE,
1006                                literal, ec);
1007         result = (c == u'[') ? (d == u':') :
1008                                (d == u'N' || d == u'p' || d == u'P');
1009     }
1010     chars.setPos(pos);
1011     return result && U_SUCCESS(ec);
1012 }
1013 
1014 /**
1015  * Parse the given property pattern at the given parse position.
1016  */
applyPropertyPattern(const UnicodeString & pattern,ParsePosition & ppos,UErrorCode & ec)1017 UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern,
1018                                              ParsePosition& ppos,
1019                                              UErrorCode &ec) {
1020     int32_t pos = ppos.getIndex();
1021 
1022     UBool posix = false; // true for [:pat:], false for \p{pat} \P{pat} \N{pat}
1023     UBool isName = false; // true for \N{pat}, o/w false
1024     UBool invert = false;
1025 
1026     if (U_FAILURE(ec)) return *this;
1027 
1028     // Minimum length is 5 characters, e.g. \p{L}
1029     if ((pos+5) > pattern.length()) {
1030         FAIL(ec);
1031     }
1032 
1033     // On entry, ppos should point to one of the following locations:
1034     // Look for an opening [:, [:^, \p, or \P
1035     if (isPOSIXOpen(pattern, pos)) {
1036         posix = true;
1037         pos += 2;
1038         pos = ICU_Utility::skipWhitespace(pattern, pos);
1039         if (pos < pattern.length() && pattern.charAt(pos) == u'^') {
1040             ++pos;
1041             invert = true;
1042         }
1043     } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) {
1044         char16_t c = pattern.charAt(pos+1);
1045         invert = (c == u'P');
1046         isName = (c == u'N');
1047         pos += 2;
1048         pos = ICU_Utility::skipWhitespace(pattern, pos);
1049         if (pos == pattern.length() || pattern.charAt(pos++) != u'{') {
1050             // Syntax error; "\p" or "\P" not followed by "{"
1051             FAIL(ec);
1052         }
1053     } else {
1054         // Open delimiter not seen
1055         FAIL(ec);
1056     }
1057 
1058     // Look for the matching close delimiter, either :] or }
1059     int32_t close;
1060     if (posix) {
1061       close = pattern.indexOf(u":]", 2, pos);
1062     } else {
1063       close = pattern.indexOf(u'}', pos);
1064     }
1065     if (close < 0) {
1066         // Syntax error; close delimiter missing
1067         FAIL(ec);
1068     }
1069 
1070     // Look for an '=' sign.  If this is present, we will parse a
1071     // medium \p{gc=Cf} or long \p{GeneralCategory=Format}
1072     // pattern.
1073     int32_t equals = pattern.indexOf(u'=', pos);
1074     UnicodeString propName, valueName;
1075     if (equals >= 0 && equals < close && !isName) {
1076         // Equals seen; parse medium/long pattern
1077         pattern.extractBetween(pos, equals, propName);
1078         pattern.extractBetween(equals+1, close, valueName);
1079     }
1080 
1081     else {
1082         // Handle case where no '=' is seen, and \N{}
1083         pattern.extractBetween(pos, close, propName);
1084 
1085         // Handle \N{name}
1086         if (isName) {
1087             // This is a little inefficient since it means we have to
1088             // parse NAME_PROP back to UCHAR_NAME even though we already
1089             // know it's UCHAR_NAME.  If we refactor the API to
1090             // support args of (UProperty, char*) then we can remove
1091             // NAME_PROP and make this a little more efficient.
1092             valueName = propName;
1093             propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV);
1094         }
1095     }
1096 
1097     applyPropertyAlias(propName, valueName, ec);
1098 
1099     if (U_SUCCESS(ec)) {
1100         if (invert) {
1101             complement().removeAllStrings();  // code point complement
1102         }
1103 
1104         // Move to the limit position after the close delimiter if the
1105         // parse succeeded.
1106         ppos.setIndex(close + (posix ? 2 : 1));
1107     }
1108 
1109     return *this;
1110 }
1111 
1112 /**
1113  * Parse a property pattern.
1114  * @param chars iterator over the pattern characters.  Upon return
1115  * it will be advanced to the first character after the parsed
1116  * pattern, or the end of the iteration if all characters are
1117  * parsed.
1118  * @param rebuiltPat the pattern that was parsed, rebuilt or
1119  * copied from the input pattern, as appropriate.
1120  */
applyPropertyPattern(RuleCharacterIterator & chars,UnicodeString & rebuiltPat,UErrorCode & ec)1121 void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars,
1122                                       UnicodeString& rebuiltPat,
1123                                       UErrorCode& ec) {
1124     if (U_FAILURE(ec)) return;
1125     UnicodeString pattern;
1126     chars.lookahead(pattern);
1127     ParsePosition pos(0);
1128     applyPropertyPattern(pattern, pos, ec);
1129     if (U_FAILURE(ec)) return;
1130     if (pos.getIndex() == 0) {
1131         // syntaxError(chars, "Invalid property pattern");
1132         ec = U_MALFORMED_SET;
1133         return;
1134     }
1135     chars.jumpahead(pos.getIndex());
1136     rebuiltPat.append(pattern, 0, pos.getIndex());
1137 }
1138 
1139 U_NAMESPACE_END
1140