1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ***************************************************************************
5 * Copyright (C) 1999-2016, International Business Machines Corporation
6 * and others. All Rights Reserved.
7 ***************************************************************************
8 * Date Name Description
9 * 10/20/99 alan Creation.
10 ***************************************************************************
11 */
12
13 #ifndef UNICODESET_H
14 #define UNICODESET_H
15
16 #include "unicode/utypes.h"
17
18 #if U_SHOW_CPLUSPLUS_API
19
20 #include "unicode/ucpmap.h"
21 #include "unicode/unifilt.h"
22 #include "unicode/unistr.h"
23 #include "unicode/uset.h"
24
25 /**
26 * \file
27 * \brief C++ API: Unicode Set
28 */
29
30 U_NAMESPACE_BEGIN
31
32 // Forward Declarations.
33 class BMPSet;
34 class ParsePosition;
35 class RBBIRuleScanner;
36 class SymbolTable;
37 class UnicodeSetStringSpan;
38 class UVector;
39 class RuleCharacterIterator;
40
41 /**
42 * A mutable set of Unicode characters and multicharacter strings. Objects of this class
43 * represent <em>character classes</em> used in regular expressions.
44 * A character class specifies a subset of Unicode code points. Legal
45 * code points are U+0000 to U+10FFFF, inclusive.
46 *
47 * <p>The UnicodeSet class is not designed to be subclassed.
48 *
49 * <p><code>UnicodeSet</code> supports two APIs. The first is the
50 * <em>operand</em> API that allows the caller to modify the value of
51 * a <code>UnicodeSet</code> object. It conforms to Java 2's
52 * <code>java.util.Set</code> interface, although
53 * <code>UnicodeSet</code> does not actually implement that
54 * interface. All methods of <code>Set</code> are supported, with the
55 * modification that they take a character range or single character
56 * instead of an <code>Object</code>, and they take a
57 * <code>UnicodeSet</code> instead of a <code>Collection</code>. The
58 * operand API may be thought of in terms of boolean logic: a boolean
59 * OR is implemented by <code>add</code>, a boolean AND is implemented
60 * by <code>retain</code>, a boolean XOR is implemented by
61 * <code>complement</code> taking an argument, and a boolean NOT is
62 * implemented by <code>complement</code> with no argument. In terms
63 * of traditional set theory function names, <code>add</code> is a
64 * union, <code>retain</code> is an intersection, <code>remove</code>
65 * is an asymmetric difference, and <code>complement</code> with no
66 * argument is a set complement with respect to the superset range
67 * <code>MIN_VALUE-MAX_VALUE</code>.
68 *
69 * <p>The second API is the
70 * <code>applyPattern()</code>/<code>toPattern()</code> API from the
71 * <code>java.text.Format</code>-derived classes. Unlike the
72 * methods that add characters, add categories, and control the logic
73 * of the set, the method <code>applyPattern()</code> sets all
74 * attributes of a <code>UnicodeSet</code> at once, based on a
75 * string pattern.
76 *
77 * <p><b>Pattern syntax</b></p>
78 *
79 * Patterns are accepted by the constructors and the
80 * <code>applyPattern()</code> methods and returned by the
81 * <code>toPattern()</code> method. These patterns follow a syntax
82 * similar to that employed by version 8 regular expression character
83 * classes. Here are some simple examples:
84 *
85 * \htmlonly<blockquote>\endhtmlonly
86 * <table>
87 * <tr align="top">
88 * <td nowrap valign="top" align="left"><code>[]</code></td>
89 * <td valign="top">No characters</td>
90 * </tr><tr align="top">
91 * <td nowrap valign="top" align="left"><code>[a]</code></td>
92 * <td valign="top">The character 'a'</td>
93 * </tr><tr align="top">
94 * <td nowrap valign="top" align="left"><code>[ae]</code></td>
95 * <td valign="top">The characters 'a' and 'e'</td>
96 * </tr>
97 * <tr>
98 * <td nowrap valign="top" align="left"><code>[a-e]</code></td>
99 * <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code
100 * point order</td>
101 * </tr>
102 * <tr>
103 * <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td>
104 * <td valign="top">The character U+4E01</td>
105 * </tr>
106 * <tr>
107 * <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td>
108 * <td valign="top">The character 'a' and the multicharacter strings "ab" and
109 * "ac"</td>
110 * </tr>
111 * <tr>
112 * <td nowrap valign="top" align="left"><code>[\\p{Lu}]</code></td>
113 * <td valign="top">All characters in the general category Uppercase Letter</td>
114 * </tr>
115 * </table>
116 * \htmlonly</blockquote>\endhtmlonly
117 *
118 * Any character may be preceded by a backslash in order to remove any special
119 * meaning. White space characters, as defined by UCharacter.isWhitespace(), are
120 * ignored, unless they are escaped.
121 *
122 * <p>Property patterns specify a set of characters having a certain
123 * property as defined by the Unicode standard. Both the POSIX-like
124 * "[:Lu:]" and the Perl-like syntax "\\p{Lu}" are recognized. For a
125 * complete list of supported property patterns, see the User's Guide
126 * for UnicodeSet at
127 * <a href="https://unicode-org.github.io/icu/userguide/strings/unicodeset">
128 * https://unicode-org.github.io/icu/userguide/strings/unicodeset</a>.
129 * Actual determination of property data is defined by the underlying
130 * Unicode database as implemented by UCharacter.
131 *
132 * <p>Patterns specify individual characters, ranges of characters, and
133 * Unicode property sets. When elements are concatenated, they
134 * specify their union. To complement a set, place a '^' immediately
135 * after the opening '['. Property patterns are inverted by modifying
136 * their delimiters; "[:^foo:]" and "\\P{foo}". In any other location,
137 * '^' has no special meaning.
138 *
139 * <p>Since ICU 70, "[^...]", "[:^foo:]", "\\P{foo}", and "[:binaryProperty=No:]"
140 * perform a “code point complement” (all code points minus the original set),
141 * removing all multicharacter strings,
142 * equivalent to <code>.complement().removeAllStrings()</code>.
143 * The complement() API function continues to perform a
144 * symmetric difference with all code points and thus retains all multicharacter strings.
145 *
146 * <p>Ranges are indicated by placing a '-' between two
147 * characters, as in "a-z". This specifies the range of all
148 * characters from the left to the right, in Unicode order. If the
149 * left character is greater than or equal to the
150 * right character it is a syntax error. If a '-' occurs as the first
151 * character after the opening '[' or '[^', or if it occurs as the
152 * last character before the closing ']', then it is taken as a
153 * literal. Thus "[a\-b]", "[-ab]", and "[ab-]" all indicate the same
154 * set of three characters, 'a', 'b', and '-'.
155 *
156 * <p>Sets may be intersected using the '&' operator or the asymmetric
157 * set difference may be taken using the '-' operator, for example,
158 * "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
159 * with values less than 4096. Operators ('&' and '-') have equal
160 * precedence and bind left-to-right. Thus
161 * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
162 * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for
163 * difference; intersection is commutative.
164 *
165 * <table>
166 * <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a'
167 * <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a'
168 * through 'z' and all letters in between, in Unicode order
169 * <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
170 * all characters but 'a' through 'z',
171 * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
172 * <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
173 * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
174 * <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
175 * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
176 * <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
177 * <td>The asymmetric difference of sets specified by <em>pat1</em> and
178 * <em>pat2</em>
179 * <tr valign=top><td nowrap><code>[:Lu:] or \\p{Lu}</code>
180 * <td>The set of characters having the specified
181 * Unicode property; in
182 * this case, Unicode uppercase letters
183 * <tr valign=top><td nowrap><code>[:^Lu:] or \\P{Lu}</code>
184 * <td>The set of characters <em>not</em> having the given
185 * Unicode property
186 * </table>
187 *
188 * <p><b>Formal syntax</b></p>
189 *
190 * \htmlonly<blockquote>\endhtmlonly
191 * <table>
192 * <tr align="top">
193 * <td nowrap valign="top" align="right"><code>pattern := </code></td>
194 * <td valign="top"><code>('[' '^'? item* ']') |
195 * property</code></td>
196 * </tr>
197 * <tr align="top">
198 * <td nowrap valign="top" align="right"><code>item := </code></td>
199 * <td valign="top"><code>char | (char '-' char) | pattern-expr<br>
200 * </code></td>
201 * </tr>
202 * <tr align="top">
203 * <td nowrap valign="top" align="right"><code>pattern-expr := </code></td>
204 * <td valign="top"><code>pattern | pattern-expr pattern |
205 * pattern-expr op pattern<br>
206 * </code></td>
207 * </tr>
208 * <tr align="top">
209 * <td nowrap valign="top" align="right"><code>op := </code></td>
210 * <td valign="top"><code>'&' | '-'<br>
211 * </code></td>
212 * </tr>
213 * <tr align="top">
214 * <td nowrap valign="top" align="right"><code>special := </code></td>
215 * <td valign="top"><code>'[' | ']' | '-'<br>
216 * </code></td>
217 * </tr>
218 * <tr align="top">
219 * <td nowrap valign="top" align="right"><code>char := </code></td>
220 * <td valign="top"><em>any character that is not</em><code> special<br>
221 * | ('\' </code><em>any character</em><code>)<br>
222 * | ('\\u' hex hex hex hex)<br>
223 * </code></td>
224 * </tr>
225 * <tr align="top">
226 * <td nowrap valign="top" align="right"><code>hex := </code></td>
227 * <td valign="top"><code>'0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' |<br>
228 * 'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f'</code></td>
229 * </tr>
230 * <tr>
231 * <td nowrap valign="top" align="right"><code>property := </code></td>
232 * <td valign="top"><em>a Unicode property set pattern</em></td>
233 * </tr>
234 * </table>
235 * <br>
236 * <table border="1">
237 * <tr>
238 * <td>Legend: <table>
239 * <tr>
240 * <td nowrap valign="top"><code>a := b</code></td>
241 * <td width="20" valign="top"> </td>
242 * <td valign="top"><code>a</code> may be replaced by <code>b</code> </td>
243 * </tr>
244 * <tr>
245 * <td nowrap valign="top"><code>a?</code></td>
246 * <td valign="top"></td>
247 * <td valign="top">zero or one instance of <code>a</code><br>
248 * </td>
249 * </tr>
250 * <tr>
251 * <td nowrap valign="top"><code>a*</code></td>
252 * <td valign="top"></td>
253 * <td valign="top">zero or more instances of <code>a</code><br>
254 * </td>
255 * </tr>
256 * <tr>
257 * <td nowrap valign="top"><code>a | b</code></td>
258 * <td valign="top"></td>
259 * <td valign="top">either <code>a</code> or <code>b</code><br>
260 * </td>
261 * </tr>
262 * <tr>
263 * <td nowrap valign="top"><code>'a'</code></td>
264 * <td valign="top"></td>
265 * <td valign="top">the literal string between the quotes </td>
266 * </tr>
267 * </table>
268 * </td>
269 * </tr>
270 * </table>
271 * \htmlonly</blockquote>\endhtmlonly
272 *
273 * <p>Note:
274 * - Most UnicodeSet methods do not take a UErrorCode parameter because
275 * there are usually very few opportunities for failure other than a shortage
276 * of memory, error codes in low-level C++ string methods would be inconvenient,
277 * and the error code as the last parameter (ICU convention) would prevent
278 * the use of default parameter values.
279 * Instead, such methods set the UnicodeSet into a "bogus" state
280 * (see isBogus()) if an error occurs.
281 *
282 * @author Alan Liu
283 * @stable ICU 2.0
284 */
285 class U_COMMON_API UnicodeSet final : public UnicodeFilter {
286 private:
287 /**
288 * Enough for sets with few ranges.
289 * For example, White_Space has 10 ranges, list length 21.
290 */
291 static constexpr int32_t INITIAL_CAPACITY = 25;
292 // fFlags constant
293 static constexpr uint8_t kIsBogus = 1; // This set is bogus (i.e. not valid)
294
295 UChar32* list = stackList; // MUST be terminated with HIGH
296 int32_t capacity = INITIAL_CAPACITY; // capacity of list
297 int32_t len = 1; // length of list used; 1 <= len <= capacity
298 uint8_t fFlags = 0; // Bit flag (see constants above)
299
300 BMPSet *bmpSet = nullptr; // The set is frozen iff either bmpSet or stringSpan is not nullptr.
301 UChar32* buffer = nullptr; // internal buffer, may be nullptr
302 int32_t bufferCapacity = 0; // capacity of buffer
303
304 /**
305 * The pattern representation of this set. This may not be the
306 * most economical pattern. It is the pattern supplied to
307 * applyPattern(), with variables substituted and whitespace
308 * removed. For sets constructed without applyPattern(), or
309 * modified using the non-pattern API, this string will be empty,
310 * indicating that toPattern() must generate a pattern
311 * representation from the inversion list.
312 */
313 char16_t *pat = nullptr;
314 int32_t patLen = 0;
315
316 UVector* strings_ = nullptr; // maintained in sorted order
317 UnicodeSetStringSpan *stringSpan = nullptr;
318
319 /**
320 * Initial list array.
321 * Avoids some heap allocations, and list is never nullptr.
322 * Increases the object size a bit.
323 */
324 UChar32 stackList[INITIAL_CAPACITY];
325
326 public:
327 /**
328 * Determine if this object contains a valid set.
329 * A bogus set has no value. It is different from an empty set.
330 * It can be used to indicate that no set value is available.
331 *
332 * @return true if the set is bogus/invalid, false otherwise
333 * @see setToBogus()
334 * @stable ICU 4.0
335 */
336 inline UBool isBogus() const;
337
338 /**
339 * Make this UnicodeSet object invalid.
340 * The set will test true with isBogus().
341 *
342 * A bogus set has no value. It is different from an empty set.
343 * It can be used to indicate that no set value is available.
344 *
345 * This utility function is used throughout the UnicodeSet
346 * implementation to indicate that a UnicodeSet operation failed,
347 * and may be used in other functions,
348 * especially but not exclusively when such functions do not
349 * take a UErrorCode for simplicity.
350 *
351 * @see isBogus()
352 * @stable ICU 4.0
353 */
354 void setToBogus();
355
356 public:
357
358 enum {
359 /**
360 * Minimum value that can be stored in a UnicodeSet.
361 * @stable ICU 2.4
362 */
363 MIN_VALUE = 0,
364
365 /**
366 * Maximum value that can be stored in a UnicodeSet.
367 * @stable ICU 2.4
368 */
369 MAX_VALUE = 0x10ffff
370 };
371
372 //----------------------------------------------------------------
373 // Constructors &c
374 //----------------------------------------------------------------
375
376 public:
377
378 /**
379 * Constructs an empty set.
380 * @stable ICU 2.0
381 */
382 UnicodeSet();
383
384 /**
385 * Constructs a set containing the given range. If <code>end <
386 * start</code> then an empty set is created.
387 *
388 * @param start first character, inclusive, of range
389 * @param end last character, inclusive, of range
390 * @stable ICU 2.4
391 */
392 UnicodeSet(UChar32 start, UChar32 end);
393
394 #ifndef U_HIDE_INTERNAL_API
395 /**
396 * @internal
397 */
398 enum ESerialization {
399 kSerialized /* result of serialize() */
400 };
401
402 /**
403 * Constructs a set from the output of serialize().
404 *
405 * @param buffer the 16 bit array
406 * @param bufferLen the original length returned from serialize()
407 * @param serialization the value 'kSerialized'
408 * @param status error code
409 *
410 * @internal
411 */
412 UnicodeSet(const uint16_t buffer[], int32_t bufferLen,
413 ESerialization serialization, UErrorCode &status);
414 #endif /* U_HIDE_INTERNAL_API */
415
416 /**
417 * Constructs a set from the given pattern. See the class
418 * description for the syntax of the pattern language.
419 * @param pattern a string specifying what characters are in the set
420 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
421 * contains a syntax error.
422 * @stable ICU 2.0
423 */
424 UnicodeSet(const UnicodeString& pattern,
425 UErrorCode& status);
426
427 #ifndef U_HIDE_INTERNAL_API
428 /**
429 * Constructs a set from the given pattern. See the class
430 * description for the syntax of the pattern language.
431 * @param pattern a string specifying what characters are in the set
432 * @param options bitmask for options to apply to the pattern.
433 * Valid options are USET_IGNORE_SPACE and
434 * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
435 * These case options are mutually exclusive.
436 * @param symbols a symbol table mapping variable names to values
437 * and stand-in characters to UnicodeSets; may be nullptr
438 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
439 * contains a syntax error.
440 * @internal
441 */
442 UnicodeSet(const UnicodeString& pattern,
443 uint32_t options,
444 const SymbolTable* symbols,
445 UErrorCode& status);
446 #endif /* U_HIDE_INTERNAL_API */
447
448 /**
449 * Constructs a set from the given pattern. See the class description
450 * for the syntax of the pattern language.
451 * @param pattern a string specifying what characters are in the set
452 * @param pos on input, the position in pattern at which to start parsing.
453 * On output, the position after the last character parsed.
454 * @param options bitmask for options to apply to the pattern.
455 * Valid options are USET_IGNORE_SPACE and
456 * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
457 * These case options are mutually exclusive.
458 * @param symbols a symbol table mapping variable names to values
459 * and stand-in characters to UnicodeSets; may be nullptr
460 * @param status input-output error code
461 * @stable ICU 2.8
462 */
463 UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
464 uint32_t options,
465 const SymbolTable* symbols,
466 UErrorCode& status);
467
468 /**
469 * Constructs a set that is identical to the given UnicodeSet.
470 * @stable ICU 2.0
471 */
472 UnicodeSet(const UnicodeSet& o);
473
474 /**
475 * Destructs the set.
476 * @stable ICU 2.0
477 */
478 virtual ~UnicodeSet();
479
480 /**
481 * Assigns this object to be a copy of another.
482 * A frozen set will not be modified.
483 * @stable ICU 2.0
484 */
485 UnicodeSet& operator=(const UnicodeSet& o);
486
487 /**
488 * Compares the specified object with this set for equality. Returns
489 * <tt>true</tt> if the two sets
490 * have the same size, and every member of the specified set is
491 * contained in this set (or equivalently, every member of this set is
492 * contained in the specified set).
493 *
494 * @param o set to be compared for equality with this set.
495 * @return <tt>true</tt> if the specified set is equal to this set.
496 * @stable ICU 2.0
497 */
498 virtual bool operator==(const UnicodeSet& o) const;
499
500 /**
501 * Compares the specified object with this set for inequality. Returns
502 * <tt>true</tt> if the specified set is not equal to this set.
503 * @stable ICU 2.0
504 */
505 inline bool operator!=(const UnicodeSet& o) const;
506
507 /**
508 * Returns a copy of this object. All UnicodeFunctor objects have
509 * to support cloning in order to allow classes using
510 * UnicodeFunctors, such as Transliterator, to implement cloning.
511 * If this set is frozen, then the clone will be frozen as well.
512 * Use cloneAsThawed() for a mutable clone of a frozen set.
513 * @see cloneAsThawed
514 * @stable ICU 2.0
515 */
516 virtual UnicodeSet* clone() const override;
517
518 /**
519 * Returns the hash code value for this set.
520 *
521 * @return the hash code value for this set.
522 * @see Object#hashCode()
523 * @stable ICU 2.0
524 */
525 virtual int32_t hashCode() const;
526
527 /**
528 * Get a UnicodeSet pointer from a USet
529 *
530 * @param uset a USet (the ICU plain C type for UnicodeSet)
531 * @return the corresponding UnicodeSet pointer.
532 *
533 * @stable ICU 4.2
534 */
535 inline static UnicodeSet *fromUSet(USet *uset);
536
537 /**
538 * Get a UnicodeSet pointer from a const USet
539 *
540 * @param uset a const USet (the ICU plain C type for UnicodeSet)
541 * @return the corresponding UnicodeSet pointer.
542 *
543 * @stable ICU 4.2
544 */
545 inline static const UnicodeSet *fromUSet(const USet *uset);
546
547 /**
548 * Produce a USet * pointer for this UnicodeSet.
549 * USet is the plain C type for UnicodeSet
550 *
551 * @return a USet pointer for this UnicodeSet
552 * @stable ICU 4.2
553 */
554 inline USet *toUSet();
555
556
557 /**
558 * Produce a const USet * pointer for this UnicodeSet.
559 * USet is the plain C type for UnicodeSet
560 *
561 * @return a const USet pointer for this UnicodeSet
562 * @stable ICU 4.2
563 */
564 inline const USet * toUSet() const;
565
566
567 //----------------------------------------------------------------
568 // Freezable API
569 //----------------------------------------------------------------
570
571 /**
572 * Determines whether the set has been frozen (made immutable) or not.
573 * See the ICU4J Freezable interface for details.
574 * @return true/false for whether the set has been frozen
575 * @see freeze
576 * @see cloneAsThawed
577 * @stable ICU 3.8
578 */
579 inline UBool isFrozen() const;
580
581 /**
582 * Freeze the set (make it immutable).
583 * Once frozen, it cannot be unfrozen and is therefore thread-safe
584 * until it is deleted.
585 * See the ICU4J Freezable interface for details.
586 * Freezing the set may also make some operations faster, for example
587 * contains() and span().
588 * A frozen set will not be modified. (It remains frozen.)
589 * @return this set.
590 * @see isFrozen
591 * @see cloneAsThawed
592 * @stable ICU 3.8
593 */
594 UnicodeSet *freeze();
595
596 /**
597 * Clone the set and make the clone mutable.
598 * See the ICU4J Freezable interface for details.
599 * @return the mutable clone
600 * @see freeze
601 * @see isFrozen
602 * @stable ICU 3.8
603 */
604 UnicodeSet *cloneAsThawed() const;
605
606 //----------------------------------------------------------------
607 // Public API
608 //----------------------------------------------------------------
609
610 /**
611 * Make this object represent the range `start - end`.
612 * If `start > end` then this object is set to an empty range.
613 * A frozen set will not be modified.
614 *
615 * @param start first character in the set, inclusive
616 * @param end last character in the set, inclusive
617 * @stable ICU 2.4
618 */
619 UnicodeSet& set(UChar32 start, UChar32 end);
620
621 /**
622 * Return true if the given position, in the given pattern, appears
623 * to be the start of a UnicodeSet pattern.
624 * @stable ICU 2.4
625 */
626 static UBool resemblesPattern(const UnicodeString& pattern,
627 int32_t pos);
628
629 /**
630 * Modifies this set to represent the set specified by the given
631 * pattern, ignoring Unicode Pattern_White_Space characters.
632 * See the class description for the syntax of the pattern language.
633 * A frozen set will not be modified.
634 * @param pattern a string specifying what characters are in the set
635 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
636 * contains a syntax error.
637 * <em> Empties the set passed before applying the pattern.</em>
638 * @return a reference to this
639 * @stable ICU 2.0
640 */
641 UnicodeSet& applyPattern(const UnicodeString& pattern,
642 UErrorCode& status);
643
644 #ifndef U_HIDE_INTERNAL_API
645 /**
646 * Modifies this set to represent the set specified by the given
647 * pattern, optionally ignoring Unicode Pattern_White_Space characters.
648 * See the class description for the syntax of the pattern language.
649 * A frozen set will not be modified.
650 * @param pattern a string specifying what characters are in the set
651 * @param options bitmask for options to apply to the pattern.
652 * Valid options are USET_IGNORE_SPACE and
653 * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
654 * These case options are mutually exclusive.
655 * @param symbols a symbol table mapping variable names to
656 * values and stand-ins to UnicodeSets; may be nullptr
657 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
658 * contains a syntax error.
659 *<em> Empties the set passed before applying the pattern.</em>
660 * @return a reference to this
661 * @internal
662 */
663 UnicodeSet& applyPattern(const UnicodeString& pattern,
664 uint32_t options,
665 const SymbolTable* symbols,
666 UErrorCode& status);
667 #endif /* U_HIDE_INTERNAL_API */
668
669 /**
670 * Parses the given pattern, starting at the given position. The
671 * character at pattern.charAt(pos.getIndex()) must be '[', or the
672 * parse fails. Parsing continues until the corresponding closing
673 * ']'. If a syntax error is encountered between the opening and
674 * closing bracket, the parse fails. Upon return from a successful
675 * parse, the ParsePosition is updated to point to the character
676 * following the closing ']', and a reference to this set is
677 * returned. This method calls
678 * itself recursively to parse embedded subpatterns.
679 *<em> Empties the set passed before applying the pattern.</em>
680 * A frozen set will not be modified.
681 *
682 * @param pattern the string containing the pattern to be parsed.
683 * The portion of the string from pos.getIndex(), which must be a
684 * '[', to the corresponding closing ']', is parsed.
685 * @param pos upon entry, the position at which to begin parsing.
686 * The character at pattern.charAt(pos.getIndex()) must be a '['.
687 * Upon return from a successful parse, pos.getIndex() is either
688 * the character after the closing ']' of the parsed pattern, or
689 * pattern.length() if the closing ']' is the last character of
690 * the pattern string.
691 * @param options bitmask for options to apply to the pattern.
692 * Valid options are USET_IGNORE_SPACE and
693 * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
694 * These case options are mutually exclusive.
695 * @param symbols a symbol table mapping variable names to
696 * values and stand-ins to UnicodeSets; may be nullptr
697 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
698 * contains a syntax error.
699 * @return a reference to this
700 * @stable ICU 2.8
701 */
702 UnicodeSet& applyPattern(const UnicodeString& pattern,
703 ParsePosition& pos,
704 uint32_t options,
705 const SymbolTable* symbols,
706 UErrorCode& status);
707
708 /**
709 * Returns a string representation of this set. If the result of
710 * calling this function is passed to a UnicodeSet constructor, it
711 * will produce another set that is equal to this one.
712 * A frozen set will not be modified.
713 * @param result the string to receive the rules. Previous
714 * contents will be deleted.
715 * @param escapeUnprintable if true then convert unprintable
716 * characters to their hex escape representations, \\uxxxx or
717 * \\Uxxxxxxxx. Unprintable characters are those other than
718 * U+000A, U+0020..U+007E.
719 * @stable ICU 2.0
720 */
721 virtual UnicodeString& toPattern(UnicodeString& result,
722 UBool escapeUnprintable = false) const override;
723
724 /**
725 * Modifies this set to contain those code points which have the given value
726 * for the given binary or enumerated property, as returned by
727 * u_getIntPropertyValue. Prior contents of this set are lost.
728 * A frozen set will not be modified.
729 *
730 * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1
731 * or UCHAR_INT_START..UCHAR_INT_LIMIT-1
732 * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1.
733 *
734 * @param value a value in the range u_getIntPropertyMinValue(prop)..
735 * u_getIntPropertyMaxValue(prop), with one exception. If prop is
736 * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but
737 * rather a mask value produced by U_GET_GC_MASK(). This allows grouped
738 * categories such as [:L:] to be represented.
739 *
740 * @param ec error code input/output parameter
741 *
742 * @return a reference to this set
743 *
744 * @stable ICU 2.4
745 */
746 UnicodeSet& applyIntPropertyValue(UProperty prop,
747 int32_t value,
748 UErrorCode& ec);
749
750 /**
751 * Modifies this set to contain those code points which have the
752 * given value for the given property. Prior contents of this
753 * set are lost.
754 * A frozen set will not be modified.
755 *
756 * @param prop a property alias, either short or long. The name is matched
757 * loosely. See PropertyAliases.txt for names and a description of loose
758 * matching. If the value string is empty, then this string is interpreted
759 * as either a General_Category value alias, a Script value alias, a binary
760 * property alias, or a special ID. Special IDs are matched loosely and
761 * correspond to the following sets:
762 *
763 * "ANY" = [\\u0000-\\U0010FFFF],
764 * "ASCII" = [\\u0000-\\u007F],
765 * "Assigned" = [:^Cn:].
766 *
767 * @param value a value alias, either short or long. The name is matched
768 * loosely. See PropertyValueAliases.txt for names and a description of
769 * loose matching. In addition to aliases listed, numeric values and
770 * canonical combining classes may be expressed numerically, e.g., ("nv",
771 * "0.5") or ("ccc", "220"). The value string may also be empty.
772 *
773 * @param ec error code input/output parameter
774 *
775 * @return a reference to this set
776 *
777 * @stable ICU 2.4
778 */
779 UnicodeSet& applyPropertyAlias(const UnicodeString& prop,
780 const UnicodeString& value,
781 UErrorCode& ec);
782
783 /**
784 * Returns the number of elements in this set (its cardinality).
785 * Note that the elements of a set may include both individual
786 * codepoints and strings.
787 *
788 * This is slower than getRangeCount() because
789 * it counts the code points of all ranges.
790 *
791 * @return the number of elements in this set (its cardinality).
792 * @stable ICU 2.0
793 * @see getRangeCount
794 */
795 virtual int32_t size() const;
796
797 /**
798 * Returns <tt>true</tt> if this set contains no elements.
799 *
800 * @return <tt>true</tt> if this set contains no elements.
801 * @stable ICU 2.0
802 */
803 virtual UBool isEmpty() const;
804
805 /**
806 * @return true if this set contains multi-character strings or the empty string.
807 * @stable ICU 70
808 */
809 UBool hasStrings() const;
810
811 /**
812 * Returns true if this set contains the given character.
813 * This function works faster with a frozen set.
814 * @param c character to be checked for containment
815 * @return true if the test condition is met
816 * @stable ICU 2.0
817 */
818 virtual UBool contains(UChar32 c) const override;
819
820 /**
821 * Returns true if this set contains every character
822 * of the given range.
823 * @param start first character, inclusive, of the range
824 * @param end last character, inclusive, of the range
825 * @return true if the test condition is met
826 * @stable ICU 2.0
827 */
828 virtual UBool contains(UChar32 start, UChar32 end) const;
829
830 /**
831 * Returns <tt>true</tt> if this set contains the given
832 * multicharacter string.
833 * @param s string to be checked for containment
834 * @return <tt>true</tt> if this set contains the specified string
835 * @stable ICU 2.4
836 */
837 UBool contains(const UnicodeString& s) const;
838
839 /**
840 * Returns true if this set contains all the characters and strings
841 * of the given set.
842 * @param c set to be checked for containment
843 * @return true if the test condition is met
844 * @stable ICU 2.4
845 */
846 virtual UBool containsAll(const UnicodeSet& c) const;
847
848 /**
849 * Returns true if this set contains all the characters
850 * of the given string.
851 * @param s string containing characters to be checked for containment
852 * @return true if the test condition is met
853 * @stable ICU 2.4
854 */
855 UBool containsAll(const UnicodeString& s) const;
856
857 /**
858 * Returns true if this set contains none of the characters
859 * of the given range.
860 * @param start first character, inclusive, of the range
861 * @param end last character, inclusive, of the range
862 * @return true if the test condition is met
863 * @stable ICU 2.4
864 */
865 UBool containsNone(UChar32 start, UChar32 end) const;
866
867 /**
868 * Returns true if this set contains none of the characters and strings
869 * of the given set.
870 * @param c set to be checked for containment
871 * @return true if the test condition is met
872 * @stable ICU 2.4
873 */
874 UBool containsNone(const UnicodeSet& c) const;
875
876 /**
877 * Returns true if this set contains none of the characters
878 * of the given string.
879 * @param s string containing characters to be checked for containment
880 * @return true if the test condition is met
881 * @stable ICU 2.4
882 */
883 UBool containsNone(const UnicodeString& s) const;
884
885 /**
886 * Returns true if this set contains one or more of the characters
887 * in the given range.
888 * @param start first character, inclusive, of the range
889 * @param end last character, inclusive, of the range
890 * @return true if the condition is met
891 * @stable ICU 2.4
892 */
893 inline UBool containsSome(UChar32 start, UChar32 end) const;
894
895 /**
896 * Returns true if this set contains one or more of the characters
897 * and strings of the given set.
898 * @param s The set to be checked for containment
899 * @return true if the condition is met
900 * @stable ICU 2.4
901 */
902 inline UBool containsSome(const UnicodeSet& s) const;
903
904 /**
905 * Returns true if this set contains one or more of the characters
906 * of the given string.
907 * @param s string containing characters to be checked for containment
908 * @return true if the condition is met
909 * @stable ICU 2.4
910 */
911 inline UBool containsSome(const UnicodeString& s) const;
912
913 /**
914 * Returns the length of the initial substring of the input string which
915 * consists only of characters and strings that are contained in this set
916 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
917 * or only of characters and strings that are not contained
918 * in this set (USET_SPAN_NOT_CONTAINED).
919 * See USetSpanCondition for details.
920 * Similar to the strspn() C library function.
921 * Unpaired surrogates are treated according to contains() of their surrogate code points.
922 * This function works faster with a frozen set and with a non-negative string length argument.
923 * @param s start of the string
924 * @param length of the string; can be -1 for NUL-terminated
925 * @param spanCondition specifies the containment condition
926 * @return the length of the initial substring according to the spanCondition;
927 * 0 if the start of the string does not fit the spanCondition
928 * @stable ICU 3.8
929 * @see USetSpanCondition
930 */
931 int32_t span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const;
932
933 /**
934 * Returns the end of the substring of the input string according to the USetSpanCondition.
935 * Same as <code>start+span(s.getBuffer()+start, s.length()-start, spanCondition)</code>
936 * after pinning start to 0<=start<=s.length().
937 * @param s the string
938 * @param start the start index in the string for the span operation
939 * @param spanCondition specifies the containment condition
940 * @return the exclusive end of the substring according to the spanCondition;
941 * the substring s.tempSubStringBetween(start, end) fulfills the spanCondition
942 * @stable ICU 4.4
943 * @see USetSpanCondition
944 */
945 inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const;
946
947 /**
948 * Returns the start of the trailing substring of the input string which
949 * consists only of characters and strings that are contained in this set
950 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
951 * or only of characters and strings that are not contained
952 * in this set (USET_SPAN_NOT_CONTAINED).
953 * See USetSpanCondition for details.
954 * Unpaired surrogates are treated according to contains() of their surrogate code points.
955 * This function works faster with a frozen set and with a non-negative string length argument.
956 * @param s start of the string
957 * @param length of the string; can be -1 for NUL-terminated
958 * @param spanCondition specifies the containment condition
959 * @return the start of the trailing substring according to the spanCondition;
960 * the string length if the end of the string does not fit the spanCondition
961 * @stable ICU 3.8
962 * @see USetSpanCondition
963 */
964 int32_t spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const;
965
966 /**
967 * Returns the start of the substring of the input string according to the USetSpanCondition.
968 * Same as <code>spanBack(s.getBuffer(), limit, spanCondition)</code>
969 * after pinning limit to 0<=limit<=s.length().
970 * @param s the string
971 * @param limit the exclusive-end index in the string for the span operation
972 * (use s.length() or INT32_MAX for spanning back from the end of the string)
973 * @param spanCondition specifies the containment condition
974 * @return the start of the substring according to the spanCondition;
975 * the substring s.tempSubStringBetween(start, limit) fulfills the spanCondition
976 * @stable ICU 4.4
977 * @see USetSpanCondition
978 */
979 inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const;
980
981 /**
982 * Returns the length of the initial substring of the input string which
983 * consists only of characters and strings that are contained in this set
984 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
985 * or only of characters and strings that are not contained
986 * in this set (USET_SPAN_NOT_CONTAINED).
987 * See USetSpanCondition for details.
988 * Similar to the strspn() C library function.
989 * Malformed byte sequences are treated according to contains(0xfffd).
990 * This function works faster with a frozen set and with a non-negative string length argument.
991 * @param s start of the string (UTF-8)
992 * @param length of the string; can be -1 for NUL-terminated
993 * @param spanCondition specifies the containment condition
994 * @return the length of the initial substring according to the spanCondition;
995 * 0 if the start of the string does not fit the spanCondition
996 * @stable ICU 3.8
997 * @see USetSpanCondition
998 */
999 int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
1000
1001 /**
1002 * Returns the start of the trailing substring of the input string which
1003 * consists only of characters and strings that are contained in this set
1004 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
1005 * or only of characters and strings that are not contained
1006 * in this set (USET_SPAN_NOT_CONTAINED).
1007 * See USetSpanCondition for details.
1008 * Malformed byte sequences are treated according to contains(0xfffd).
1009 * This function works faster with a frozen set and with a non-negative string length argument.
1010 * @param s start of the string (UTF-8)
1011 * @param length of the string; can be -1 for NUL-terminated
1012 * @param spanCondition specifies the containment condition
1013 * @return the start of the trailing substring according to the spanCondition;
1014 * the string length if the end of the string does not fit the spanCondition
1015 * @stable ICU 3.8
1016 * @see USetSpanCondition
1017 */
1018 int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
1019
1020 /**
1021 * Implement UnicodeMatcher::matches()
1022 * @stable ICU 2.4
1023 */
1024 virtual UMatchDegree matches(const Replaceable& text,
1025 int32_t& offset,
1026 int32_t limit,
1027 UBool incremental) override;
1028
1029 private:
1030 /**
1031 * Returns the longest match for s in text at the given position.
1032 * If limit > start then match forward from start+1 to limit
1033 * matching all characters except s.charAt(0). If limit < start,
1034 * go backward starting from start-1 matching all characters
1035 * except s.charAt(s.length()-1). This method assumes that the
1036 * first character, text.charAt(start), matches s, so it does not
1037 * check it.
1038 * @param text the text to match
1039 * @param start the first character to match. In the forward
1040 * direction, text.charAt(start) is matched against s.charAt(0).
1041 * In the reverse direction, it is matched against
1042 * s.charAt(s.length()-1).
1043 * @param limit the limit offset for matching, either last+1 in
1044 * the forward direction, or last-1 in the reverse direction,
1045 * where last is the index of the last character to match.
1046 * @param s the string to match against text
1047 * @return If part of s matches up to the limit, return |limit -
1048 * start|. If all of s matches before reaching the limit, return
1049 * s.length(). If there is a mismatch between s and text, return
1050 * 0
1051 */
1052 static int32_t matchRest(const Replaceable& text,
1053 int32_t start, int32_t limit,
1054 const UnicodeString& s);
1055
1056 /**
1057 * Returns the smallest value i such that c < list[i]. Caller
1058 * must ensure that c is a legal value or this method will enter
1059 * an infinite loop. This method performs a binary search.
1060 * @param c a character in the range MIN_VALUE..MAX_VALUE
1061 * inclusive
1062 * @return the smallest integer i in the range 0..len-1,
1063 * inclusive, such that c < list[i]
1064 */
1065 int32_t findCodePoint(UChar32 c) const;
1066
1067 public:
1068
1069 /**
1070 * Implementation of UnicodeMatcher API. Union the set of all
1071 * characters that may be matched by this object into the given
1072 * set.
1073 * @param toUnionTo the set into which to union the source characters
1074 * @stable ICU 2.4
1075 */
1076 virtual void addMatchSetTo(UnicodeSet& toUnionTo) const override;
1077
1078 /**
1079 * Returns the index of the given character within this set, where
1080 * the set is ordered by ascending code point. If the character
1081 * is not in this set, return -1. The inverse of this method is
1082 * <code>charAt()</code>.
1083 * @return an index from 0..size()-1, or -1
1084 * @stable ICU 2.4
1085 */
1086 int32_t indexOf(UChar32 c) const;
1087
1088 /**
1089 * Returns the character at the given index within this set, where
1090 * the set is ordered by ascending code point. If the index is
1091 * out of range for characters, returns (UChar32)-1.
1092 * The inverse of this method is <code>indexOf()</code>.
1093 *
1094 * For iteration, this is slower than UnicodeSetIterator or
1095 * getRangeCount()/getRangeStart()/getRangeEnd(),
1096 * because for each call it skips linearly over <code>index</code>
1097 * characters in the ranges.
1098 *
1099 * @param index an index from 0..size()-1
1100 * @return the character at the given index, or (UChar32)-1.
1101 * @stable ICU 2.4
1102 */
1103 UChar32 charAt(int32_t index) const;
1104
1105 #ifndef U_HIDE_DRAFT_API
1106 /**
1107 * Returns a C++ "range" for iterating over the code points of this set.
1108 *
1109 * \code
1110 * UnicodeSet set(u"[abcçカ]", errorCode);
1111 * for (UChar32 c : set.codePoints()) {
1112 * printf("set.codePoint U+%04lx\n", (long)c);
1113 * }
1114 * \endcode
1115 *
1116 * @return a "range" object for iterating over the code points of this set.
1117 * @draft ICU 76
1118 * @see ranges
1119 * @see strings
1120 * @see begin
1121 * @see end
1122 */
codePoints()1123 inline U_HEADER_NESTED_NAMESPACE::USetCodePoints codePoints() const {
1124 return U_HEADER_NESTED_NAMESPACE::USetCodePoints(toUSet());
1125 }
1126
1127 /**
1128 * Returns a C++ "range" for iterating over the code point ranges of this set.
1129 *
1130 * \code
1131 * UnicodeSet set(u"[abcçカ]", errorCode);
1132 * for (auto [start, end] : set.ranges()) {
1133 * printf("set.range U+%04lx..U+%04lx\n", (long)start, (long)end);
1134 * }
1135 * for (auto range : set.ranges()) {
1136 * for (UChar32 c : range) {
1137 * printf("set.range.c U+%04lx\n", (long)c);
1138 * }
1139 * }
1140 * \endcode
1141 *
1142 * @return a "range" object for iterating over the code point ranges of this set.
1143 * @draft ICU 76
1144 * @see codePoints
1145 * @see strings
1146 * @see begin
1147 * @see end
1148 */
ranges()1149 inline U_HEADER_NESTED_NAMESPACE::USetRanges ranges() const {
1150 return U_HEADER_NESTED_NAMESPACE::USetRanges(toUSet());
1151 }
1152
1153 /**
1154 * Returns a C++ "range" for iterating over the empty and multi-character strings of this set.
1155 * Returns each string as a std::u16string_view without copying its contents.
1156 *
1157 * \code
1158 * UnicodeSet set(u"[abcçカ{}{abc}{de}]", errorCode);
1159 * for (auto s : set.strings()) {
1160 * UnicodeString us(s);
1161 * std::string u8;
1162 * printf("set.string length %ld \"%s\"\n", (long)s.length(), us.toUTF8String(u8).c_str());
1163 * }
1164 * \endcode
1165 *
1166 * @return a "range" object for iterating over the strings of this set.
1167 * @draft ICU 76
1168 * @see codePoints
1169 * @see ranges
1170 * @see begin
1171 * @see end
1172 */
strings()1173 inline U_HEADER_NESTED_NAMESPACE::USetStrings strings() const {
1174 return U_HEADER_NESTED_NAMESPACE::USetStrings(toUSet());
1175 }
1176
1177 /**
1178 * Returns a C++ iterator for iterating over all of the elements of this set.
1179 * Convenient all-in-one iteration, but creates a UnicodeString for each
1180 * code point or string.
1181 * (Similar to how Java UnicodeSet *is an* Iterable<String>.)
1182 *
1183 * Code points are returned first, then empty and multi-character strings.
1184 *
1185 * \code
1186 * UnicodeSet set(u"[abcçカ{}{abc}{de}]", errorCode);
1187 * for (auto el : set) {
1188 * std::string u8;
1189 * printf("set.string length %ld \"%s\"\n", (long)el.length(), el.toUTF8String(u8).c_str());
1190 * }
1191 * \endcode
1192 *
1193 * @return an all-elements iterator.
1194 * @draft ICU 76
1195 * @see end
1196 * @see codePoints
1197 * @see ranges
1198 * @see strings
1199 */
begin()1200 inline U_HEADER_NESTED_NAMESPACE::USetElementIterator begin() const {
1201 return U_HEADER_NESTED_NAMESPACE::USetElements(toUSet()).begin();
1202 }
1203
1204 /**
1205 * @return an exclusive-end sentinel for iterating over all of the elements of this set.
1206 * @draft ICU 76
1207 * @see begin
1208 * @see codePoints
1209 * @see ranges
1210 * @see strings
1211 */
end()1212 inline U_HEADER_NESTED_NAMESPACE::USetElementIterator end() const {
1213 return U_HEADER_NESTED_NAMESPACE::USetElements(toUSet()).end();
1214 }
1215 #endif // U_HIDE_DRAFT_API
1216
1217 /**
1218 * Adds the specified range to this set if it is not already
1219 * present. If this set already contains the specified range,
1220 * the call leaves this set unchanged. If <code>start > end</code>
1221 * then an empty range is added, leaving the set unchanged.
1222 * This is equivalent to a boolean logic OR, or a set UNION.
1223 * A frozen set will not be modified.
1224 *
1225 * @param start first character, inclusive, of range to be added
1226 * to this set.
1227 * @param end last character, inclusive, of range to be added
1228 * to this set.
1229 * @stable ICU 2.0
1230 */
1231 virtual UnicodeSet& add(UChar32 start, UChar32 end);
1232
1233 /**
1234 * Adds the specified character to this set if it is not already
1235 * present. If this set already contains the specified character,
1236 * the call leaves this set unchanged.
1237 * A frozen set will not be modified.
1238 *
1239 * @param c the character (code point)
1240 * @return this object, for chaining
1241 * @stable ICU 2.0
1242 */
1243 UnicodeSet& add(UChar32 c);
1244
1245 /**
1246 * Adds the specified multicharacter string to this set if it is not already
1247 * present. If this set already contains the multicharacter string,
1248 * the call leaves this set unchanged.
1249 * Thus "ch" => {"ch"}
1250 * A frozen set will not be modified.
1251 *
1252 * @param s the source string
1253 * @return this object, for chaining
1254 * @stable ICU 2.4
1255 */
1256 UnicodeSet& add(const UnicodeString& s);
1257
1258 private:
1259 /**
1260 * @return a code point IF the string consists of a single one.
1261 * otherwise returns -1.
1262 * @param s string to test
1263 */
1264 static int32_t getSingleCP(const UnicodeString& s);
1265
1266 void _add(const UnicodeString& s);
1267
1268 public:
1269 /**
1270 * Adds each of the characters in this string to the set. Note: "ch" => {"c", "h"}
1271 * If this set already contains any particular character, it has no effect on that character.
1272 * A frozen set will not be modified.
1273 * @param s the source string
1274 * @return this object, for chaining
1275 * @stable ICU 2.4
1276 */
1277 UnicodeSet& addAll(const UnicodeString& s);
1278
1279 /**
1280 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
1281 * A frozen set will not be modified.
1282 * @param s the source string
1283 * @return this object, for chaining
1284 * @stable ICU 2.4
1285 */
1286 UnicodeSet& retainAll(const UnicodeString& s);
1287
1288 /**
1289 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
1290 * A frozen set will not be modified.
1291 * @param s the source string
1292 * @return this object, for chaining
1293 * @stable ICU 2.4
1294 */
1295 UnicodeSet& complementAll(const UnicodeString& s);
1296
1297 /**
1298 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
1299 * A frozen set will not be modified.
1300 * @param s the source string
1301 * @return this object, for chaining
1302 * @stable ICU 2.4
1303 */
1304 UnicodeSet& removeAll(const UnicodeString& s);
1305
1306 /**
1307 * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1308 *
1309 * @param s the source string
1310 * @return a newly created set containing the given string.
1311 * The caller owns the return object and is responsible for deleting it.
1312 * @stable ICU 2.4
1313 */
1314 static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s);
1315
1316
1317 /**
1318 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1319 * @param s the source string
1320 * @return a newly created set containing the given characters
1321 * The caller owns the return object and is responsible for deleting it.
1322 * @stable ICU 2.4
1323 */
1324 static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s);
1325
1326 /**
1327 * Retain only the elements in this set that are contained in the
1328 * specified range. If <code>start > end</code> then an empty range is
1329 * retained, leaving the set empty. This is equivalent to
1330 * a boolean logic AND, or a set INTERSECTION.
1331 * A frozen set will not be modified.
1332 *
1333 * @param start first character, inclusive, of range
1334 * @param end last character, inclusive, of range
1335 * @stable ICU 2.0
1336 */
1337 virtual UnicodeSet& retain(UChar32 start, UChar32 end);
1338
1339
1340 /**
1341 * Retain the specified character from this set if it is present.
1342 * A frozen set will not be modified.
1343 *
1344 * @param c the character (code point)
1345 * @return this object, for chaining
1346 * @stable ICU 2.0
1347 */
1348 UnicodeSet& retain(UChar32 c);
1349
1350 /**
1351 * Retains only the specified string from this set if it is present.
1352 * Upon return this set will be empty if it did not contain s, or
1353 * will only contain s if it did contain s.
1354 * A frozen set will not be modified.
1355 *
1356 * @param s the source string
1357 * @return this object, for chaining
1358 * @stable ICU 69
1359 */
1360 UnicodeSet& retain(const UnicodeString &s);
1361
1362 /**
1363 * Removes the specified range from this set if it is present.
1364 * The set will not contain the specified range once the call
1365 * returns. If <code>start > end</code> then an empty range is
1366 * removed, leaving the set unchanged.
1367 * A frozen set will not be modified.
1368 *
1369 * @param start first character, inclusive, of range to be removed
1370 * from this set.
1371 * @param end last character, inclusive, of range to be removed
1372 * from this set.
1373 * @stable ICU 2.0
1374 */
1375 virtual UnicodeSet& remove(UChar32 start, UChar32 end);
1376
1377 /**
1378 * Removes the specified character from this set if it is present.
1379 * The set will not contain the specified character once the call
1380 * returns.
1381 * A frozen set will not be modified.
1382 *
1383 * @param c the character (code point)
1384 * @return this object, for chaining
1385 * @stable ICU 2.0
1386 */
1387 UnicodeSet& remove(UChar32 c);
1388
1389 /**
1390 * Removes the specified string from this set if it is present.
1391 * The set will not contain the specified string once the call
1392 * returns.
1393 * A frozen set will not be modified.
1394 * @param s the source string
1395 * @return this object, for chaining
1396 * @stable ICU 2.4
1397 */
1398 UnicodeSet& remove(const UnicodeString& s);
1399
1400 /**
1401 * This is equivalent to
1402 * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1403 *
1404 * <strong>Note:</strong> This performs a symmetric difference with all code points
1405 * <em>and thus retains all multicharacter strings</em>.
1406 * In order to achieve a “code point complement” (all code points minus this set),
1407 * the easiest is to <code>.complement().removeAllStrings()</code>.
1408 *
1409 * A frozen set will not be modified.
1410 * @stable ICU 2.0
1411 */
1412 virtual UnicodeSet& complement();
1413
1414 /**
1415 * Complements the specified range in this set. Any character in
1416 * the range will be removed if it is in this set, or will be
1417 * added if it is not in this set. If <code>start > end</code>
1418 * then an empty range is complemented, leaving the set unchanged.
1419 * This is equivalent to a boolean logic XOR.
1420 * A frozen set will not be modified.
1421 *
1422 * @param start first character, inclusive, of range
1423 * @param end last character, inclusive, of range
1424 * @stable ICU 2.0
1425 */
1426 virtual UnicodeSet& complement(UChar32 start, UChar32 end);
1427
1428 /**
1429 * Complements the specified character in this set. The character
1430 * will be removed if it is in this set, or will be added if it is
1431 * not in this set.
1432 * A frozen set will not be modified.
1433 *
1434 * @param c the character (code point)
1435 * @return this object, for chaining
1436 * @stable ICU 2.0
1437 */
1438 UnicodeSet& complement(UChar32 c);
1439
1440 /**
1441 * Complement the specified string in this set.
1442 * The string will be removed if it is in this set, or will be added if it is not in this set.
1443 * A frozen set will not be modified.
1444 *
1445 * @param s the string to complement
1446 * @return this object, for chaining
1447 * @stable ICU 2.4
1448 */
1449 UnicodeSet& complement(const UnicodeString& s);
1450
1451 /**
1452 * Adds all of the elements in the specified set to this set if
1453 * they're not already present. This operation effectively
1454 * modifies this set so that its value is the <i>union</i> of the two
1455 * sets. The behavior of this operation is unspecified if the specified
1456 * collection is modified while the operation is in progress.
1457 * A frozen set will not be modified.
1458 *
1459 * @param c set whose elements are to be added to this set.
1460 * @see #add(UChar32, UChar32)
1461 * @stable ICU 2.0
1462 */
1463 virtual UnicodeSet& addAll(const UnicodeSet& c);
1464
1465 /**
1466 * Retains only the elements in this set that are contained in the
1467 * specified set. In other words, removes from this set all of
1468 * its elements that are not contained in the specified set. This
1469 * operation effectively modifies this set so that its value is
1470 * the <i>intersection</i> of the two sets.
1471 * A frozen set will not be modified.
1472 *
1473 * @param c set that defines which elements this set will retain.
1474 * @stable ICU 2.0
1475 */
1476 virtual UnicodeSet& retainAll(const UnicodeSet& c);
1477
1478 /**
1479 * Removes from this set all of its elements that are contained in the
1480 * specified set. This operation effectively modifies this
1481 * set so that its value is the <i>asymmetric set difference</i> of
1482 * the two sets.
1483 * A frozen set will not be modified.
1484 *
1485 * @param c set that defines which elements will be removed from
1486 * this set.
1487 * @stable ICU 2.0
1488 */
1489 virtual UnicodeSet& removeAll(const UnicodeSet& c);
1490
1491 /**
1492 * Complements in this set all elements contained in the specified
1493 * set. Any character in the other set will be removed if it is
1494 * in this set, or will be added if it is not in this set.
1495 * A frozen set will not be modified.
1496 *
1497 * @param c set that defines which elements will be xor'ed from
1498 * this set.
1499 * @stable ICU 2.4
1500 */
1501 virtual UnicodeSet& complementAll(const UnicodeSet& c);
1502
1503 /**
1504 * Removes all of the elements from this set. This set will be
1505 * empty after this call returns.
1506 * A frozen set will not be modified.
1507 * @stable ICU 2.0
1508 */
1509 virtual UnicodeSet& clear();
1510
1511 /**
1512 * Close this set over the given attribute. For the attribute
1513 * USET_CASE_INSENSITIVE, the result is to modify this set so that:
1514 *
1515 * 1. For each character or string 'a' in this set, all strings or
1516 * characters 'b' such that foldCase(a) == foldCase(b) are added
1517 * to this set.
1518 *
1519 * 2. For each string 'e' in the resulting set, if e !=
1520 * foldCase(e), 'e' will be removed.
1521 *
1522 * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]
1523 *
1524 * (Here foldCase(x) refers to the operation u_strFoldCase, and a
1525 * == b denotes that the contents are the same, not pointer
1526 * comparison.)
1527 *
1528 * A frozen set will not be modified.
1529 *
1530 * @param attribute bitmask for attributes to close over.
1531 * Valid options:
1532 * At most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
1533 * These case options are mutually exclusive.
1534 * Unrelated options bits are ignored.
1535 * @return a reference to this set.
1536 * @stable ICU 4.2
1537 */
1538 UnicodeSet& closeOver(int32_t attribute);
1539
1540 /**
1541 * Remove all strings from this set.
1542 *
1543 * @return a reference to this set.
1544 * @stable ICU 4.2
1545 */
1546 virtual UnicodeSet &removeAllStrings();
1547
1548 /**
1549 * Iteration method that returns the number of ranges contained in
1550 * this set.
1551 * @see #getRangeStart
1552 * @see #getRangeEnd
1553 * @stable ICU 2.4
1554 */
1555 virtual int32_t getRangeCount() const;
1556
1557 /**
1558 * Iteration method that returns the first character in the
1559 * specified range of this set.
1560 * @see #getRangeCount
1561 * @see #getRangeEnd
1562 * @stable ICU 2.4
1563 */
1564 virtual UChar32 getRangeStart(int32_t index) const;
1565
1566 /**
1567 * Iteration method that returns the last character in the
1568 * specified range of this set.
1569 * @see #getRangeCount
1570 * @see #getRangeStart
1571 * @stable ICU 2.4
1572 */
1573 virtual UChar32 getRangeEnd(int32_t index) const;
1574
1575 /**
1576 * Serializes this set into an array of 16-bit integers. Serialization
1577 * (currently) only records the characters in the set; multicharacter
1578 * strings are ignored.
1579 *
1580 * The array has following format (each line is one 16-bit
1581 * integer):
1582 *
1583 * length = (n+2*m) | (m!=0?0x8000:0)
1584 * bmpLength = n; present if m!=0
1585 * bmp[0]
1586 * bmp[1]
1587 * ...
1588 * bmp[n-1]
1589 * supp-high[0]
1590 * supp-low[0]
1591 * supp-high[1]
1592 * supp-low[1]
1593 * ...
1594 * supp-high[m-1]
1595 * supp-low[m-1]
1596 *
1597 * The array starts with a header. After the header are n bmp
1598 * code points, then m supplementary code points. Either n or m
1599 * or both may be zero. n+2*m is always <= 0x7FFF.
1600 *
1601 * If there are no supplementary characters (if m==0) then the
1602 * header is one 16-bit integer, 'length', with value n.
1603 *
1604 * If there are supplementary characters (if m!=0) then the header
1605 * is two 16-bit integers. The first, 'length', has value
1606 * (n+2*m)|0x8000. The second, 'bmpLength', has value n.
1607 *
1608 * After the header the code points are stored in ascending order.
1609 * Supplementary code points are stored as most significant 16
1610 * bits followed by least significant 16 bits.
1611 *
1612 * @param dest pointer to buffer of destCapacity 16-bit integers.
1613 * May be nullptr only if destCapacity is zero.
1614 * @param destCapacity size of dest, or zero. Must not be negative.
1615 * @param ec error code. Will be set to U_INDEX_OUTOFBOUNDS_ERROR
1616 * if n+2*m > 0x7FFF. Will be set to U_BUFFER_OVERFLOW_ERROR if
1617 * n+2*m+(m!=0?2:1) > destCapacity.
1618 * @return the total length of the serialized format, including
1619 * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other
1620 * than U_BUFFER_OVERFLOW_ERROR.
1621 * @stable ICU 2.4
1622 */
1623 int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const;
1624
1625 /**
1626 * Reallocate this object's internal structures to take up the least
1627 * possible space, without changing this object's value.
1628 * A frozen set will not be modified.
1629 * @stable ICU 2.4
1630 */
1631 virtual UnicodeSet& compact();
1632
1633 /**
1634 * Return the class ID for this class. This is useful only for
1635 * comparing to a return value from getDynamicClassID(). For example:
1636 * <pre>
1637 * . Base* polymorphic_pointer = createPolymorphicObject();
1638 * . if (polymorphic_pointer->getDynamicClassID() ==
1639 * . Derived::getStaticClassID()) ...
1640 * </pre>
1641 * @return The class ID for all objects of this class.
1642 * @stable ICU 2.0
1643 */
1644 static UClassID U_EXPORT2 getStaticClassID();
1645
1646 /**
1647 * Implement UnicodeFunctor API.
1648 *
1649 * @return The class ID for this object. All objects of a given
1650 * class have the same class ID. Objects of other classes have
1651 * different class IDs.
1652 * @stable ICU 2.4
1653 */
1654 virtual UClassID getDynamicClassID() const override;
1655
1656 private:
1657
1658 // Private API for the USet API
1659
1660 friend class USetAccess;
1661
1662 const UnicodeString* getString(int32_t index) const;
1663
1664 //----------------------------------------------------------------
1665 // RuleBasedTransliterator support
1666 //----------------------------------------------------------------
1667
1668 private:
1669
1670 /**
1671 * Returns <tt>true</tt> if this set contains any character whose low byte
1672 * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
1673 * indexing.
1674 */
1675 virtual UBool matchesIndexValue(uint8_t v) const override;
1676
1677 private:
1678 friend class RBBIRuleScanner;
1679
1680 //----------------------------------------------------------------
1681 // Implementation: Clone as thawed (see ICU4J Freezable)
1682 //----------------------------------------------------------------
1683
1684 UnicodeSet(const UnicodeSet& o, UBool /* asThawed */);
1685 UnicodeSet& copyFrom(const UnicodeSet& o, UBool asThawed);
1686
1687 //----------------------------------------------------------------
1688 // Implementation: Pattern parsing
1689 //----------------------------------------------------------------
1690
1691 void applyPatternIgnoreSpace(const UnicodeString& pattern,
1692 ParsePosition& pos,
1693 const SymbolTable* symbols,
1694 UErrorCode& status);
1695
1696 void applyPattern(RuleCharacterIterator& chars,
1697 const SymbolTable* symbols,
1698 UnicodeString& rebuiltPat,
1699 uint32_t options,
1700 UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
1701 int32_t depth,
1702 UErrorCode& ec);
1703
/**
 * Private case-closure helper.
 * NOTE(review): presumably closes this set over case-insensitive
 * matches, with <code>simple</code> selecting simple case mappings -
 * confirm against the implementation.
 */
void closeOverCaseInsensitive(bool simple);

/**
 * Private case-closure helper.
 * NOTE(review): presumably adds the case mappings of this set's
 * contents to the set - confirm against the implementation.
 */
void closeOverAddCaseMappings();
1706
1707 //----------------------------------------------------------------
1708 // Implementation: Utility methods
1709 //----------------------------------------------------------------
1710
1711 static int32_t nextCapacity(int32_t minCapacity);
1712
1713 bool ensureCapacity(int32_t newLen);
1714
1715 bool ensureBufferCapacity(int32_t newLen);
1716
1717 void swapBuffers();
1718
1719 UBool allocateStrings(UErrorCode &status);
1720 int32_t stringsSize() const;
1721 UBool stringsContains(const UnicodeString &s) const;
1722
1723 UnicodeString& _toPattern(UnicodeString& result,
1724 UBool escapeUnprintable) const;
1725
1726 UnicodeString& _generatePattern(UnicodeString& result,
1727 UBool escapeUnprintable) const;
1728
1729 static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
1730
1731 static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
1732
1733 static void _appendToPat(UnicodeString &result, UChar32 start, UChar32 end,
1734 UBool escapeUnprintable);
1735
1736 //----------------------------------------------------------------
1737 // Implementation: Fundamental operators
1738 //----------------------------------------------------------------
1739
1740 void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
1741
1742 void add(const UChar32* other, int32_t otherLen, int8_t polarity);
1743
1744 void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
1745
1746 /**
1747 * Return true if the given position, in the given pattern, appears
1748 * to be the start of a property set pattern [:foo:], \\p{foo}, or
1749 * \\P{foo}, or \\N{name}.
1750 */
1751 static UBool resemblesPropertyPattern(const UnicodeString& pattern,
1752 int32_t pos);
1753
1754 static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
1755 int32_t iterOpts);
1756
1757 /**
1758 * Parse the given property pattern at the given parse position
1759 * and set this UnicodeSet to the result.
1760 *
1761 * The original design document is out of date, but still useful.
1762 * Ignore the property and value names:
1763 * https://htmlpreview.github.io/?https://github.com/unicode-org/icu-docs/blob/main/design/unicodeset_properties.html
1764 *
1765 * Recognized syntax:
1766 *
1767 * [:foo:] [:^foo:] - white space not allowed within "[:" or ":]"
1768 * \\p{foo} \\P{foo} - white space not allowed within "\\p" or "\\P"
1769 * \\N{name} - white space not allowed within "\\N"
1770 *
1771 * Other than the above restrictions, Unicode Pattern_White_Space characters are ignored.
1772 * Case is ignored except in "\\p" and "\\P" and "\\N". In 'name' leading
1773 * and trailing space is deleted, and internal runs of whitespace
1774 * are collapsed to a single space.
1775 *
1776 * We support binary properties, enumerated properties, and the
1777 * following non-enumerated properties:
1778 *
1779 * Numeric_Value
1780 * Name
1781 * Unicode_1_Name
1782 *
1783 * @param pattern the pattern string
1784 * @param ppos on entry, the position at which to begin parsing.
1785 * This should be one of the locations marked '^':
1786 *
1787 * [:blah:] \\p{blah} \\P{blah} \\N{name}
1788 * ^ % ^ % ^ % ^ %
1789 *
1790 * On return, the position after the last character parsed, that is,
1791 * the locations marked '%'. If the parse fails, ppos is returned
1792 * unchanged.
1793 * @param ec status
1794 * @return a reference to this.
1795 */
1796 UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
1797 ParsePosition& ppos,
1798 UErrorCode &ec);
1799
1800 void applyPropertyPattern(RuleCharacterIterator& chars,
1801 UnicodeString& rebuiltPat,
1802 UErrorCode& ec);
1803
1804 /**
1805 * A filter that returns true if the given code point should be
1806 * included in the UnicodeSet being constructed.
1807 */
1808 typedef UBool (*Filter)(UChar32 codePoint, void* context);
1809
1810 /**
1811 * Given a filter, set this UnicodeSet to the code points
1812 * contained by that filter. The filter MUST be
1813 * property-conformant. That is, if it returns value v for one
1814 * code point, then it must return v for all affiliated code
1815 * points, as defined by the inclusions list. See
1816 * getInclusions().
1817 * src is a UPropertySource value.
1818 */
1819 void applyFilter(Filter filter,
1820 void* context,
1821 const UnicodeSet* inclusions,
1822 UErrorCode &status);
1823
1824 /**
1825 * Set the new pattern to cache.
1826 */
setPattern(const UnicodeString & newPat)1827 void setPattern(const UnicodeString& newPat) {
1828 setPattern(newPat.getBuffer(), newPat.length());
1829 }
1830 void setPattern(const char16_t *newPat, int32_t newPatLen);
1831 /**
1832 * Release existing cached pattern.
1833 */
1834 void releasePattern();
1835
1836 friend class UnicodeSetIterator;
1837 };
1838
1839
1840
1841 inline bool UnicodeSet::operator!=(const UnicodeSet& o) const {
1842 return !operator==(o);
1843 }
1844
isFrozen()1845 inline UBool UnicodeSet::isFrozen() const {
1846 return bmpSet != nullptr || stringSpan != nullptr;
1847 }
1848
containsSome(UChar32 start,UChar32 end)1849 inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
1850 return !containsNone(start, end);
1851 }
1852
containsSome(const UnicodeSet & s)1853 inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const {
1854 return !containsNone(s);
1855 }
1856
containsSome(const UnicodeString & s)1857 inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {
1858 return !containsNone(s);
1859 }
1860
isBogus()1861 inline UBool UnicodeSet::isBogus() const {
1862 return fFlags & kIsBogus;
1863 }
1864
fromUSet(USet * uset)1865 inline UnicodeSet *UnicodeSet::fromUSet(USet *uset) {
1866 return reinterpret_cast<UnicodeSet *>(uset);
1867 }
1868
fromUSet(const USet * uset)1869 inline const UnicodeSet *UnicodeSet::fromUSet(const USet *uset) {
1870 return reinterpret_cast<const UnicodeSet *>(uset);
1871 }
1872
toUSet()1873 inline USet *UnicodeSet::toUSet() {
1874 return reinterpret_cast<USet *>(this);
1875 }
1876
toUSet()1877 inline const USet *UnicodeSet::toUSet() const {
1878 return reinterpret_cast<const USet *>(this);
1879 }
1880
span(const UnicodeString & s,int32_t start,USetSpanCondition spanCondition)1881 inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const {
1882 int32_t sLength=s.length();
1883 if(start<0) {
1884 start=0;
1885 } else if(start>sLength) {
1886 start=sLength;
1887 }
1888 return start+span(s.getBuffer()+start, sLength-start, spanCondition);
1889 }
1890
spanBack(const UnicodeString & s,int32_t limit,USetSpanCondition spanCondition)1891 inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const {
1892 int32_t sLength=s.length();
1893 if(limit<0) {
1894 limit=0;
1895 } else if(limit>sLength) {
1896 limit=sLength;
1897 }
1898 return spanBack(s.getBuffer(), limit, spanCondition);
1899 }
1900
1901 U_NAMESPACE_END
1902
1903 #endif /* U_SHOW_CPLUSPLUS_API */
1904
1905 #endif
1906