1 /*
2 ***************************************************************************
3 * Copyright (C) 1999-2007, International Business Machines Corporation
4 * and others. All Rights Reserved.
5 ***************************************************************************
6 * Date Name Description
7 * 10/20/99 alan Creation.
8 ***************************************************************************
9 */
10
11 #ifndef UNICODESET_H
12 #define UNICODESET_H
13
14 #include "unicode/unifilt.h"
15 #include "unicode/unistr.h"
16 #include "unicode/uset.h"
17
18 /**
19 * \file
20 * \brief C++ API: Unicode Set
21 */
22
23 U_NAMESPACE_BEGIN
24
25 class BMPSet;
26 class ParsePosition;
27 class SymbolTable;
28 class UnicodeSetStringSpan;
29 class UVector;
30 class RuleCharacterIterator;
31
32 /**
33 * A mutable set of Unicode characters and multicharacter strings. Objects of this class
34 * represent <em>character classes</em> used in regular expressions.
35 * A character class specifies a subset of Unicode code points. Legal
36 * code points are U+0000 to U+10FFFF, inclusive.
37 *
38 * <p>The UnicodeSet class is not designed to be subclassed.
39 *
40 * <p><code>UnicodeSet</code> supports two APIs. The first is the
41 * <em>operand</em> API that allows the caller to modify the value of
42 * a <code>UnicodeSet</code> object. It conforms to Java 2's
43 * <code>java.util.Set</code> interface, although
44 * <code>UnicodeSet</code> does not actually implement that
45 * interface. All methods of <code>Set</code> are supported, with the
46 * modification that they take a character range or single character
47 * instead of an <code>Object</code>, and they take a
48 * <code>UnicodeSet</code> instead of a <code>Collection</code>. The
49 * operand API may be thought of in terms of boolean logic: a boolean
50 * OR is implemented by <code>add</code>, a boolean AND is implemented
51 * by <code>retain</code>, a boolean XOR is implemented by
52 * <code>complement</code> taking an argument, and a boolean NOT is
53 * implemented by <code>complement</code> with no argument. In terms
54 * of traditional set theory function names, <code>add</code> is a
55 * union, <code>retain</code> is an intersection, <code>remove</code>
56 * is an asymmetric difference, and <code>complement</code> with no
57 * argument is a set complement with respect to the superset range
58 * <code>MIN_VALUE-MAX_VALUE</code>.
59 *
60 * <p>The second API is the
61 * <code>applyPattern()</code>/<code>toPattern()</code> API from the
62 * <code>java.text.Format</code>-derived classes. Unlike the
63 * methods that add characters, add categories, and control the logic
64 * of the set, the method <code>applyPattern()</code> sets all
65 * attributes of a <code>UnicodeSet</code> at once, based on a
66 * string pattern.
67 *
68 * <p><b>Pattern syntax</b></p>
69 *
70 * Patterns are accepted by the constructors and the
71 * <code>applyPattern()</code> methods and returned by the
72 * <code>toPattern()</code> method. These patterns follow a syntax
73 * similar to that employed by version 8 regular expression character
74 * classes. Here are some simple examples:
75 *
76 * \htmlonly<blockquote>\endhtmlonly
77 * <table>
78 * <tr align="top">
79 * <td nowrap valign="top" align="left"><code>[]</code></td>
80 * <td valign="top">No characters</td>
81 * </tr><tr align="top">
82 * <td nowrap valign="top" align="left"><code>[a]</code></td>
83 * <td valign="top">The character 'a'</td>
84 * </tr><tr align="top">
85 * <td nowrap valign="top" align="left"><code>[ae]</code></td>
86 * <td valign="top">The characters 'a' and 'e'</td>
87 * </tr>
88 * <tr>
89 * <td nowrap valign="top" align="left"><code>[a-e]</code></td>
90 * <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code
91 * point order</td>
92 * </tr>
93 * <tr>
94 * <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td>
95 * <td valign="top">The character U+4E01</td>
96 * </tr>
97 * <tr>
98 * <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td>
99 * <td valign="top">The character 'a' and the multicharacter strings "ab" and
100 * "ac"</td>
101 * </tr>
102 * <tr>
103 * <td nowrap valign="top" align="left"><code>[\\p{Lu}]</code></td>
104 * <td valign="top">All characters in the general category Uppercase Letter</td>
105 * </tr>
106 * </table>
107 * \htmlonly</blockquote>\endhtmlonly
108 *
109 * Any character may be preceded by a backslash in order to remove any special
110 * meaning. White space characters, as defined by UCharacter.isWhitespace(), are
111 * ignored, unless they are escaped.
112 *
113 * <p>Property patterns specify a set of characters having a certain
114 * property as defined by the Unicode standard. Both the POSIX-like
115 * "[:Lu:]" and the Perl-like syntax "\\p{Lu}" are recognized. For a
116 * complete list of supported property patterns, see the User's Guide
117 * for UnicodeSet at
118 * <a href="http://icu-project.org/userguide/unicodeSet.html">
119 * http://icu-project.org/userguide/unicodeSet.html</a>.
120 * Actual determination of property data is defined by the underlying
121 * Unicode database as implemented by UCharacter.
122 *
123 * <p>Patterns specify individual characters, ranges of characters, and
124 * Unicode property sets. When elements are concatenated, they
125 * specify their union. To complement a set, place a '^' immediately
126 * after the opening '['. Property patterns are inverted by modifying
127 * their delimiters; "[:^foo:]" and "\\P{foo}". In any other location,
128 * '^' has no special meaning.
129 *
130 * <p>Ranges are indicated by placing a '-' between two
131 * characters, as in "a-z". This specifies the range of all
132 * characters from the left to the right, in Unicode order. If the
133 * left character is greater than or equal to the
134 * right character it is a syntax error. If a '-' occurs as the first
135 * character after the opening '[' or '[^', or if it occurs as the
136 * last character before the closing ']', then it is taken as a
137 * literal. Thus "[a\-b]", "[-ab]", and "[ab-]" all indicate the same
138 * set of three characters, 'a', 'b', and '-'.
139 *
140 * <p>Sets may be intersected using the '&' operator or the asymmetric
141 * set difference may be taken using the '-' operator, for example,
142 * "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
143 * with values less than 4096. Operators ('&' and '-') have equal
144 * precedence and bind left-to-right. Thus
145 * "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
146 * "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for
147 * difference; intersection is commutative.
148 *
149 * <table>
150 * <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a'
151 * <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a'
152 * through 'z' and all letters in between, in Unicode order
153 * <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
154 * all characters but 'a' through 'z',
155 * that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
156 * <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
157 * <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
158 * <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
159 * <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
160 * <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
161 * <td>The asymmetric difference of sets specified by <em>pat1</em> and
162 * <em>pat2</em>
163 * <tr valign=top><td nowrap><code>[:Lu:] or \\p{Lu}</code>
164 * <td>The set of characters having the specified
165 * Unicode property; in
166 * this case, Unicode uppercase letters
167 * <tr valign=top><td nowrap><code>[:^Lu:] or \\P{Lu}</code>
168 * <td>The set of characters <em>not</em> having the given
169 * Unicode property
170 * </table>
171 *
172 * <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p>
173 *
174 * <p><b>Formal syntax</b></p>
175 *
176 * \htmlonly<blockquote>\endhtmlonly
177 * <table>
178 * <tr align="top">
179 * <td nowrap valign="top" align="right"><code>pattern := </code></td>
180 * <td valign="top"><code>('[' '^'? item* ']') |
181 * property</code></td>
182 * </tr>
183 * <tr align="top">
184 * <td nowrap valign="top" align="right"><code>item := </code></td>
185 * <td valign="top"><code>char | (char '-' char) | pattern-expr<br>
186 * </code></td>
187 * </tr>
188 * <tr align="top">
189 * <td nowrap valign="top" align="right"><code>pattern-expr := </code></td>
190 * <td valign="top"><code>pattern | pattern-expr pattern |
191 * pattern-expr op pattern<br>
192 * </code></td>
193 * </tr>
194 * <tr align="top">
195 * <td nowrap valign="top" align="right"><code>op := </code></td>
196 * <td valign="top"><code>'&' | '-'<br>
197 * </code></td>
198 * </tr>
199 * <tr align="top">
200 * <td nowrap valign="top" align="right"><code>special := </code></td>
201 * <td valign="top"><code>'[' | ']' | '-'<br>
202 * </code></td>
203 * </tr>
204 * <tr align="top">
205 * <td nowrap valign="top" align="right"><code>char := </code></td>
206 * <td valign="top"><em>any character that is not</em><code> special<br>
207 * | ('\' </code><em>any character</em><code>)<br>
208 * | ('\\u' hex hex hex hex)<br>
209 * </code></td>
210 * </tr>
211 * <tr align="top">
212 * <td nowrap valign="top" align="right"><code>hex := </code></td>
213 * <td valign="top"><em>any character for which
214 * </em><code>Character.digit(c, 16)</code><em>
215 * returns a non-negative result</em></td>
216 * </tr>
217 * <tr>
218 * <td nowrap valign="top" align="right"><code>property := </code></td>
219 * <td valign="top"><em>a Unicode property set pattern</em></td>
220 * </tr>
221 * </table>
222 * <br>
223 * <table border="1">
224 * <tr>
225 * <td>Legend: <table>
226 * <tr>
227 * <td nowrap valign="top"><code>a := b</code></td>
228 * <td width="20" valign="top"> </td>
229 * <td valign="top"><code>a</code> may be replaced by <code>b</code> </td>
230 * </tr>
231 * <tr>
232 * <td nowrap valign="top"><code>a?</code></td>
233 * <td valign="top"></td>
234 * <td valign="top">zero or one instance of <code>a</code><br>
235 * </td>
236 * </tr>
237 * <tr>
238 * <td nowrap valign="top"><code>a*</code></td>
239 * <td valign="top"></td>
240 * <td valign="top">zero or more instances of <code>a</code><br>
241 * </td>
242 * </tr>
243 * <tr>
244 * <td nowrap valign="top"><code>a | b</code></td>
245 * <td valign="top"></td>
246 * <td valign="top">either <code>a</code> or <code>b</code><br>
247 * </td>
248 * </tr>
249 * <tr>
250 * <td nowrap valign="top"><code>'a'</code></td>
251 * <td valign="top"></td>
252 * <td valign="top">the literal string between the quotes </td>
253 * </tr>
254 * </table>
255 * </td>
256 * </tr>
257 * </table>
258 * \htmlonly</blockquote>\endhtmlonly
259 *
260 * @author Alan Liu
261 * @stable ICU 2.0
262 */
263 class U_COMMON_API UnicodeSet : public UnicodeFilter {
264
265 int32_t len; // length of list used; 0 <= len <= capacity
266 int32_t capacity; // capacity of list
267 UChar32* list; // MUST be terminated with HIGH
268 BMPSet *bmpSet; // The set is frozen iff either bmpSet or stringSpan is not NULL.
269 UChar32* buffer; // internal buffer, may be NULL
270 int32_t bufferCapacity; // capacity of buffer
271 int32_t patLen; // NOTE(review): presumably the length of pat -- confirm against the implementation
272
273 /**
274 * The pattern representation of this set. This may not be the
275 * most economical pattern. It is the pattern supplied to
276 * applyPattern(), with variables substituted and whitespace
277 * removed. For sets constructed without applyPattern(), or
278 * modified using the non-pattern API, this string will be empty,
279 * indicating that toPattern() must generate a pattern
280 * representation from the inversion list.
281 */
282 UChar *pat;
283 UVector* strings; // maintained in sorted order
284 UnicodeSetStringSpan *stringSpan; // a non-NULL value means the set is frozen (see bmpSet)
285
286 public:
287
288 enum {
289 /**
290 * Lowest code point (U+0000) that can be stored in a UnicodeSet.
291 * @stable ICU 2.4
292 */
293 MIN_VALUE = 0,
294
295 /**
296 * Highest code point (U+10FFFF) that can be stored in a UnicodeSet.
297 * @stable ICU 2.4
298 */
299 MAX_VALUE = 0x10ffff
300 };
301
302 //----------------------------------------------------------------
303 // Constructors &c
304 //----------------------------------------------------------------
305
306 public:
307
308 /**
309 * Constructs an empty set.
310 * @stable ICU 2.0
311 */
312 UnicodeSet();
313
314 /**
315 * Constructs a set containing the given range. If <code>start >
316 * end</code> then an empty set is created.
317 *
318 * @param start first character, inclusive, of range
319 * @param end last character, inclusive, of range
320 * @stable ICU 2.4
321 */
322 UnicodeSet(UChar32 start, UChar32 end);
323
324 /**
325 * Constructs a set from the given pattern. See the class
326 * description for the syntax of the pattern language.
327 * @param pattern a string specifying what characters are in the set
328 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
329 * contains a syntax error.
330 * @stable ICU 2.0
331 */
332 UnicodeSet(const UnicodeString& pattern,
333 UErrorCode& status);
334
335 /**
336 * Constructs a set from the given pattern. See the class
337 * description for the syntax of the pattern language.
338 * @param pattern a string specifying what characters are in the set
339 * @param options bitmask for options to apply to the pattern.
340 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
341 * @param symbols a symbol table mapping variable names to values
342 * and stand-in characters to UnicodeSets; may be NULL
343 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
344 * contains a syntax error.
345 * @internal
346 */
347 UnicodeSet(const UnicodeString& pattern,
348 uint32_t options,
349 const SymbolTable* symbols,
350 UErrorCode& status);
351
352 /**
353 * Constructs a set from the given pattern. See the class description
354 * for the syntax of the pattern language.
355 * @param pattern a string specifying what characters are in the set
356 * @param pos on input, the position in pattern at which to start parsing.
357 * On output, the position after the last character parsed.
358 * @param options bitmask for options to apply to the pattern.
359 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
360 * @param symbols a symbol table mapping variable names to values
361 * and stand-in characters to UnicodeSets; may be NULL
362 * @param status input-output error code
363 * @stable ICU 2.8
364 */
365 UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
366 uint32_t options,
367 const SymbolTable* symbols,
368 UErrorCode& status);
369
370 /**
371 * Constructs a set that is identical to the given UnicodeSet.
372 * @stable ICU 2.0
373 */
374 UnicodeSet(const UnicodeSet& o);
375
376 /**
377 * Destructs the set.
378 * @stable ICU 2.0
379 */
380 virtual ~UnicodeSet();
381
382 /**
383 * Assigns this object to be a copy of another.
384 * A frozen set will not be modified.
385 * @stable ICU 2.0
386 */
387 UnicodeSet& operator=(const UnicodeSet& o);
388
389 /**
390 * Compares the specified object with this set for equality. Returns
391 * <tt>true</tt> if the two sets
392 * have the same size, and every member of the specified set is
393 * contained in this set (or equivalently, every member of this set is
394 * contained in the specified set).
395 *
396 * @param o set to be compared for equality with this set.
397 * @return <tt>true</tt> if the specified set is equal to this set.
398 * @stable ICU 2.0
399 */
400 virtual UBool operator==(const UnicodeSet& o) const;
401
402 /**
403 * Compares the specified object with this set for equality. Returns
404 * <tt>true</tt> if the specified set is not equal to this set.
405 * @stable ICU 2.0
406 */
407 UBool operator!=(const UnicodeSet& o) const;
408
409 /**
410 * Returns a copy of this object. All UnicodeFunctor objects have
411 * to support cloning in order to allow classes using
412 * UnicodeFunctors, such as Transliterator, to implement cloning.
413 * If this set is frozen, then the clone will be frozen as well.
414 * Use cloneAsThawed() for a mutable clone of a frozen set.
415 * @see cloneAsThawed
416 * @stable ICU 2.0
417 */
418 virtual UnicodeFunctor* clone() const;
419
420 /**
421 * Returns the hash code value for this set.
422 *
423 * @return the hash code value for this set.
424 * @see Object#hashCode()
425 * @stable ICU 2.0
426 */
427 virtual int32_t hashCode(void) const;
428
429 //----------------------------------------------------------------
430 // Freezable API
431 //----------------------------------------------------------------
432
433 /**
434 * Determines whether the set has been frozen (made immutable) or not.
435 * See the ICU4J Freezable interface for details.
436 * @return TRUE/FALSE for whether the set has been frozen
437 * @see freeze
438 * @see cloneAsThawed
439 * @draft ICU 3.8
440 */
441 inline UBool isFrozen() const;
442
443 /**
444 * Freeze the set (make it immutable).
445 * Once frozen, it cannot be unfrozen and is therefore thread-safe
446 * until it is deleted.
447 * See the ICU4J Freezable interface for details.
448 * Freezing the set may also make some operations faster, for example
449 * contains() and span().
450 * A frozen set will not be modified. (It remains frozen.)
451 * @return this set.
452 * @see isFrozen
453 * @see cloneAsThawed
454 * @draft ICU 3.8
455 */
456 UnicodeFunctor *freeze();
457
458 /**
459 * Clone the set and make the clone mutable.
460 * See the ICU4J Freezable interface for details.
461 * @return the mutable clone
462 * @see freeze
463 * @see isFrozen
464 * @draft ICU 3.8
465 */
466 UnicodeFunctor *cloneAsThawed() const;
467
468 //----------------------------------------------------------------
469 // Public API
470 //----------------------------------------------------------------
471
472 /**
473 * Make this object represent the range <code>start - end</code>.
474 * If <code>start > end</code> then this object is set to
475 * an empty range.
476 * A frozen set will not be modified.
477 *
478 * @param start first character in the set, inclusive
479 * @param end last character in the set, inclusive
480 * @stable ICU 2.4
481 */
482 UnicodeSet& set(UChar32 start, UChar32 end);
483
484 /**
485 * Return true if the given position, in the given pattern, appears
486 * to be the start of a UnicodeSet pattern.
487 * @stable ICU 2.4
488 */
489 static UBool resemblesPattern(const UnicodeString& pattern,
490 int32_t pos);
491
492 /**
493 * Modifies this set to represent the set specified by the given
494 * pattern, optionally ignoring white space. See the class
495 * description for the syntax of the pattern language.
496 * A frozen set will not be modified.
497 * @param pattern a string specifying what characters are in the set
498 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
499 * contains a syntax error.
500 * <em> Empties the set passed before applying the pattern.</em>
501 * @return a reference to this
502 * @stable ICU 2.0
503 */
504 UnicodeSet& applyPattern(const UnicodeString& pattern,
505 UErrorCode& status);
506
507 /**
508 * Modifies this set to represent the set specified by the given
509 * pattern, optionally ignoring white space. See the class
510 * description for the syntax of the pattern language.
511 * A frozen set will not be modified.
512 * @param pattern a string specifying what characters are in the set
513 * @param options bitmask for options to apply to the pattern.
514 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
515 * @param symbols a symbol table mapping variable names to
516 * values and stand-ins to UnicodeSets; may be NULL
517 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
518 * contains a syntax error.
519 *<em> Empties the set passed before applying the pattern.</em>
520 * @return a reference to this
521 * @internal
522 */
523 UnicodeSet& applyPattern(const UnicodeString& pattern,
524 uint32_t options,
525 const SymbolTable* symbols,
526 UErrorCode& status);
527
528 /**
529 * Parses the given pattern, starting at the given position. The
530 * character at pattern.charAt(pos.getIndex()) must be '[', or the
531 * parse fails. Parsing continues until the corresponding closing
532 * ']'. If a syntax error is encountered between the opening and
533 * closing brace, the parse fails. Upon return from a successful
534 * parse, the ParsePosition is updated to point to the character
535 * following the closing ']', and a StringBuffer containing a
536 * pairs list for the parsed pattern is returned. This method calls
537 * itself recursively to parse embedded subpatterns.
538 *<em> Empties the set passed before applying the pattern.</em>
539 * A frozen set will not be modified.
540 *
541 * @param pattern the string containing the pattern to be parsed.
542 * The portion of the string from pos.getIndex(), which must be a
543 * '[', to the corresponding closing ']', is parsed.
544 * @param pos upon entry, the position at which to begin parsing.
545 * The character at pattern.charAt(pos.getIndex()) must be a '['.
546 * Upon return from a successful parse, pos.getIndex() is either
547 * the character after the closing ']' of the parsed pattern, or
548 * pattern.length() if the closing ']' is the last character of
549 * the pattern string.
550 * @param options bitmask for options to apply to the pattern.
551 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
552 * @param symbols a symbol table mapping variable names to
553 * values and stand-ins to UnicodeSets; may be NULL
554 * @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
555 * contains a syntax error.
556 * @return a reference to this
557 * @stable ICU 2.8
558 */
559 UnicodeSet& applyPattern(const UnicodeString& pattern,
560 ParsePosition& pos,
561 uint32_t options,
562 const SymbolTable* symbols,
563 UErrorCode& status);
564
565 /**
566 * Returns a string representation of this set. If the result of
567 * calling this function is passed to a UnicodeSet constructor, it
568 * will produce another set that is equal to this one.
569 * A frozen set will not be modified.
570 * @param result the string to receive the rules. Previous
571 * contents will be deleted.
572 * @param escapeUnprintable if TRUE then convert unprintable
573 * characters to their hex escape representations, \\uxxxx or
573 * character to their hex escape representations, \\uxxxx or
574 * \\Uxxxxxxxx. Unprintable characters are those other than
575 * U+000A, U+0020..U+007E.
576 * @stable ICU 2.0
577 */
578 virtual UnicodeString& toPattern(UnicodeString& result,
579 UBool escapeUnprintable = FALSE) const;
580
581 /**
582 * Modifies this set to contain those code points which have the given value
583 * for the given binary or enumerated property, as returned by
584 * u_getIntPropertyValue. Prior contents of this set are lost.
585 * A frozen set will not be modified.
586 *
587 * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1
588 * or UCHAR_INT_START..UCHAR_INT_LIMIT-1
589 * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1.
590 *
591 * @param value a value in the range u_getIntPropertyMinValue(prop)..
592 * u_getIntPropertyMaxValue(prop), with one exception. If prop is
593 * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but
594 * rather a mask value produced by U_GET_GC_MASK(). This allows grouped
595 * categories such as [:L:] to be represented.
596 *
597 * @param ec error code input/output parameter
598 *
599 * @return a reference to this set
600 *
601 * @stable ICU 2.4
602 */
603 UnicodeSet& applyIntPropertyValue(UProperty prop,
604 int32_t value,
605 UErrorCode& ec);
606
607 /**
608 * Modifies this set to contain those code points which have the
609 * given value for the given property. Prior contents of this
610 * set are lost.
611 * A frozen set will not be modified.
612 *
613 * @param prop a property alias, either short or long. The name is matched
614 * loosely. See PropertyAliases.txt for names and a description of loose
615 * matching. If the value string is empty, then this string is interpreted
616 * as either a General_Category value alias, a Script value alias, a binary
617 * property alias, or a special ID. Special IDs are matched loosely and
618 * correspond to the following sets:
619 *
620 * "ANY" = [\\u0000-\\U0010FFFF],
621 * "ASCII" = [\\u0000-\\u007F],
622 * "Assigned" = [:^Cn:].
623 *
624 * @param value a value alias, either short or long. The name is matched
625 * loosely. See PropertyValueAliases.txt for names and a description of
626 * loose matching. In addition to aliases listed, numeric values and
627 * canonical combining classes may be expressed numerically, e.g., ("nv",
628 * "0.5") or ("ccc", "220"). The value string may also be empty.
629 *
630 * @param ec error code input/output parameter
631 *
632 * @return a reference to this set
633 *
634 * @stable ICU 2.4
635 */
636 UnicodeSet& applyPropertyAlias(const UnicodeString& prop,
637 const UnicodeString& value,
638 UErrorCode& ec);
639
640 /**
641 * Returns the number of elements in this set (its cardinality).
642 * Note that the elements of a set may include both individual
643 * codepoints and strings.
644 *
645 * @return the number of elements in this set (its cardinality).
646 * @stable ICU 2.0
647 */
648 virtual int32_t size(void) const;
649
650 /**
651 * Returns <tt>true</tt> if this set contains no elements.
652 *
653 * @return <tt>true</tt> if this set contains no elements.
654 * @stable ICU 2.0
655 */
656 virtual UBool isEmpty(void) const;
657
658 /**
659 * Returns true if this set contains the given character.
660 * This function works faster with a frozen set.
661 * @param c character to be checked for containment
662 * @return true if the test condition is met
663 * @stable ICU 2.0
664 */
665 virtual UBool contains(UChar32 c) const;
666
667 /**
668 * Returns true if this set contains every character
669 * of the given range.
670 * @param start first character, inclusive, of the range
671 * @param end last character, inclusive, of the range
672 * @return true if the test condition is met
673 * @stable ICU 2.0
674 */
675 virtual UBool contains(UChar32 start, UChar32 end) const;
676
677 /**
678 * Returns <tt>true</tt> if this set contains the given
679 * multicharacter string.
680 * @param s string to be checked for containment
681 * @return <tt>true</tt> if this set contains the specified string
682 * @stable ICU 2.4
683 */
684 UBool contains(const UnicodeString& s) const;
685
686 /**
687 * Returns true if this set contains all the characters and strings
688 * of the given set.
689 * @param c set to be checked for containment
690 * @return true if the test condition is met
691 * @stable ICU 2.4
692 */
693 virtual UBool containsAll(const UnicodeSet& c) const;
694
695 /**
696 * Returns true if this set contains all the characters
697 * of the given string.
698 * @param s string containing characters to be checked for containment
699 * @return true if the test condition is met
700 * @stable ICU 2.4
701 */
702 UBool containsAll(const UnicodeString& s) const;
703
704 /**
705 * Returns true if this set contains none of the characters
706 * of the given range.
707 * @param start first character, inclusive, of the range
708 * @param end last character, inclusive, of the range
709 * @return true if the test condition is met
710 * @stable ICU 2.4
711 */
712 UBool containsNone(UChar32 start, UChar32 end) const;
713
714 /**
715 * Returns true if this set contains none of the characters and strings
716 * of the given set.
717 * @param c set to be checked for containment
718 * @return true if the test condition is met
719 * @stable ICU 2.4
720 */
721 UBool containsNone(const UnicodeSet& c) const;
722
723 /**
724 * Returns true if this set contains none of the characters
725 * of the given string.
726 * @param s string containing characters to be checked for containment
727 * @return true if the test condition is met
728 * @stable ICU 2.4
729 */
730 UBool containsNone(const UnicodeString& s) const;
731
732 /**
733 * Returns true if this set contains one or more of the characters
734 * in the given range.
735 * @param start first character, inclusive, of the range
736 * @param end last character, inclusive, of the range
737 * @return true if the condition is met
738 * @stable ICU 2.4
739 */
740 inline UBool containsSome(UChar32 start, UChar32 end) const;
741
742 /**
743 * Returns true if this set contains one or more of the characters
744 * and strings of the given set.
745 * @param s The set to be checked for containment
746 * @return true if the condition is met
747 * @stable ICU 2.4
748 */
749 inline UBool containsSome(const UnicodeSet& s) const;
750
751 /**
752 * Returns true if this set contains one or more of the characters
753 * of the given string.
754 * @param s string containing characters to be checked for containment
755 * @return true if the condition is met
756 * @stable ICU 2.4
757 */
758 inline UBool containsSome(const UnicodeString& s) const;
759
760 /**
761 * Returns the length of the initial substring of the input string which
762 * consists only of characters and strings that are contained in this set
763 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
764 * or only of characters and strings that are not contained
765 * in this set (USET_SPAN_NOT_CONTAINED).
766 * See USetSpanCondition for details.
767 * Similar to the strspn() C library function.
768 * Unpaired surrogates are treated according to contains() of their surrogate code points.
769 * This function works faster with a frozen set and with a non-negative string length argument.
770 * @param s start of the string
771 * @param length of the string; can be -1 for NUL-terminated
772 * @param spanCondition specifies the containment condition
773 * @return the length of the initial substring according to the spanCondition;
774 * 0 if the start of the string does not fit the spanCondition
775 * @draft ICU 3.8
776 * @see USetSpanCondition
777 */
778 int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
779
780 /**
781 * Returns the start of the trailing substring of the input string which
782 * consists only of characters and strings that are contained in this set
783 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
784 * or only of characters and strings that are not contained
785 * in this set (USET_SPAN_NOT_CONTAINED).
786 * See USetSpanCondition for details.
787 * Unpaired surrogates are treated according to contains() of their surrogate code points.
788 * This function works faster with a frozen set and with a non-negative string length argument.
789 * @param s start of the string
790 * @param length of the string; can be -1 for NUL-terminated
791 * @param spanCondition specifies the containment condition
792 * @return the start of the trailing substring according to the spanCondition;
793 * the string length if the end of the string does not fit the spanCondition
794 * @draft ICU 3.8
795 * @see USetSpanCondition
796 */
797 int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
798
799 /**
800 * Returns the length of the initial substring of the input string which
801 * consists only of characters and strings that are contained in this set
802 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
803 * or only of characters and strings that are not contained
804 * in this set (USET_SPAN_NOT_CONTAINED).
805 * See USetSpanCondition for details.
806 * Similar to the strspn() C library function.
807 * Malformed byte sequences are treated according to contains(0xfffd).
808 * This function works faster with a frozen set and with a non-negative string length argument.
809 * @param s start of the string (UTF-8)
810 * @param length of the string; can be -1 for NUL-terminated
811 * @param spanCondition specifies the containment condition
812 * @return the length of the initial substring according to the spanCondition;
813 * 0 if the start of the string does not fit the spanCondition
814 * @draft ICU 3.8
815 * @see USetSpanCondition
816 */
817 int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
818
819 /**
820 * Returns the start of the trailing substring of the input string which
821 * consists only of characters and strings that are contained in this set
822 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
823 * or only of characters and strings that are not contained
824 * in this set (USET_SPAN_NOT_CONTAINED).
825 * See USetSpanCondition for details.
826 * Malformed byte sequences are treated according to contains(0xfffd).
827 * This function works faster with a frozen set and with a non-negative string length argument.
828 * @param s start of the string (UTF-8)
829 * @param length of the string; can be -1 for NUL-terminated
830 * @param spanCondition specifies the containment condition
831 * @return the start of the trailing substring according to the spanCondition;
832 * the string length if the end of the string does not fit the spanCondition
833 * @draft ICU 3.8
834 * @see USetSpanCondition
835 */
836 int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
837
838 /**
839 * Implement UnicodeMatcher::matches()
840 * @stable ICU 2.4
841 */
842 virtual UMatchDegree matches(const Replaceable& text,
843 int32_t& offset,
844 int32_t limit,
845 UBool incremental);
846
847 private:
848 /**
849 * Returns the longest match for s in text at the given position.
850 * If limit > start then match forward from start+1 to limit
851 * matching all characters except s.charAt(0). If limit < start,
852 * go backward starting from start-1 matching all characters
853 * except s.charAt(s.length()-1). This method assumes that the
854 * first character, text.charAt(start), matches s, so it does not
855 * check it.
856 * @param text the text to match
857 * @param start the first character to match. In the forward
858 * direction, text.charAt(start) is matched against s.charAt(0).
859 * In the reverse direction, it is matched against
860 * s.charAt(s.length()-1).
861 * @param limit the limit offset for matching, either last+1 in
862 * the forward direction, or last-1 in the reverse direction,
863 * where last is the index of the last character to match.
864 * @return If part of s matches up to the limit, return |limit -
865 * start|. If all of s matches before reaching the limit, return
866 * s.length(). If there is a mismatch between s and text, return
867 * 0
868 */
869 static int32_t matchRest(const Replaceable& text,
870 int32_t start, int32_t limit,
871 const UnicodeString& s);
872
873 /**
874 * Returns the smallest value i such that c < list[i]. Caller
875 * must ensure that c is a legal value or this method will enter
876 * an infinite loop. This method performs a binary search.
877 * @param c a character in the range MIN_VALUE..MAX_VALUE
878 * inclusive
879 * @return the smallest integer i in the range 0..len-1,
880 * inclusive, such that c < list[i]
881 */
882 int32_t findCodePoint(UChar32 c) const;
883
884 public:
885
886 /**
887 * Implementation of UnicodeMatcher API. Union the set of all
888 * characters that may be matched by this object into the given
889 * set.
890 * @param toUnionTo the set into which to union the source characters
891 * @stable ICU 2.4
892 */
893 virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
894
895 /**
896 * Returns the index of the given character within this set, where
897 * the set is ordered by ascending code point. If the character
898 * is not in this set, return -1. The inverse of this method is
899 * <code>charAt()</code>.
900 * @return an index from 0..size()-1, or -1
901 * @stable ICU 2.4
902 */
903 int32_t indexOf(UChar32 c) const;
904
905 /**
906 * Returns the character at the given index within this set, where
907 * the set is ordered by ascending code point. If the index is
908 * out of range, return (UChar32)-1. The inverse of this method is
909 * <code>indexOf()</code>.
910 * @param index an index from 0..size()-1
911 * @return the character at the given index, or (UChar32)-1.
912 * @stable ICU 2.4
913 */
914 UChar32 charAt(int32_t index) const;
915
916 /**
917 * Adds the specified range to this set if it is not already
918 * present. If this set already contains the specified range,
919 * the call leaves this set unchanged. If <code>start > end</code>
920 * then an empty range is added, leaving the set unchanged.
921 * This is equivalent to a boolean logic OR, or a set UNION.
922 * A frozen set will not be modified.
923 *
924 * @param start first character, inclusive, of range to be added
925 * to this set.
926 * @param end last character, inclusive, of range to be added
927 * to this set.
928 * @stable ICU 2.0
929 */
930 virtual UnicodeSet& add(UChar32 start, UChar32 end);
931
932 /**
933 * Adds the specified character to this set if it is not already
934 * present. If this set already contains the specified character,
935 * the call leaves this set unchanged.
936 * A frozen set will not be modified.
937 * @stable ICU 2.0
938 */
939 UnicodeSet& add(UChar32 c);
940
941 /**
942 * Adds the specified multicharacter string to this set if it is not already
943 * present. If this set already contains the multicharacter string,
944 * the call leaves this set unchanged.
945 * Thus "ch" => {"ch"}
946 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
947 * A frozen set will not be modified.
948 * @param s the source string
949 * @return this object, for chaining
950 * @stable ICU 2.4
951 */
952 UnicodeSet& add(const UnicodeString& s);
953
954 private:
955 /**
956 * @return a code point IF the string consists of a single one.
957 * otherwise returns -1.
958 * @param s string to test
959 */
960 static int32_t getSingleCP(const UnicodeString& s);
961
962 void _add(const UnicodeString& s);
963
964 public:
965 /**
966 * Adds each of the characters in this string to the set. Thus "ch" => {"c", "h"}
967 * If this set already contains any particular character, it has no effect on that character.
968 * A frozen set will not be modified.
969 * @param s the source string
970 * @return this object, for chaining
971 * @stable ICU 2.4
972 */
973 UnicodeSet& addAll(const UnicodeString& s);
974
975 /**
976 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
977 * If this set already contains any particular character, it has no effect on that character.
978 * A frozen set will not be modified.
979 * @param s the source string
980 * @return this object, for chaining
981 * @stable ICU 2.4
982 */
983 UnicodeSet& retainAll(const UnicodeString& s);
984
985 /**
986 * Complement EACH of the characters in this string. Note: "ch" == {"c", "h"}
987 * Each character is removed if this set already contains it, or added if it does not.
988 * A frozen set will not be modified.
989 * @param s the source string
990 * @return this object, for chaining
991 * @stable ICU 2.4
992 */
993 UnicodeSet& complementAll(const UnicodeString& s);
994
995 /**
996 * Remove EACH of the characters in this string. Note: "ch" == {"c", "h"}
997 * If this set does not contain a particular character, it has no effect on that character.
998 * A frozen set will not be modified.
999 * @param s the source string
1000 * @return this object, for chaining
1001 * @stable ICU 2.4
1002 */
1003 UnicodeSet& removeAll(const UnicodeString& s);
1004
1005 /**
1006 * Makes a set from a multicharacter string. Thus "ch" => {"ch"}
1007 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1008 * @param s the source string
1009 * @return a newly created set containing the given string.
1010 * The caller owns the return object and is responsible for deleting it.
1011 * @stable ICU 2.4
1012 */
1013 static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s);
1014
1015
1016 /**
1017 * Makes a set from each of the characters in the string. Thus "ch" => {"c", "h"}
1018 * @param s the source string
1019 * @return a newly created set containing the given characters
1020 * The caller owns the return object and is responsible for deleting it.
1021 * @stable ICU 2.4
1022 */
1023 static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s);
1024
1025 /**
1026 * Retain only the elements in this set that are contained in the
1027 * specified range. If <code>start > end</code> then an empty range is
1028 * retained, leaving the set empty. This is equivalent to
1029 * a boolean logic AND, or a set INTERSECTION.
1030 * A frozen set will not be modified.
1031 *
1032 * @param start first character, inclusive, of range to be retained
1033 * in this set.
1034 * @param end last character, inclusive, of range to be retained
1035 * in this set.
1036 * @stable ICU 2.0
1037 */
1038 virtual UnicodeSet& retain(UChar32 start, UChar32 end);
1039
1040
1041 /**
1042 * Retain the specified character from this set if it is present.
1043 * A frozen set will not be modified.
1044 * @stable ICU 2.0
1045 */
1046 UnicodeSet& retain(UChar32 c);
1047
1048 /**
1049 * Removes the specified range from this set if it is present.
1050 * The set will not contain the specified range once the call
1051 * returns. If <code>start > end</code> then an empty range is
1052 * removed, leaving the set unchanged.
1053 * A frozen set will not be modified.
1054 *
1055 * @param start first character, inclusive, of range to be removed
1056 * from this set.
1057 * @param end last character, inclusive, of range to be removed
1058 * from this set.
1059 * @stable ICU 2.0
1060 */
1061 virtual UnicodeSet& remove(UChar32 start, UChar32 end);
1062
1063 /**
1064 * Removes the specified character from this set if it is present.
1065 * The set will not contain the specified character once the call
1066 * returns.
1067 * A frozen set will not be modified.
1068 * @stable ICU 2.0
1069 */
1070 UnicodeSet& remove(UChar32 c);
1071
1072 /**
1073 * Removes the specified string from this set if it is present.
1074 * The set will not contain the specified string once the call
1075 * returns.
1076 * A frozen set will not be modified.
1077 * @param s the source string
1078 * @return this object, for chaining
1079 * @stable ICU 2.4
1080 */
1081 UnicodeSet& remove(const UnicodeString& s);
1082
1083 /**
1084 * Inverts this set. This operation modifies this set so that
1085 * its value is its complement. This is equivalent to
1086 * <code>complement(MIN_VALUE, MAX_VALUE)</code>.
1087 * A frozen set will not be modified.
1088 * @stable ICU 2.0
1089 */
1090 virtual UnicodeSet& complement(void);
1091
1092 /**
1093 * Complements the specified range in this set. Any character in
1094 * the range will be removed if it is in this set, or will be
1095 * added if it is not in this set. If <code>start > end</code>
1096 * then an empty range is complemented, leaving the set unchanged.
1097 * This is equivalent to a boolean logic XOR.
1098 * A frozen set will not be modified.
1099 *
1100 * @param start first character, inclusive, of range to be complemented
1101 * in this set.
1102 * @param end last character, inclusive, of range to be complemented
1103 * in this set.
1104 * @stable ICU 2.0
1105 */
1106 virtual UnicodeSet& complement(UChar32 start, UChar32 end);
1107
1108 /**
1109 * Complements the specified character in this set. The character
1110 * will be removed if it is in this set, or will be added if it is
1111 * not in this set.
1112 * A frozen set will not be modified.
1113 * @stable ICU 2.0
1114 */
1115 UnicodeSet& complement(UChar32 c);
1116
1117 /**
1118 * Complement the specified string in this set.
1119 * The string will be removed if it is in this set, or will be
1120 * added if it is not in this set.
1121 * <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
1122 * A frozen set will not be modified.
1123 * @param s the string to complement
1124 * @return this object, for chaining
1125 * @stable ICU 2.4
1126 */
1127 UnicodeSet& complement(const UnicodeString& s);
1128
1129 /**
1130 * Adds all of the elements in the specified set to this set if
1131 * they're not already present. This operation effectively
1132 * modifies this set so that its value is the <i>union</i> of the two
1133 * sets. The behavior of this operation is unspecified if the specified
1134 * collection is modified while the operation is in progress.
1135 * A frozen set will not be modified.
1136 *
1137 * @param c set whose elements are to be added to this set.
1138 * @see #add(UChar32, UChar32)
1139 * @stable ICU 2.0
1140 */
1141 virtual UnicodeSet& addAll(const UnicodeSet& c);
1142
1143 /**
1144 * Retains only the elements in this set that are contained in the
1145 * specified set. In other words, removes from this set all of
1146 * its elements that are not contained in the specified set. This
1147 * operation effectively modifies this set so that its value is
1148 * the <i>intersection</i> of the two sets.
1149 * A frozen set will not be modified.
1150 *
1151 * @param c set that defines which elements this set will retain.
1152 * @stable ICU 2.0
1153 */
1154 virtual UnicodeSet& retainAll(const UnicodeSet& c);
1155
1156 /**
1157 * Removes from this set all of its elements that are contained in the
1158 * specified set. This operation effectively modifies this
1159 * set so that its value is the <i>asymmetric set difference</i> of
1160 * the two sets.
1161 * A frozen set will not be modified.
1162 *
1163 * @param c set that defines which elements will be removed from
1164 * this set.
1165 * @stable ICU 2.0
1166 */
1167 virtual UnicodeSet& removeAll(const UnicodeSet& c);
1168
1169 /**
1170 * Complements in this set all elements contained in the specified
1171 * set. Any character in the other set will be removed if it is
1172 * in this set, or will be added if it is not in this set.
1173 * A frozen set will not be modified.
1174 *
1175 * @param c set that defines which elements will be complemented in
1176 * this set.
1177 * @stable ICU 2.4
1178 */
1179 virtual UnicodeSet& complementAll(const UnicodeSet& c);
1180
1181 /**
1182 * Removes all of the elements from this set. This set will be
1183 * empty after this call returns.
1184 * A frozen set will not be modified.
1185 * @stable ICU 2.0
1186 */
1187 virtual UnicodeSet& clear(void);
1188
1189 /**
1190 * Close this set over the given attribute. For the attribute
1191 * USET_CASE, the result is to modify this set so that:
1192 *
1193 * 1. For each character or string 'a' in this set, all strings or
1194 * characters 'b' such that foldCase(a) == foldCase(b) are added
1195 * to this set.
1196 *
1197 * 2. For each string 'e' in the resulting set, if e !=
1198 * foldCase(e), 'e' will be removed.
1199 *
1200 * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]
1201 *
1202 * (Here foldCase(x) refers to the operation u_strFoldCase, and a
1203 * == b denotes that the contents are the same, not pointer
1204 * comparison.)
1205 *
1206 * A frozen set will not be modified.
1207 *
1208 * @param attribute bitmask for attributes to close over.
1209 * Currently only the USET_CASE bit is supported. Any undefined bits
1210 * are ignored.
1211 * @return a reference to this set.
1212 * @internal
1213 */
1214 UnicodeSet& closeOver(int32_t attribute);
1215
1216 /**
1217 * Remove all strings from this set.
1218 *
1219 * @return a reference to this set.
1220 * @internal
1221 */
1222 virtual UnicodeSet &removeAllStrings();
1223
1224 /**
1225 * Iteration method that returns the number of ranges contained in
1226 * this set.
1227 * @see #getRangeStart
1228 * @see #getRangeEnd
1229 * @stable ICU 2.4
1230 */
1231 virtual int32_t getRangeCount(void) const;
1232
1233 /**
1234 * Iteration method that returns the first character in the
1235 * specified range of this set.
1236 * @see #getRangeCount
1237 * @see #getRangeEnd
1238 * @stable ICU 2.4
1239 */
1240 virtual UChar32 getRangeStart(int32_t index) const;
1241
1242 /**
1243 * Iteration method that returns the last character in the
1244 * specified range of this set.
1245 * @see #getRangeStart
1246 * @see #getRangeCount
1247 * @stable ICU 2.4
1248 */
1249 virtual UChar32 getRangeEnd(int32_t index) const;
1250
1251 /**
1252 * Serializes this set into an array of 16-bit integers. Serialization
1253 * (currently) only records the characters in the set; multicharacter
1254 * strings are ignored.
1255 *
1256 * The array has following format (each line is one 16-bit
1257 * integer):
1258 *
1259 * length = (n+2*m) | (m!=0?0x8000:0)
1260 * bmpLength = n; present if m!=0
1261 * bmp[0]
1262 * bmp[1]
1263 * ...
1264 * bmp[n-1]
1265 * supp-high[0]
1266 * supp-low[0]
1267 * supp-high[1]
1268 * supp-low[1]
1269 * ...
1270 * supp-high[m-1]
1271 * supp-low[m-1]
1272 *
1273 * The array starts with a header. After the header are n bmp
1274 * code points, then m supplementary code points. Either n or m
1275 * or both may be zero. n+2*m is always <= 0x7FFF.
1276 *
1277 * If there are no supplementary characters (if m==0) then the
1278 * header is one 16-bit integer, 'length', with value n.
1279 *
1280 * If there are supplementary characters (if m!=0) then the header
1281 * is two 16-bit integers. The first, 'length', has value
1282 * (n+2*m)|0x8000. The second, 'bmpLength', has value n.
1283 *
1284 * After the header the code points are stored in ascending order.
1285 * Supplementary code points are stored as most significant 16
1286 * bits followed by least significant 16 bits.
1287 *
1288 * @param dest pointer to buffer of destCapacity 16-bit integers.
1289 * May be NULL only if destCapacity is zero.
1290 * @param destCapacity size of dest, or zero. Must not be negative.
1291 * @param ec error code. Will be set to U_INDEX_OUTOFBOUNDS_ERROR
1292 * if n+2*m > 0x7FFF. Will be set to U_BUFFER_OVERFLOW_ERROR if
1293 * n+2*m+(m!=0?2:1) > destCapacity.
1294 * @return the total length of the serialized format, including
1295 * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other
1296 * than U_BUFFER_OVERFLOW_ERROR.
1297 * @stable ICU 2.4
1298 */
1299 int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const;
1300
1301 /**
1302 * Reallocate this object's internal structures to take up the least
1303 * possible space, without changing this object's value.
1304 * A frozen set will not be modified.
1305 * @stable ICU 2.4
1306 */
1307 virtual UnicodeSet& compact();
1308
1309 /**
1310 * Return the class ID for this class. This is useful only for
1311 * comparing to a return value from getDynamicClassID(). For example:
1312 * <pre>
1313 * . Base* polymorphic_pointer = createPolymorphicObject();
1314 * . if (polymorphic_pointer->getDynamicClassID() ==
1315 * . Derived::getStaticClassID()) ...
1316 * </pre>
1317 * @return The class ID for all objects of this class.
1318 * @stable ICU 2.0
1319 */
1320 static UClassID U_EXPORT2 getStaticClassID(void);
1321
1322 /**
1323 * Implement UnicodeFunctor API.
1324 *
1325 * @return The class ID for this object. All objects of a given
1326 * class have the same class ID. Objects of other classes have
1327 * different class IDs.
1328 * @stable ICU 2.4
1329 */
1330 virtual UClassID getDynamicClassID(void) const;
1331
1332 private:
1333
1334 // Private API for the USet API
1335
1336 friend class USetAccess;
1337
1338 int32_t getStringCount() const;
1339
1340 const UnicodeString* getString(int32_t index) const;
1341
1342 //----------------------------------------------------------------
1343 // RuleBasedTransliterator support
1344 //----------------------------------------------------------------
1345
1346 private:
1347
1348 /**
1349 * Returns <tt>true</tt> if this set contains any character whose low byte
1350 * is the given value. This is used by <tt>RuleBasedTransliterator</tt> for
1351 * indexing.
1352 */
1353 virtual UBool matchesIndexValue(uint8_t v) const;
1354
1355 private:
1356
1357 //----------------------------------------------------------------
1358 // Implementation: Clone as thawed (see ICU4J Freezable)
1359 //----------------------------------------------------------------
1360
1361 UnicodeSet(const UnicodeSet& o, UBool /* asThawed */);
1362
1363 //----------------------------------------------------------------
1364 // Implementation: Pattern parsing
1365 //----------------------------------------------------------------
1366
1367 void applyPattern(RuleCharacterIterator& chars,
1368 const SymbolTable* symbols,
1369 UnicodeString& rebuiltPat,
1370 uint32_t options,
1371 UErrorCode& ec);
1372
1373 //----------------------------------------------------------------
1374 // Implementation: Utility methods
1375 //----------------------------------------------------------------
1376
1377 void ensureCapacity(int32_t newLen);
1378
1379 void ensureBufferCapacity(int32_t newLen);
1380
1381 void swapBuffers(void);
1382
1383 UBool allocateStrings(UErrorCode &status);
1384
1385 UnicodeString& _toPattern(UnicodeString& result,
1386 UBool escapeUnprintable) const;
1387
1388 UnicodeString& _generatePattern(UnicodeString& result,
1389 UBool escapeUnprintable) const;
1390
1391 static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
1392
1393 static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
1394
1395 //----------------------------------------------------------------
1396 // Implementation: Fundamental operators
1397 //----------------------------------------------------------------
1398
1399 void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
1400
1401 void add(const UChar32* other, int32_t otherLen, int8_t polarity);
1402
1403 void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
1404
1405 /**
1406 * Return true if the given position, in the given pattern, appears
1407 * to be the start of a property set pattern [:foo:], \\p{foo}, or
1408 * \\P{foo}, or \\N{name}.
1409 */
1410 static UBool resemblesPropertyPattern(const UnicodeString& pattern,
1411 int32_t pos);
1412
1413 static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
1414 int32_t iterOpts);
1415
1416 /**
1417 * Parse the given property pattern at the given parse position
1418 * and set this UnicodeSet to the result.
1419 *
1420 * The original design document is out of date, but still useful.
1421 * Ignore the property and value names:
1422 * http://source.icu-project.org/repos/icu/icuhtml/trunk/design/unicodeset_properties.html
1423 *
1424 * Recognized syntax:
1425 *
1426 * [:foo:] [:^foo:] - white space not allowed within "[:" or ":]"
1427 * \\p{foo} \\P{foo} - white space not allowed within "\\p" or "\\P"
1428 * \\N{name} - white space not allowed within "\\N"
1429 *
1430 * Other than the above restrictions, white space is ignored. Case
1431 * is ignored except in "\\p" and "\\P" and "\\N". In 'name' leading
1432 * and trailing space is deleted, and internal runs of whitespace
1433 * are collapsed to a single space.
1434 *
1435 * We support binary properties, enumerated properties, and the
1436 * following non-enumerated properties:
1437 *
1438 * Numeric_Value
1439 * Name
1440 * Unicode_1_Name
1441 *
1442 * @param pattern the pattern string
1443 * @param ppos on entry, the position at which to begin parsing.
1444 * This should be one of the locations marked '^':
1445 *
1446 * [:blah:] \\p{blah} \\P{blah} \\N{name}
1447 * ^ % ^ % ^ % ^ %
1448 *
1449 * On return, the position after the last character parsed, that is,
1450 * the locations marked '%'. If the parse fails, ppos is returned
1451 * unchanged.
1452 * @return a reference to this.
1453 */
1454 UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
1455 ParsePosition& ppos,
1456 UErrorCode &ec);
1457
1458 void applyPropertyPattern(RuleCharacterIterator& chars,
1459 UnicodeString& rebuiltPat,
1460 UErrorCode& ec);
1461
1462 /**
1463 * A filter that returns TRUE if the given code point should be
1464 * included in the UnicodeSet being constructed.
1465 */
1466 typedef UBool (*Filter)(UChar32 codePoint, void* context);
1467
1468 /**
1469 * Given a filter, set this UnicodeSet to the code points
1470 * contained by that filter. The filter MUST be
1471 * property-conformant. That is, if it returns value v for one
1472 * code point, then it must return v for all affiliated code
1473 * points, as defined by the inclusions list. See
1474 * getInclusions().
1475 * src is a UPropertySource value.
1476 */
1477 void applyFilter(Filter filter,
1478 void* context,
1479 int32_t src,
1480 UErrorCode &status);
1481
1482 /**
1483 * Set the new pattern to cache.
1484 */
1485 void setPattern(const UnicodeString& newPat);
1486 /**
1487 * Release existing cached pattern.
1488 */
1489 void releasePattern();
1490
1491 friend class UnicodeSetIterator;
1492 };
1493
1494 inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const {
1495 return !(this->operator==(o)); // inequality is defined as the negation of operator==
1496 }
1497
isFrozen()1498 inline UBool UnicodeSet::isFrozen() const {
1499 return (UBool)(bmpSet!=NULL || stringSpan!=NULL);
1500 }
1501
containsSome(UChar32 start,UChar32 end)1502 inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
1503 return !containsNone(start, end);
1504 }
1505
containsSome(const UnicodeSet & s)1506 inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const {
1507 return !containsNone(s);
1508 }
1509
containsSome(const UnicodeString & s)1510 inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {
1511 return !containsNone(s);
1512 }
1513
1514 U_NAMESPACE_END
1515
1516 #endif
1517