• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 *   Copyright (C) 2002-2014, International Business Machines
7 *   Corporation and others.  All Rights Reserved.
8 *
9 *******************************************************************************
10 *   file name:  uset.h
11 *   encoding:   UTF-8
12 *   tab size:   8 (not used)
13 *   indentation:4
14 *
15 *   created on: 2002mar07
16 *   created by: Markus W. Scherer
17 *
18 *   C version of UnicodeSet.
19 */
20 
21 
22 /**
23  * \file
24  * \brief C API: Unicode Set
25  *
26  * <p>This is a C wrapper around the C++ UnicodeSet class.</p>
27  */
28 
29 #ifndef __USET_H__
30 #define __USET_H__
31 
32 #include "unicode/utypes.h"
33 #include "unicode/uchar.h"
34 
35 #if U_SHOW_CPLUSPLUS_API
36 #include <string_view>
37 #include "unicode/char16ptr.h"
38 #include "unicode/localpointer.h"
39 #include "unicode/unistr.h"
40 #endif   // U_SHOW_CPLUSPLUS_API
41 
42 #ifndef USET_DEFINED
43 
44 #ifndef U_IN_DOXYGEN
45 #define USET_DEFINED
46 #endif
47 /**
48  * USet is the C API type corresponding to C++ class UnicodeSet.
49  * Use the uset_* API to manipulate.  Create with
50  * uset_open*, and destroy with uset_close.
51  * @stable ICU 2.4
52  */
53 typedef struct USet USet;
54 #endif
55 
56 /**
57  * Bitmask values to be passed to uset_openPatternOptions() or
58  * uset_applyPattern() taking an option parameter.
59  *
60  * Use at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
61  * These case options are mutually exclusive.
62  *
63  * Undefined options bits are ignored, and reserved for future use.
64  *
65  * @stable ICU 2.4
66  */
67 enum {
68     /**
69      * Ignore white space within patterns unless quoted or escaped.
70      * @stable ICU 2.4
71      */
72     USET_IGNORE_SPACE = 1,
73 
74     /**
75      * Enable case insensitive matching.  E.g., "[ab]" with this flag
76      * will match 'a', 'A', 'b', and 'B'.  "[^ab]" with this flag will
77      * match all except 'a', 'A', 'b', and 'B'. This performs a full
78      * closure over case mappings, e.g. 'ſ' (U+017F long s) for 's'.
79      *
80      * The resulting set is a superset of the input for the code points but
81      * not for the strings.
82      * It performs a case mapping closure of the code points and adds
83      * full case folding strings for the code points, and reduces strings of
84      * the original set to their full case folding equivalents.
85      *
86      * This is designed for case-insensitive matches, for example
87      * in regular expressions. The full code point case closure allows checking of
88      * an input character directly against the closure set.
89      * Strings are matched by comparing the case-folded form from the closure
90      * set with an incremental case folding of the string in question.
91      *
92      * The closure set will also contain single code points if the original
93      * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.).
94      * This is not necessary (that is, redundant) for the above matching method
95      * but results in the same closure sets regardless of whether the original
96      * set contained the code point or a string.
97      *
98      * @stable ICU 2.4
99      */
100     USET_CASE_INSENSITIVE = 2,
101 
102     /**
103      * Adds all case mappings for each element in the set.
104      * This adds the full lower-, title-, and uppercase mappings as well as the full case folding
105      * of each existing element in the set.
106      *
107      * Unlike the “case insensitive” options, this does not perform a closure.
108      * For example, it does not add 'ſ' (U+017F long s) for 's',
109      * 'K' (U+212A Kelvin sign) for 'k', or replace set strings by their case-folded versions.
110      *
111      * @stable ICU 3.2
112      */
113     USET_ADD_CASE_MAPPINGS = 4,
114 
115     /**
116      * Enable case insensitive matching.
117      * Same as USET_CASE_INSENSITIVE but using only Simple_Case_Folding (scf) mappings,
118      * which map each code point to one code point,
119      * not full Case_Folding (cf) mappings, which map some code points to multiple code points.
120      *
121      * This is designed for case-insensitive matches, for example in certain
122      * regular expression implementations where only Simple_Case_Folding mappings are used,
123      * such as in ECMAScript (JavaScript) regular expressions.
     *
     * Note: unlike the other constants in this enum, this value is not a single bit.
     * 6 is the bitwise OR of USET_CASE_INSENSITIVE (2) and USET_ADD_CASE_MAPPINGS (4),
     * so OR-ing those two flags together produces exactly this constant's option bits.
     * Test for it by comparing the case-option bits for equality with 6,
     * not by checking whether any of its bits are set.
124      *
125      * @stable ICU 73
126      */
127     USET_SIMPLE_CASE_INSENSITIVE = 6
128 };
129 
130 /**
131  * Argument values for whether span() and similar functions continue while
132  * the current character is contained vs. not contained in the set.
133  *
134  * The functionality is straightforward for sets with only single code points,
135  * without strings (which is the common case):
136  * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE work the same.
137  * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE are inverses of USET_SPAN_NOT_CONTAINED.
138  * - span() and spanBack() partition any string the same way when
139  *   alternating between span(USET_SPAN_NOT_CONTAINED) and
140  *   span(either "contained" condition).
141  * - Using a complemented (inverted) set and the opposite span conditions
142  *   yields the same results.
143  *
144  * When a set contains multi-code point strings, then these statements may not
145  * be true, depending on the strings in the set (for example, whether they
146  * overlap with each other) and the string that is processed.
147  * For a set with strings:
148  * - The complement of the set contains the opposite set of code points,
149  *   but the same set of strings.
150  *   Therefore, complementing both the set and the span conditions
151  *   may yield different results.
152  * - When starting spans at different positions in a string
153  *   (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different
154  *   because a set string may start before the later position.
155  * - span(USET_SPAN_SIMPLE) may be shorter than
156  *   span(USET_SPAN_CONTAINED) because it will not recursively try
157  *   all possible paths.
158  *   For example, with a set which contains the three strings "xy", "xya" and "ax",
159  *   span("xyax", USET_SPAN_CONTAINED) will return 4 but
160  *   span("xyax", USET_SPAN_SIMPLE) will return 3.
161  *   span(USET_SPAN_SIMPLE) will never be longer than
162  *   span(USET_SPAN_CONTAINED).
163  * - With either "contained" condition, span() and spanBack() may partition
164  *   a string in different ways.
165  *   For example, with a set which contains the two strings "ab" and "ba",
166  *   and when processing the string "aba",
167  *   span() will yield contained/not-contained boundaries of { 0, 2, 3 }
168  *   while spanBack() will yield boundaries of { 0, 1, 3 }.
169  *
170  * Note: If it is important to get the same boundaries whether iterating forward
171  * or backward through a string, then either only span() should be used and
172  * the boundaries cached for backward operation, or an ICU BreakIterator
173  * could be used.
174  *
175  * Note: Unpaired surrogates are treated like surrogate code points.
176  * Similarly, set strings match only on code point boundaries,
177  * never in the middle of a surrogate pair.
178  * Illegal UTF-8 sequences are treated like U+FFFD.
179  * When processing UTF-8 strings, malformed set strings
180  * (strings with unpaired surrogates which cannot be converted to UTF-8)
181  * are ignored.
182  *
183  * @stable ICU 3.8
184  */
185 typedef enum USetSpanCondition {
186     /**
187      * Continues a span() while there is no set element at the current position.
188      * Increments by one code point at a time.
189      * Stops before the first set element (character or string).
190      * (For code points only, this is like while contains(current)==false).
191      *
192      * When span() returns, the substring between where it started and the position
193      * it returned consists only of characters that are not in the set,
194      * and none of its strings overlap with the span.
195      *
196      * @stable ICU 3.8
197      */
198     USET_SPAN_NOT_CONTAINED = 0,
199     /**
200      * Spans the longest substring that is a concatenation of set elements (characters or strings).
201      * (For characters only, this is like while contains(current)==true).
202      *
203      * When span() returns, the substring between where it started and the position
204      * it returned consists only of set elements (characters or strings) that are in the set.
205      *
206      * If a set contains strings, then the span will be the longest substring for which there
207      * exists at least one non-overlapping concatenation of set elements (characters or strings).
208      * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>.
209      * (Java/ICU/Perl regex stops at the first match of an OR.)
210      *
211      * @stable ICU 3.8
212      */
213     USET_SPAN_CONTAINED = 1,
214     /**
215      * Continues a span() while there is a set element at the current position.
216      * Increments by the longest matching element at each position.
217      * (For characters only, this is like while contains(current)==true).
218      *
219      * When span() returns, the substring between where it started and the position
220      * it returned consists only of set elements (characters or strings) that are in the set.
221      *
222      * If a set only contains single characters, then this is the same
223      * as USET_SPAN_CONTAINED.
224      *
225      * If a set contains strings, then the span will be the longest substring
226      * with a match at each position with the longest single set element (character or string).
227      *
228      * Use this span condition together with other longest-match algorithms,
229      * such as ICU converters (ucnv_getUnicodeSet()).
230      *
231      * @stable ICU 3.8
232      */
233     USET_SPAN_SIMPLE = 2,
234 #ifndef U_HIDE_DEPRECATED_API
235     /**
236      * One more than the last span condition.
     * This enumerator has no explicit initializer, so its value is the one
     * following USET_SPAN_SIMPLE. It is declared only when
     * U_HIDE_DEPRECATED_API is not defined.
237      * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
238      */
239     USET_SPAN_CONDITION_COUNT
240 #endif  // U_HIDE_DEPRECATED_API
241 } USetSpanCondition;
242 
243 enum {
244     /**
245      * Capacity of USerializedSet::staticArray.
     * Counted in uint16_t units, the element type of that array;
     * this constant is used directly as the array's dimension.
246      * Enough for any single-code point set.
247      * Also provides padding for nice sizeof(USerializedSet).
248      * @stable ICU 2.4
249      */
250     USET_SERIALIZED_STATIC_ARRAY_CAPACITY=8
251 };
252 
253 /**
254  * A serialized form of a Unicode set.  Limited manipulations are
255  * possible directly on a serialized set.  See below.
256  * @stable ICU 2.4
257  */
258 typedef struct USerializedSet {
259     /**
260      * The serialized Unicode Set.
     * Declared as a pointer to const uint16_t, so the serialized data
     * cannot be modified through this member.
261      * @stable ICU 2.4
262      */
263     const uint16_t *array;
264     /**
265      * The length of the array that contains BMP characters.
     * NOTE(review): presumably never larger than length, since it describes
     * a part of the same array -- confirm against the serialization code.
266      * @stable ICU 2.4
267      */
268     int32_t bmpLength;
269     /**
270      * The total length of the array.
271      * @stable ICU 2.4
272      */
273     int32_t length;
274     /**
275      * A small buffer for the array to reduce memory allocations.
     * Holds USET_SERIALIZED_STATIC_ARRAY_CAPACITY uint16_t units.
276      * @stable ICU 2.4
277      */
278     uint16_t staticArray[USET_SERIALIZED_STATIC_ARRAY_CAPACITY];
279 } USerializedSet;
280 
281 /*********************************************************************
282  * USet API
283  *********************************************************************/
284 
285 /**
286  * Create an empty USet object.
287  * Equivalent to uset_open(1, 0).
288  * @return a newly created USet.  The caller must call uset_close() on
289  * it when done.
290  * @stable ICU 4.2
291  */
292 U_CAPI USet* U_EXPORT2
293 uset_openEmpty(void);
294 
295 /**
296  * Creates a USet object that contains the range of characters
297  * start..end, inclusive.  If <code>start > end</code>
298  * then an empty set is created (same as using uset_openEmpty()).
299  * @param start first character of the range, inclusive
300  * @param end last character of the range, inclusive
301  * @return a newly created USet.  The caller must call uset_close() on
302  * it when done.
303  * @stable ICU 2.4
304  */
305 U_CAPI USet* U_EXPORT2
306 uset_open(UChar32 start, UChar32 end);
307 
308 /**
309  * Creates a set from the given pattern.  See the UnicodeSet class
310  * description for the syntax of the pattern language.
311  * @param pattern a string specifying what characters are in the set
312  * @param patternLength the length of the pattern, or -1 if null
313  * terminated
314  * @param ec the error code
315  * @stable ICU 2.4
316  */
317 U_CAPI USet* U_EXPORT2
318 uset_openPattern(const UChar* pattern, int32_t patternLength,
319                  UErrorCode* ec);
320 
321 /**
322  * Creates a set from the given pattern.  See the UnicodeSet class
323  * description for the syntax of the pattern language.
324  * @param pattern a string specifying what characters are in the set
325  * @param patternLength the length of the pattern, or -1 if null
326  * terminated
327  * @param options bitmask for options to apply to the pattern.
328  * Valid options are USET_IGNORE_SPACE and
329  * at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
330  * These case options are mutually exclusive.
331  * @param ec the error code
332  * @stable ICU 2.4
333  */
334 U_CAPI USet* U_EXPORT2
335 uset_openPatternOptions(const UChar* pattern, int32_t patternLength,
336                  uint32_t options,
337                  UErrorCode* ec);
338 
339 /**
340  * Disposes of the storage used by a USet object.  This function should
341  * be called exactly once for each object returned by a uset_open*() function.
342  * @param set the object to dispose of
343  * @stable ICU 2.4
344  */
345 U_CAPI void U_EXPORT2
346 uset_close(USet* set);
347 
348 #if U_SHOW_CPLUSPLUS_API
349 
350 U_NAMESPACE_BEGIN
351 
352 /**
353  * \class LocalUSetPointer
354  * "Smart pointer" class, closes a USet via uset_close().
355  * For most methods see the LocalPointerBase base class.
356  *
357  * @see LocalPointerBase
358  * @see LocalPointer
359  * @stable ICU 4.4
360  */
361 U_DEFINE_LOCAL_OPEN_POINTER(LocalUSetPointer, USet, uset_close);
362 
363 U_NAMESPACE_END
364 
365 #endif
366 
367 /**
368  * Returns a copy of this object.
369  * If this set is frozen, then the clone will be frozen as well.
370  * Use uset_cloneAsThawed() for a mutable clone of a frozen set.
371  * @param set the original set
372  * @return the newly allocated copy of the set
373  * @see uset_cloneAsThawed
374  * @stable ICU 3.8
375  */
376 U_CAPI USet * U_EXPORT2
377 uset_clone(const USet *set);
378 
379 /**
380  * Determines whether the set has been frozen (made immutable) or not.
381  * See the ICU4J Freezable interface for details.
382  * @param set the set
383  * @return true/false for whether the set has been frozen
384  * @see uset_freeze
385  * @see uset_cloneAsThawed
386  * @stable ICU 3.8
387  */
388 U_CAPI UBool U_EXPORT2
389 uset_isFrozen(const USet *set);
390 
391 /**
392  * Freeze the set (make it immutable).
393  * Once frozen, it cannot be unfrozen and is therefore thread-safe
394  * until it is deleted.
395  * See the ICU4J Freezable interface for details.
396  * Freezing the set may also make some operations faster, for example
397  * uset_contains() and uset_span().
398  * A frozen set will not be modified. (It remains frozen.)
399  * @param set the set
400  * The set is frozen in place; this C function does not return a value.
401  * @see uset_isFrozen
402  * @see uset_cloneAsThawed
403  * @stable ICU 3.8
404  */
405 U_CAPI void U_EXPORT2
406 uset_freeze(USet *set);
407 
408 /**
409  * Clone the set and make the clone mutable.
410  * See the ICU4J Freezable interface for details.
411  * @param set the set
412  * @return the mutable clone
413  * @see uset_freeze
414  * @see uset_isFrozen
415  * @see uset_clone
416  * @stable ICU 3.8
417  */
418 U_CAPI USet * U_EXPORT2
419 uset_cloneAsThawed(const USet *set);
420 
421 /**
422  * Causes the USet object to represent the range <code>start - end</code>.
423  * If <code>start > end</code> then this USet is set to an empty range.
424  * A frozen set will not be modified.
425  * @param set the object to set to the given range
426  * @param start first character in the set, inclusive
427  * @param end last character in the set, inclusive
428  * @stable ICU 3.2
429  */
430 U_CAPI void U_EXPORT2
431 uset_set(USet* set,
432          UChar32 start, UChar32 end);
433 
434 /**
435  * Modifies the set to represent the set specified by the given
436  * pattern. See the UnicodeSet class description for the syntax of
437  * the pattern language. See also the User Guide chapter about UnicodeSet.
438  * <em>Empties the set passed before applying the pattern.</em>
439  * A frozen set will not be modified.
440  * @param set               The set to which the pattern is to be applied.
441  * @param pattern           A pointer to UChar string specifying what characters are in the set.
442  *                          The character at pattern[0] must be a '['.
443  * @param patternLength     The length of the UChar string. -1 if NUL terminated.
444  * @param options           A bitmask for options to apply to the pattern.
445  *                          Valid options are USET_IGNORE_SPACE and
446  *                          at most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS,
447  *                          USET_SIMPLE_CASE_INSENSITIVE.
448  *                          These case options are mutually exclusive.
449  * @param status            Returns an error if the pattern cannot be parsed.
450  * @return                  Upon successful parse, the value is
451  *                          the index of the character after the closing ']'
452  *                          of the parsed pattern.
453  *                          If the status code indicates failure, then the return value
454  *                          is the index of the error in the source.
455  *
456  * @stable ICU 2.8
457  */
458 U_CAPI int32_t U_EXPORT2
459 uset_applyPattern(USet *set,
460                   const UChar *pattern, int32_t patternLength,
461                   uint32_t options,
462                   UErrorCode *status);
463 
464 /**
465  * Modifies the set to contain those code points which have the given value
466  * for the given binary or enumerated property, as returned by
467  * u_getIntPropertyValue.  Prior contents of this set are lost.
468  * A frozen set will not be modified.
469  *
470  * @param set the object to contain the code points defined by the property
471  *
472  * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1
473  * or UCHAR_INT_START..UCHAR_INT_LIMIT-1
474  * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1.
475  *
476  * @param value a value in the range u_getIntPropertyMinValue(prop)..
477  * u_getIntPropertyMaxValue(prop), with one exception.  If prop is
478  * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but
479  * rather a mask value produced by U_GET_GC_MASK().  This allows grouped
480  * categories such as [:L:] to be represented.
481  *
482  * @param ec error code input/output parameter
483  *
484  * @stable ICU 3.2
485  */
486 U_CAPI void U_EXPORT2
487 uset_applyIntPropertyValue(USet* set,
488                            UProperty prop, int32_t value, UErrorCode* ec);
489 
490 /**
491  * Modifies the set to contain those code points which have the
492  * given value for the given property.  Prior contents of this
493  * set are lost.
494  * A frozen set will not be modified.
495  *
496  * @param set the object to contain the code points defined by the given
497  * property and value alias
498  *
499  * @param prop a string specifying a property alias, either short or long.
500  * The name is matched loosely.  See PropertyAliases.txt for names and a
501  * description of loose matching.  If the value string is empty, then this
502  * string is interpreted as either a General_Category value alias, a Script
503  * value alias, a binary property alias, or a special ID.  Special IDs are
504  * matched loosely and correspond to the following sets:
505  *
506  * "ANY" = [\\u0000-\\U0010FFFF],
507  * "ASCII" = [\\u0000-\\u007F],
508  * "Assigned" = [:^Cn:].
509  *
510  * @param propLength the length of the prop, or -1 if NUL-terminated
511  *
512  * @param value a string specifying a value alias, either short or long.
513  * The name is matched loosely.  See PropertyValueAliases.txt for names
514  * and a description of loose matching.  In addition to aliases listed,
515  * numeric values and canonical combining classes may be expressed
516  * numerically, e.g., ("nv", "0.5") or ("ccc", "220").  The value string
517  * may also be empty.
518  *
519  * @param valueLength the length of the value, or -1 if NUL-terminated
520  *
521  * @param ec error code input/output parameter
522  *
523  * @stable ICU 3.2
524  */
525 U_CAPI void U_EXPORT2
526 uset_applyPropertyAlias(USet* set,
527                         const UChar *prop, int32_t propLength,
528                         const UChar *value, int32_t valueLength,
529                         UErrorCode* ec);
530 
531 /**
532  * Return true if the given position, in the given pattern, appears
533  * to be the start of a UnicodeSet pattern.
534  *
535  * @param pattern a string specifying the pattern
536  * @param patternLength the length of the pattern, or -1 if NUL-terminated
537  * @param pos the given position
538  * @stable ICU 3.2
539  */
540 U_CAPI UBool U_EXPORT2
541 uset_resemblesPattern(const UChar *pattern, int32_t patternLength,
542                       int32_t pos);
543 
544 /**
545  * Returns a string representation of this set.  If the result of
546  * calling this function is passed to a uset_openPattern(), it
547  * will produce another set that is equal to this one.
548  * @param set the set
549  * @param result the string to receive the rules, may be NULL
550  * @param resultCapacity the capacity of result, may be 0 if result is NULL
551  * @param escapeUnprintable if true then convert unprintable
552  * characters to their hex escape representations, \\uxxxx or
553  * \\Uxxxxxxxx.  Unprintable characters are those other than
554  * U+000A, U+0020..U+007E.
555  * @param ec error code.
556  * @return length of string, possibly larger than resultCapacity
557  * @stable ICU 2.4
558  */
559 U_CAPI int32_t U_EXPORT2
560 uset_toPattern(const USet* set,
561                UChar* result, int32_t resultCapacity,
562                UBool escapeUnprintable,
563                UErrorCode* ec);
564 
565 /**
566  * Adds the given character to the given USet.  After this call,
567  * uset_contains(set, c) will return true.
568  * A frozen set will not be modified.
569  * @param set the object to which to add the character
570  * @param c the character to add
571  * @stable ICU 2.4
572  */
573 U_CAPI void U_EXPORT2
574 uset_add(USet* set, UChar32 c);
575 
576 /**
577  * Adds all of the elements in the specified set to this set if
578  * they're not already present.  This operation effectively
579  * modifies this set so that its value is the <i>union</i> of the two
580  * sets.  The behavior of this operation is unspecified if the specified
581  * collection is modified while the operation is in progress.
582  * A frozen set will not be modified.
583  *
584  * @param set the object to which to add the set
585  * @param additionalSet the source set whose elements are to be added to this set.
586  * @stable ICU 2.6
587  */
588 U_CAPI void U_EXPORT2
589 uset_addAll(USet* set, const USet *additionalSet);
590 
591 /**
592  * Adds the given range of characters to the given USet.  After this call,
593  * uset_contains(set, c) will return true for every c in start..end.
594  * A frozen set will not be modified.
595  * @param set the object to which to add the range
596  * @param start the first character of the range to add, inclusive
597  * @param end the last character of the range to add, inclusive
598  * @stable ICU 2.2
599  */
600 U_CAPI void U_EXPORT2
601 uset_addRange(USet* set, UChar32 start, UChar32 end);
602 
603 /**
604  * Adds the given string to the given USet.  After this call,
605  * uset_containsString(set, str, strLen) will return true.
606  * A frozen set will not be modified.
607  * @param set the object to which to add the string
608  * @param str the string to add
609  * @param strLen the length of the string or -1 if null terminated.
610  * @stable ICU 2.4
611  */
612 U_CAPI void U_EXPORT2
613 uset_addString(USet* set, const UChar* str, int32_t strLen);
614 
615 /**
616  * Adds each of the characters in this string to the set. Note: "ch" => {"c", "h"}
617  * If this set already contains any particular character, it has no effect on that character.
618  * A frozen set will not be modified.
619  * @param set the object to which to add the characters
620  * @param str the source string
621  * @param strLen the length of the string or -1 if null terminated.
622  * @stable ICU 3.4
623  */
624 U_CAPI void U_EXPORT2
625 uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen);
626 
627 /**
628  * Removes the given character from the given USet.  After this call,
629  * uset_contains(set, c) will return false.
630  * A frozen set will not be modified.
631  * @param set the object from which to remove the character
632  * @param c the character to remove
633  * @stable ICU 2.4
634  */
635 U_CAPI void U_EXPORT2
636 uset_remove(USet* set, UChar32 c);
637 
638 /**
639  * Removes the given range of characters from the given USet.  After this call,
640  * uset_contains(set, c) will return false for every c in start..end.
641  * A frozen set will not be modified.
642  * @param set the object from which to remove the range
643  * @param start the first character of the range to remove, inclusive
644  * @param end the last character of the range to remove, inclusive
645  * @stable ICU 2.2
646  */
647 U_CAPI void U_EXPORT2
648 uset_removeRange(USet* set, UChar32 start, UChar32 end);
649 
650 /**
651  * Removes the given string from the given USet.  After this call,
652  * uset_containsString(set, str, strLen) will return false.
653  * A frozen set will not be modified.
654  * @param set the object from which to remove the string
655  * @param str the string to remove
656  * @param strLen the length of the string or -1 if null terminated.
657  * @stable ICU 2.4
658  */
659 U_CAPI void U_EXPORT2
660 uset_removeString(USet* set, const UChar* str, int32_t strLen);
661 
662 /**
663  * Removes EACH of the characters in this string. Note: "ch" == {"c", "h"}
664  * A frozen set will not be modified.
665  *
666  * @param set the object to be modified
667  * @param str the string
668  * @param length the length of the string, or -1 if NUL-terminated
669  * @stable ICU 69
670  */
671 U_CAPI void U_EXPORT2
672 uset_removeAllCodePoints(USet *set, const UChar *str, int32_t length);
673 
674 /**
675  * Removes from this set all of its elements that are contained in the
676  * specified set.  This operation effectively modifies this
677  * set so that its value is the <i>asymmetric set difference</i> of
678  * the two sets.
679  * A frozen set will not be modified.
680  * @param set the object from which the elements are to be removed
681  * @param removeSet the object that defines which elements will be
682  * removed from this set
683  * @stable ICU 3.2
684  */
685 U_CAPI void U_EXPORT2
686 uset_removeAll(USet* set, const USet* removeSet);
687 
688 /**
689  * Retain only the elements in this set that are contained in the
690  * specified range.  If <code>start > end</code> then an empty range is
691  * retained, leaving the set empty.  This is equivalent to
692  * a boolean logic AND, or a set INTERSECTION.
693  * A frozen set will not be modified.
694  *
695  * @param set the object for which to retain only the specified range
696  * @param start first character, inclusive, of range
697  * @param end last character, inclusive, of range
698  * @stable ICU 3.2
699  */
700 U_CAPI void U_EXPORT2
701 uset_retain(USet* set, UChar32 start, UChar32 end);
702 
703 /**
704  * Retains only the specified string from this set if it is present.
705  * Upon return this set will be empty if it did not contain s, or
706  * will only contain s if it did contain s.
707  * A frozen set will not be modified.
708  *
709  * @param set the object to be modified
710  * @param str the string
711  * @param length the length of the string, or -1 if NUL-terminated
712  * @stable ICU 69
713  */
714 U_CAPI void U_EXPORT2
715 uset_retainString(USet *set, const UChar *str, int32_t length);
716 
717 /**
718  * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"}
719  * A frozen set will not be modified.
720  *
721  * @param set the object to be modified
722  * @param str the string
723  * @param length the length of the string, or -1 if NUL-terminated
724  * @stable ICU 69
725  */
726 U_CAPI void U_EXPORT2
727 uset_retainAllCodePoints(USet *set, const UChar *str, int32_t length);
728 
729 /**
730  * Retains only the elements in this set that are contained in the
731  * specified set.  In other words, removes from this set all of
732  * its elements that are not contained in the specified set.  This
733  * operation effectively modifies this set so that its value is
734  * the <i>intersection</i> of the two sets.
735  * A frozen set will not be modified.
736  *
737  * @param set the object on which to perform the retain
738  * @param retain set that defines which elements this set will retain
739  * @stable ICU 3.2
740  */
741 U_CAPI void U_EXPORT2
742 uset_retainAll(USet* set, const USet* retain);
743 
744 /**
745  * Reallocate this object's internal structures to take up the least
746  * possible space, without changing this object's value.
747  * A frozen set will not be modified.
748  *
749  * @param set the object on which to perform the compact
750  * @stable ICU 3.2
751  */
752 U_CAPI void U_EXPORT2
753 uset_compact(USet* set);
754 
755 /**
756  * This is equivalent to
757  * <code>uset_complementRange(set, 0, 0x10FFFF)</code>.
758  *
759  * <strong>Note:</strong> This performs a symmetric difference with all code points
760  * <em>and thus retains all multicharacter strings</em>.
761  * In order to achieve a “code point complement” (all code points minus this set),
762  * the easiest is to <code>uset_complement(set); uset_removeAllStrings(set);</code>.
763  *
764  * A frozen set will not be modified.
765  * @param set the set
766  * @stable ICU 2.4
767  */
768 U_CAPI void U_EXPORT2
769 uset_complement(USet* set);
770 
771 /**
772  * Complements the specified range in this set.  Any character in
773  * the range will be removed if it is in this set, or will be
774  * added if it is not in this set.  If <code>start > end</code>
775  * then an empty range is complemented, leaving the set unchanged.
776  * This is equivalent to a boolean logic XOR.
777  * A frozen set will not be modified.
778  *
779  * @param set the object to be modified
780  * @param start first character, inclusive, of range
781  * @param end last character, inclusive, of range
782  * @stable ICU 69
783  */
784 U_CAPI void U_EXPORT2
785 uset_complementRange(USet *set, UChar32 start, UChar32 end);
786 
787 /**
788  * Complements the specified string in this set.
789  * The string will be removed if it is in this set, or will be added if it is not in this set.
790  * A frozen set will not be modified.
791  *
792  * @param set the object to be modified
793  * @param str the string
794  * @param length the length of the string, or -1 if NUL-terminated
795  * @stable ICU 69
796  */
797 U_CAPI void U_EXPORT2
798 uset_complementString(USet *set, const UChar *str, int32_t length);
799 
800 /**
801  * Complements EACH of the characters in this string. Note: "ch" == {"c", "h"}
802  * A frozen set will not be modified.
803  *
804  * @param set the object to be modified
805  * @param str the string
806  * @param length the length of the string, or -1 if NUL-terminated
807  * @stable ICU 69
808  */
809 U_CAPI void U_EXPORT2
810 uset_complementAllCodePoints(USet *set, const UChar *str, int32_t length);
811 
812 /**
813  * Complements in this set all elements contained in the specified
814  * set.  Any character in the other set will be removed if it is
815  * in this set, or will be added if it is not in this set.
816  * A frozen set will not be modified.
817  *
818  * @param set the set with which to complement
819  * @param complement set that defines which elements will be xor'ed
820  * from this set.
821  * @stable ICU 3.2
822  */
823 U_CAPI void U_EXPORT2
824 uset_complementAll(USet* set, const USet* complement);
825 
826 /**
827  * Removes all of the elements from this set.  This set will be
828  * empty after this call returns.
829  * A frozen set will not be modified.
830  * @param set the set
831  * @stable ICU 2.4
832  */
833 U_CAPI void U_EXPORT2
834 uset_clear(USet* set);
835 
836 /**
837  * Close this set over the given attribute.  For the attribute
838  * USET_CASE_INSENSITIVE, the result is to modify this set so that:
839  *
840  * 1. For each character or string 'a' in this set, all strings or
841  * characters 'b' such that foldCase(a) == foldCase(b) are added
842  * to this set.
843  *
844  * 2. For each string 'e' in the resulting set, if e !=
845  * foldCase(e), 'e' will be removed.
846  *
847  * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}]
848  *
849  * (Here foldCase(x) refers to the operation u_strFoldCase, and a
850  * == b denotes that the contents are the same, not pointer
851  * comparison.)
852  *
853  * A frozen set will not be modified.
854  *
855  * @param set the set
856  *
857  * @param attributes bitmask for attributes to close over.
858  * Valid options:
859  * At most one of USET_CASE_INSENSITIVE, USET_ADD_CASE_MAPPINGS, USET_SIMPLE_CASE_INSENSITIVE.
860  * These case options are mutually exclusive.
861  * Unrelated options bits are ignored.
862  * @stable ICU 4.2
863  */
864 U_CAPI void U_EXPORT2
865 uset_closeOver(USet* set, int32_t attributes);
866 
867 /**
868  * Remove all strings from this set.
869  *
870  * @param set the set
871  * @stable ICU 4.2
872  */
873 U_CAPI void U_EXPORT2
874 uset_removeAllStrings(USet* set);
875 
876 /**
877  * Returns true if the given USet contains no characters and no
878  * strings.
879  * @param set the set
880  * @return true if set is empty
881  * @stable ICU 2.4
882  */
883 U_CAPI UBool U_EXPORT2
884 uset_isEmpty(const USet* set);
885 
886 /**
887  * @param set the set
888  * @return true if this set contains multi-character strings or the empty string.
889  * @stable ICU 70
890  */
891 U_CAPI UBool U_EXPORT2
892 uset_hasStrings(const USet *set);
893 
894 /**
895  * Returns true if the given USet contains the given character.
896  * This function works faster with a frozen set.
897  * @param set the set
898  * @param c The codepoint to check for within the set
899  * @return true if set contains c
900  * @stable ICU 2.4
901  */
902 U_CAPI UBool U_EXPORT2
903 uset_contains(const USet* set, UChar32 c);
904 
905 /**
906  * Returns true if the given USet contains all characters c
907  * where start <= c && c <= end.
908  * @param set the set
909  * @param start the first character of the range to test, inclusive
910  * @param end the last character of the range to test, inclusive
911  * @return true if set contains the range
912  * @stable ICU 2.2
913  */
914 U_CAPI UBool U_EXPORT2
915 uset_containsRange(const USet* set, UChar32 start, UChar32 end);
916 
917 /**
918  * Returns true if the given USet contains the given string.
919  * @param set the set
920  * @param str the string
921  * @param strLen the length of the string or -1 if null terminated.
922  * @return true if set contains str
923  * @stable ICU 2.4
924  */
925 U_CAPI UBool U_EXPORT2
926 uset_containsString(const USet* set, const UChar* str, int32_t strLen);
927 
928 /**
929  * Returns the index of the given character within this set, where
930  * the set is ordered by ascending code point.  If the character
931  * is not in this set, return -1.  The inverse of this method is
932  * <code>uset_charAt()</code>.
933  * @param set the set
934  * @param c the character to obtain the index for
935  * @return an index from 0..size()-1, or -1
936  * @stable ICU 3.2
937  */
938 U_CAPI int32_t U_EXPORT2
939 uset_indexOf(const USet* set, UChar32 c);
940 
941 /**
942  * Returns the character at the given index within this set, where
943  * the set is ordered by ascending code point.  If the index is
944  * out of range for characters, returns (UChar32)-1.
945  * The inverse of this method is <code>uset_indexOf()</code>.
946  *
947  * For iteration, this is slower than uset_getRangeCount()/uset_getItemCount()
948  * with uset_getItem(), because for each call it skips linearly over <code>index</code>
949  * characters in the ranges.
950  *
951  * @param set the set
952  * @param charIndex an index from 0..size()-1 to obtain the char for
953  * @return the character at the given index, or (UChar32)-1.
954  * @stable ICU 3.2
955  */
956 U_CAPI UChar32 U_EXPORT2
957 uset_charAt(const USet* set, int32_t charIndex);
958 
959 /**
960  * Returns the number of characters and strings contained in this set.
961  * The last uset_getStringCount() == (uset_getItemCount() - uset_getRangeCount()) items are strings.
962  *
963  * This is slower than uset_getRangeCount() and uset_getItemCount() because
964  * it counts the code points of all ranges.
965  *
966  * @param set the set
967  * @return a non-negative integer counting the characters and strings
968  * contained in set
969  * @stable ICU 2.4
970  * @see uset_getRangeCount
971  * @see uset_getStringCount
972  * @see uset_getItemCount
973  */
974 U_CAPI int32_t U_EXPORT2
975 uset_size(const USet* set);
976 
977 /**
978  * @param set the set
979  * @return the number of ranges in this set.
980  * @stable ICU 70
981  * @see uset_getItemCount
982  * @see uset_getItem
983  * @see uset_getStringCount
984  * @see uset_size
985  */
986 U_CAPI int32_t U_EXPORT2
987 uset_getRangeCount(const USet *set);
988 
989 #ifndef U_HIDE_DRAFT_API
990 
991 /**
992  * @param set the set
993  * @return the number of strings in this set.
994  * @draft ICU 76
995  * @see uset_getRangeCount
996  * @see uset_getItemCount
997  * @see uset_size
998  */
999 U_CAPI int32_t U_EXPORT2
1000 uset_getStringCount(const USet *set);
1001 
1002 /**
1003  * Returns the index-th string (empty or multi-character) in the set.
1004  * The string may not be NUL-terminated.
1005  * The output length must be used, and the caller must not read more than that many UChars.
1006  *
1007  * @param set the set
1008  * @param index the string index, 0 .. uset_getStringCount() - 1
1009  * @param pLength the output string length; must not be NULL
1010  * @return the pointer to the string; NULL if the index is out of range or pLength is NULL
1011  * @draft ICU 76
1012  * @see uset_getStringCount
1013  */
1014 U_CAPI const UChar* U_EXPORT2
1015 uset_getString(const USet *set, int32_t index, int32_t *pLength);
1016 
1017 #endif  // U_HIDE_DRAFT_API
1018 
1019 /**
1020  * Returns the number of items in this set.  An item is either a range
1021  * of characters or a single multicharacter string.
1022  * @param set the set
1023  * @return a non-negative integer counting the character ranges
1024  * and/or strings contained in set
1025  * @stable ICU 2.4
1026  * @see uset_getRangeCount
1027  * @see uset_getStringCount
1028  */
1029 U_CAPI int32_t U_EXPORT2
1030 uset_getItemCount(const USet* set);
1031 
1032 /**
1033  * Returns an item of this set.  An item is either a range of
1034  * characters or a single multicharacter string (which can be the empty string).
1035  *
1036  * If <code>itemIndex</code> is less than uset_getRangeCount(), then this function returns 0,
1037  * and the range is <code>*start</code>..<code>*end</code>.
1038  *
1039  * If <code>itemIndex</code> is at least uset_getRangeCount() and less than uset_getItemCount(), then
1040  * this function copies the string into <code>str[strCapacity]</code> and
1041  * returns the length of the string (0 for the empty string).
1042  * See uset_getString() for a function that does not copy the string contents.
1043  *
1044  * If <code>itemIndex</code> is out of range, then this function returns -1.
1045  *
1046  * Note that 0 is returned for each range as well as for the empty string.
1047  *
1048  * @param set the set
1049  * @param itemIndex a non-negative integer in the range 0..uset_getItemCount(set)-1
1050  * @param start pointer to variable to receive first character in range, inclusive;
1051  *              can be NULL for a string item
1052  * @param end pointer to variable to receive last character in range, inclusive;
1053  *            can be NULL for a string item
1054  * @param str buffer to receive the string, may be NULL
1055  * @param strCapacity capacity of str, or 0 if str is NULL
1056  * @param ec error code; U_INDEX_OUTOFBOUNDS_ERROR if the itemIndex is out of range
1057  * @return the length of the string (0 or >= 2), or 0 if the item is a range,
1058  *         or -1 if the itemIndex is out of range
1059  * @stable ICU 2.4
1060  * @see uset_getString
1061  */
1062 U_CAPI int32_t U_EXPORT2
1063 uset_getItem(const USet* set, int32_t itemIndex,
1064              UChar32* start, UChar32* end,
1065              UChar* str, int32_t strCapacity,
1066              UErrorCode* ec);
1067 
1068 /**
1069  * Returns true if set1 contains all the characters and strings
1070  * of set2. It answers the question, 'Is set1 a superset of set2?'
1071  * @param set1 set to be checked for containment
1072  * @param set2 set to be checked for containment
1073  * @return true if the test condition is met
1074  * @stable ICU 3.2
1075  */
1076 U_CAPI UBool U_EXPORT2
1077 uset_containsAll(const USet* set1, const USet* set2);
1078 
1079 /**
1080  * Returns true if this set contains all the characters
1081  * of the given string. Unlike uset_containsString, this does not check
1082  * containment of grapheme clusters.
1083  * @param set set of characters to be checked for containment
1084  * @param str string containing codepoints to be checked for containment
1085  * @param strLen the length of the string or -1 if null terminated.
1086  * @return true if the test condition is met
1087  * @stable ICU 3.4
1088  */
1089 U_CAPI UBool U_EXPORT2
1090 uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen);
1091 
1092 /**
1093  * Returns true if set1 contains none of the characters and strings
1094  * of set2. It answers the question, 'Are set1 and set2 disjoint?'
1095  * @param set1 set to be checked for containment
1096  * @param set2 set to be checked for containment
1097  * @return true if the test condition is met
1098  * @stable ICU 3.2
1099  */
1100 U_CAPI UBool U_EXPORT2
1101 uset_containsNone(const USet* set1, const USet* set2);
1102 
1103 /**
1104  * Returns true if set1 contains some of the characters and strings
1105  * of set2. It answers the question, 'Do set1 and set2 have an intersection?'
1106  * @param set1 set to be checked for containment
1107  * @param set2 set to be checked for containment
1108  * @return true if the test condition is met
1109  * @stable ICU 3.2
1110  */
1111 U_CAPI UBool U_EXPORT2
1112 uset_containsSome(const USet* set1, const USet* set2);
1113 
1114 /**
1115  * Returns the length of the initial substring of the input string which
1116  * consists only of characters and strings that are contained in this set
1117  * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
1118  * or only of characters and strings that are not contained
1119  * in this set (USET_SPAN_NOT_CONTAINED).
1120  * See USetSpanCondition for details.
1121  * Similar to the strspn() C library function.
1122  * Unpaired surrogates are treated according to contains() of their surrogate code points.
1123  * This function works faster with a frozen set and with a non-negative string length argument.
1124  * @param set the set
1125  * @param s start of the string
1126  * @param length of the string; can be -1 for NUL-terminated
1127  * @param spanCondition specifies the containment condition
1128  * @return the length of the initial substring according to the spanCondition;
1129  *         0 if the start of the string does not fit the spanCondition
1130  * @stable ICU 3.8
1131  * @see USetSpanCondition
1132  */
1133 U_CAPI int32_t U_EXPORT2
1134 uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition);
1135 
1136 /**
1137  * Returns the start of the trailing substring of the input string which
1138  * consists only of characters and strings that are contained in this set
1139  * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
1140  * or only of characters and strings that are not contained
1141  * in this set (USET_SPAN_NOT_CONTAINED).
1142  * See USetSpanCondition for details.
1143  * Unpaired surrogates are treated according to contains() of their surrogate code points.
1144  * This function works faster with a frozen set and with a non-negative string length argument.
1145  * @param set the set
1146  * @param s start of the string
1147  * @param length of the string; can be -1 for NUL-terminated
1148  * @param spanCondition specifies the containment condition
1149  * @return the start of the trailing substring according to the spanCondition;
1150  *         the string length if the end of the string does not fit the spanCondition
1151  * @stable ICU 3.8
1152  * @see USetSpanCondition
1153  */
1154 U_CAPI int32_t U_EXPORT2
1155 uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition);
1156 
1157 /**
1158  * Returns the length of the initial substring of the input string which
1159  * consists only of characters and strings that are contained in this set
1160  * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
1161  * or only of characters and strings that are not contained
1162  * in this set (USET_SPAN_NOT_CONTAINED).
1163  * See USetSpanCondition for details.
1164  * Similar to the strspn() C library function.
1165  * Malformed byte sequences are treated according to contains(0xfffd).
1166  * This function works faster with a frozen set and with a non-negative string length argument.
1167  * @param set the set
1168  * @param s start of the string (UTF-8)
1169  * @param length of the string; can be -1 for NUL-terminated
1170  * @param spanCondition specifies the containment condition
1171  * @return the length of the initial substring according to the spanCondition;
1172  *         0 if the start of the string does not fit the spanCondition
1173  * @stable ICU 3.8
1174  * @see USetSpanCondition
1175  */
1176 U_CAPI int32_t U_EXPORT2
1177 uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition);
1178 
1179 /**
1180  * Returns the start of the trailing substring of the input string which
1181  * consists only of characters and strings that are contained in this set
1182  * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
1183  * or only of characters and strings that are not contained
1184  * in this set (USET_SPAN_NOT_CONTAINED).
1185  * See USetSpanCondition for details.
1186  * Malformed byte sequences are treated according to contains(0xfffd).
1187  * This function works faster with a frozen set and with a non-negative string length argument.
1188  * @param set the set
1189  * @param s start of the string (UTF-8)
1190  * @param length of the string; can be -1 for NUL-terminated
1191  * @param spanCondition specifies the containment condition
1192  * @return the start of the trailing substring according to the spanCondition;
1193  *         the string length if the end of the string does not fit the spanCondition
1194  * @stable ICU 3.8
1195  * @see USetSpanCondition
1196  */
1197 U_CAPI int32_t U_EXPORT2
1198 uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition);
1199 
1200 /**
1201  * Returns true if set1 contains all of the characters and strings
1202  * of set2, and vice versa. It answers the question, 'Is set1 equal to set2?'
1203  * @param set1 set to be checked for containment
1204  * @param set2 set to be checked for containment
1205  * @return true if the test condition is met
1206  * @stable ICU 3.2
1207  */
1208 U_CAPI UBool U_EXPORT2
1209 uset_equals(const USet* set1, const USet* set2);
1210 
1211 /*********************************************************************
1212  * Serialized set API
1213  *********************************************************************/
1214 
1215 /**
1216  * Serializes this set into an array of 16-bit integers.  Serialization
1217  * (currently) only records the characters in the set; multicharacter
1218  * strings are ignored.
1219  *
1220  * The array
1221  * has the following format (each line is one 16-bit integer):
1222  *
1223  *  length     = (n+2*m) | (m!=0?0x8000:0)
1224  *  bmpLength  = n; present if m!=0
1225  *  bmp[0]
1226  *  bmp[1]
1227  *  ...
1228  *  bmp[n-1]
1229  *  supp-high[0]
1230  *  supp-low[0]
1231  *  supp-high[1]
1232  *  supp-low[1]
1233  *  ...
1234  *  supp-high[m-1]
1235  *  supp-low[m-1]
1236  *
1237  * The array starts with a header.  After the header are n bmp
1238  * code points, then m supplementary code points.  Either n or m
1239  * or both may be zero.  n+2*m is always <= 0x7FFF.
1240  *
1241  * If there are no supplementary characters (if m==0) then the
1242  * header is one 16-bit integer, 'length', with value n.
1243  *
1244  * If there are supplementary characters (if m!=0) then the header
1245  * is two 16-bit integers.  The first, 'length', has value
1246  * (n+2*m)|0x8000.  The second, 'bmpLength', has value n.
1247  *
1248  * After the header the code points are stored in ascending order.
1249  * Supplementary code points are stored as most significant 16
1250  * bits followed by least significant 16 bits.
1251  *
1252  * @param set the set
1253  * @param dest pointer to buffer of destCapacity 16-bit integers.
1254  * May be NULL only if destCapacity is zero.
1255  * @param destCapacity size of dest, or zero.  Must not be negative.
1256  * @param pErrorCode pointer to the error code.  Will be set to
1257  * U_INDEX_OUTOFBOUNDS_ERROR if n+2*m > 0x7FFF.  Will be set to
1258  * U_BUFFER_OVERFLOW_ERROR if n+2*m+(m!=0?2:1) > destCapacity.
1259  * @return the total length of the serialized format, including
1260  * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other
1261  * than U_BUFFER_OVERFLOW_ERROR.
1262  * @stable ICU 2.4
1263  */
1264 U_CAPI int32_t U_EXPORT2
1265 uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* pErrorCode);
1266 
1267 /**
1268  * Given a serialized array, fill in the given serialized set object.
1269  * @param fillSet pointer to result
1270  * @param src pointer to start of array
1271  * @param srcLength length of array
1272  * @return true if the given array is valid, otherwise false
1273  * @stable ICU 2.4
1274  */
1275 U_CAPI UBool U_EXPORT2
1276 uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength);
1277 
1278 /**
1279  * Set the USerializedSet to contain the given character (and nothing
1280  * else).
1281  * @param fillSet pointer to result
1282  * @param c The codepoint to set
1283  * @stable ICU 2.4
1284  */
1285 U_CAPI void U_EXPORT2
1286 uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c);
1287 
1288 /**
1289  * Returns true if the given USerializedSet contains the given
1290  * character.
1291  * @param set the serialized set
1292  * @param c The codepoint to check for within the set
1293  * @return true if set contains c
1294  * @stable ICU 2.4
1295  */
1296 U_CAPI UBool U_EXPORT2
1297 uset_serializedContains(const USerializedSet* set, UChar32 c);
1298 
1299 /**
1300  * Returns the number of disjoint ranges of characters contained in
1301  * the given serialized set.  Ignores any strings contained in the
1302  * set.
1303  * @param set the serialized set
1304  * @return a non-negative integer counting the character ranges
1305  * contained in set
1306  * @stable ICU 2.4
1307  */
1308 U_CAPI int32_t U_EXPORT2
1309 uset_getSerializedRangeCount(const USerializedSet* set);
1310 
1311 /**
1312  * Returns a range of characters contained in the given serialized
1313  * set.
1314  * @param set the serialized set
1315  * @param rangeIndex a non-negative integer in the range 0..
1316  * uset_getSerializedRangeCount(set)-1
1317  * @param pStart pointer to variable to receive first character
1318  * in range, inclusive
1319  * @param pEnd pointer to variable to receive last character in range,
1320  * inclusive
1321  * @return true if rangeIndex is valid, otherwise false
1322  * @stable ICU 2.4
1323  */
1324 U_CAPI UBool U_EXPORT2
1325 uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex,
1326                         UChar32* pStart, UChar32* pEnd);
1327 
1328 #if U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API
1329 #ifndef U_HIDE_DRAFT_API
1330 
1331 namespace U_HEADER_ONLY_NAMESPACE {
1332 
1333 // Note: Not U_COMMON_API, and not a subclass of UMemory, because this is a header-only class,
1334 // not intended to be used via export from the ICU DLL.
1335 
1336 /**
1337  * Iterator returned by USetCodePoints.
1338  * @draft ICU 76
1339  */
1340 class USetCodePointIterator {
1341 public:
1342     /** @draft ICU 76 */
1343     USetCodePointIterator(const USetCodePointIterator &other) = default;
1344 
1345     /** @draft ICU 76 */
1346     bool operator==(const USetCodePointIterator &other) const {
1347         // No need to compare rangeCount & end given private constructor
1348         // and assuming we don't compare iterators across the set being modified.
1349         // And comparing rangeIndex is redundant with comparing c.
1350         // We might even skip comparing uset.
1351         // Unless we want operator==() to be "correct" for more than iteration.
1352         return uset == other.uset && c == other.c;
1353     }
1354 
1355     /** @draft ICU 76 */
1356     bool operator!=(const USetCodePointIterator &other) const { return !operator==(other); }
1357 
1358     /** @draft ICU 76 */
1359     UChar32 operator*() const { return c; }
1360 
1361     /**
1362      * Pre-increment.
1363      * @draft ICU 76
1364      */
1365     USetCodePointIterator &operator++() {
1366         if (c < end) {
1367             ++c;
1368         } else if (rangeIndex < rangeCount) {
1369             UErrorCode errorCode = U_ZERO_ERROR;
1370             int32_t result = uset_getItem(uset, rangeIndex, &c, &end, nullptr, 0, &errorCode);
1371             if (U_SUCCESS(errorCode) && result == 0) {
1372                 ++rangeIndex;
1373             } else {
1374                 c = end = U_SENTINEL;
1375             }
1376         } else {
1377             c = end = U_SENTINEL;
1378         }
1379         return *this;
1380     }
1381 
1382     /**
1383      * Post-increment.
1384      * @draft ICU 76
1385      */
1386     USetCodePointIterator operator++(int) {
1387         USetCodePointIterator result(*this);
1388         operator++();
1389         return result;
1390     }
1391 
1392 private:
1393     friend class USetCodePoints;
1394 
USetCodePointIterator(const USet * uset,int32_t rangeIndex,int32_t rangeCount)1395     USetCodePointIterator(const USet *uset, int32_t rangeIndex, int32_t rangeCount)
1396             : uset(uset), rangeIndex(rangeIndex), rangeCount(rangeCount),
1397                 c(U_SENTINEL), end(U_SENTINEL) {
1398         // Fetch the first range.
1399         operator++();
1400     }
1401 
1402     const USet *uset;
1403     int32_t rangeIndex;
1404     int32_t rangeCount;
1405     UChar32 c, end;
1406 };
1407 
1408 /**
1409  * C++ "range" for iterating over the code points of a USet.
1410  *
1411  * \code
1412  * using U_HEADER_NESTED_NAMESPACE::USetCodePoints;
1413  * LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴]", -1, &errorCode));
1414  * for (UChar32 c : USetCodePoints(uset.getAlias())) {
1415  *     printf("uset.codePoint U+%04lx\n", (long)c);
1416  * }
1417  * \endcode
1418  *
1419  * C++ UnicodeSet has member functions for iteration, including codePoints().
1420  *
1421  * @draft ICU 76
1422  * @see USetRanges
1423  * @see USetStrings
1424  * @see USetElements
1425  */
1426 class USetCodePoints {
1427 public:
1428     /**
1429      * Constructs a C++ "range" object over the code points of the USet.
1430      * @draft ICU 76
1431      */
USetCodePoints(const USet * uset)1432     USetCodePoints(const USet *uset) : uset(uset), rangeCount(uset_getRangeCount(uset)) {}
1433 
1434     /** @draft ICU 76 */
1435     USetCodePoints(const USetCodePoints &other) = default;
1436 
1437     /** @draft ICU 76 */
begin()1438     USetCodePointIterator begin() const {
1439         return USetCodePointIterator(uset, 0, rangeCount);
1440     }
1441 
1442     /** @draft ICU 76 */
end()1443     USetCodePointIterator end() const {
1444         return USetCodePointIterator(uset, rangeCount, rangeCount);
1445     }
1446 
1447 private:
1448     const USet *uset;
1449     int32_t rangeCount;
1450 };
1451 
1452 /**
1453  * A contiguous range of code points in a USet/UnicodeSet.
1454  * Returned by USetRangeIterator which is returned by USetRanges.
1455  * Both the rangeStart and rangeEnd are in the range.
1456  * (end() returns an iterator corresponding to rangeEnd+1.)
1457  * @draft ICU 76
1458  */
1459 struct CodePointRange {
1460     /** @draft ICU 76 */
1461     struct iterator {
1462         /** @draft ICU 76 */
iteratorCodePointRange::iterator1463         iterator(UChar32 c) : c(c) {}
1464 
1465         /** @draft ICU 76 */
1466         bool operator==(const iterator &other) const { return c == other.c; }
1467         /** @draft ICU 76 */
1468         bool operator!=(const iterator &other) const { return !operator==(other); }
1469 
1470         /** @draft ICU 76 */
1471         UChar32 operator*() const { return c; }
1472 
1473         /**
1474          * Pre-increment.
1475          * @draft ICU 76
1476          */
1477         iterator &operator++() {
1478             ++c;
1479             return *this;
1480         }
1481 
1482         /**
1483          * Post-increment.
1484          * @draft ICU 76
1485          */
1486         iterator operator++(int) {
1487             return c++;
1488         }
1489 
1490         /**
1491          * The current code point in the range.
1492          * @draft ICU 76
1493          */
1494         UChar32 c;
1495     };
1496 
1497     /** @draft ICU 76 */
CodePointRangeCodePointRange1498     CodePointRange(UChar32 start, UChar32 end) : rangeStart(start), rangeEnd(end) {}
1499     /** @draft ICU 76 */
1500     CodePointRange(const CodePointRange &other) = default;
1501     /** @draft ICU 76 */
sizeCodePointRange1502     size_t size() const { return (rangeEnd + 1) - rangeStart; }
1503     /** @draft ICU 76 */
beginCodePointRange1504     iterator begin() const { return rangeStart; }
1505     /** @draft ICU 76 */
endCodePointRange1506     iterator end() const { return rangeEnd + 1; }
1507 
1508     /**
1509      * Start of a USet/UnicodeSet range of code points.
1510      * @draft ICU 76
1511      */
1512     UChar32 rangeStart;
1513     /**
1514      * Inclusive end of a USet/UnicodeSet range of code points.
1515      * @draft ICU 76
1516      */
1517     UChar32 rangeEnd;
1518 };
1519 
1520 /**
1521  * Iterator returned by USetRanges.
1522  * @draft ICU 76
1523  */
1524 class USetRangeIterator {
1525 public:
1526     /** @draft ICU 76 */
1527     USetRangeIterator(const USetRangeIterator &other) = default;
1528 
1529     /** @draft ICU 76 */
1530     bool operator==(const USetRangeIterator &other) const {
1531         // No need to compare rangeCount given private constructor
1532         // and assuming we don't compare iterators across the set being modified.
1533         // We might even skip comparing uset.
1534         // Unless we want operator==() to be "correct" for more than iteration.
1535         return uset == other.uset && rangeIndex == other.rangeIndex;
1536     }
1537 
1538     /** @draft ICU 76 */
1539     bool operator!=(const USetRangeIterator &other) const { return !operator==(other); }
1540 
1541     /** @draft ICU 76 */
1542     CodePointRange operator*() const {
1543         if (rangeIndex < rangeCount) {
1544             UChar32 start, end;
1545             UErrorCode errorCode = U_ZERO_ERROR;
1546             int32_t result = uset_getItem(uset, rangeIndex, &start, &end, nullptr, 0, &errorCode);
1547             if (U_SUCCESS(errorCode) && result == 0) {
1548                 return CodePointRange(start, end);
1549             }
1550         }
1551         return CodePointRange(U_SENTINEL, U_SENTINEL);
1552     }
1553 
1554     /**
1555      * Pre-increment.
1556      * @draft ICU 76
1557      */
1558     USetRangeIterator &operator++() {
1559         ++rangeIndex;
1560         return *this;
1561     }
1562 
1563     /**
1564      * Post-increment.
1565      * @draft ICU 76
1566      */
1567     USetRangeIterator operator++(int) {
1568         USetRangeIterator result(*this);
1569         ++rangeIndex;
1570         return result;
1571     }
1572 
1573 private:
1574     friend class USetRanges;
1575 
USetRangeIterator(const USet * uset,int32_t rangeIndex,int32_t rangeCount)1576     USetRangeIterator(const USet *uset, int32_t rangeIndex, int32_t rangeCount)
1577             : uset(uset), rangeIndex(rangeIndex), rangeCount(rangeCount) {}
1578 
1579     const USet *uset;
1580     int32_t rangeIndex;
1581     int32_t rangeCount;
1582 };
1583 
1584 /**
1585  * C++ "range" for iterating over the code point ranges of a USet.
1586  *
1587  * \code
1588  * using U_HEADER_NESTED_NAMESPACE::USetRanges;
1589  * LocalUSetPointer uset(uset_openPattern(u"[abcçカ🚴]", -1, &errorCode));
1590  * for (auto [start, end] : USetRanges(uset.getAlias())) {
1591  *     printf("uset.range U+%04lx..U+%04lx\n", (long)start, (long)end);
1592  * }
1593  * for (auto range : USetRanges(uset.getAlias())) {
1594  *     for (UChar32 c : range) {
1595  *         printf("uset.range.c U+%04lx\n", (long)c);
1596  *     }
1597  * }
1598  * \endcode
1599  *
1600  * C++ UnicodeSet has member functions for iteration, including ranges().
1601  *
1602  * @draft ICU 76
1603  * @see USetCodePoints
1604  * @see USetStrings
1605  * @see USetElements
1606  */
1607 class USetRanges {
1608 public:
1609     /**
1610      * Constructs a C++ "range" object over the code point ranges of the USet.
1611      * @draft ICU 76
1612      */
USetRanges(const USet * uset)1613     USetRanges(const USet *uset) : uset(uset), rangeCount(uset_getRangeCount(uset)) {}
1614 
1615     /** @draft ICU 76 */
1616     USetRanges(const USetRanges &other) = default;
1617 
1618     /** @draft ICU 76 */
begin()1619     USetRangeIterator begin() const {
1620         return USetRangeIterator(uset, 0, rangeCount);
1621     }
1622 
1623     /** @draft ICU 76 */
end()1624     USetRangeIterator end() const {
1625         return USetRangeIterator(uset, rangeCount, rangeCount);
1626     }
1627 
1628 private:
1629     const USet *uset;
1630     int32_t rangeCount;
1631 };
1632 
1633 /**
1634  * Iterator returned by USetStrings.
1635  * @draft ICU 76
1636  */
1637 class USetStringIterator {
1638 public:
1639     /** @draft ICU 76 */
1640     USetStringIterator(const USetStringIterator &other) = default;
1641 
1642     /** @draft ICU 76 */
1643     bool operator==(const USetStringIterator &other) const {
1644         // No need to compare count given private constructor
1645         // and assuming we don't compare iterators across the set being modified.
1646         // We might even skip comparing uset.
1647         // Unless we want operator==() to be "correct" for more than iteration.
1648         return uset == other.uset && index == other.index;
1649     }
1650 
1651     /** @draft ICU 76 */
1652     bool operator!=(const USetStringIterator &other) const { return !operator==(other); }
1653 
1654     /** @draft ICU 76 */
1655     std::u16string_view operator*() const {
1656         if (index < count) {
1657             int32_t length;
1658             const UChar *uchars = uset_getString(uset, index, &length);
1659             // assert uchars != nullptr;
1660             return {ConstChar16Ptr(uchars), static_cast<uint32_t>(length)};
1661         }
1662         return {};
1663     }
1664 
1665     /**
1666      * Pre-increment.
1667      * @draft ICU 76
1668      */
1669     USetStringIterator &operator++() {
1670         ++index;
1671         return *this;
1672     }
1673 
1674     /**
1675      * Post-increment.
1676      * @draft ICU 76
1677      */
1678     USetStringIterator operator++(int) {
1679         USetStringIterator result(*this);
1680         ++index;
1681         return result;
1682     }
1683 
1684 private:
1685     friend class USetStrings;
1686 
USetStringIterator(const USet * uset,int32_t index,int32_t count)1687     USetStringIterator(const USet *uset, int32_t index, int32_t count)
1688             : uset(uset), index(index), count(count) {}
1689 
1690     const USet *uset;
1691     int32_t index;
1692     int32_t count;
1693 };
1694 
1695 /**
1696  * C++ "range" for iterating over the empty and multi-character strings of a USet.
1697  *
1698  * \code
1699  * using U_HEADER_NESTED_NAMESPACE::USetStrings;
1700  * LocalUSetPointer uset(uset_openPattern(u"[abcçカ��{}{abc}{de}]", -1, &errorCode));
1701  * for (auto s : USetStrings(uset.getAlias())) {
1702  *     UnicodeString us(s);
1703  *     std::string u8;
1704  *     printf("uset.string length %ld \"%s\"\n", (long)s.length(), us.toUTF8String(u8).c_str());
1705  * }
1706  * \endcode
1707  *
1708  * C++ UnicodeSet has member functions for iteration, including strings().
1709  *
1710  * @draft ICU 76
1711  * @see USetCodePoints
1712  * @see USetRanges
1713  * @see USetElements
1714  */
1715 class USetStrings {
1716 public:
1717     /**
1718      * Constructs a C++ "range" object over the strings of the USet.
1719      * @draft ICU 76
1720      */
USetStrings(const USet * uset)1721     USetStrings(const USet *uset) : uset(uset), count(uset_getStringCount(uset)) {}
1722 
1723     /** @draft ICU 76 */
1724     USetStrings(const USetStrings &other) = default;
1725 
1726     /** @draft ICU 76 */
begin()1727     USetStringIterator begin() const {
1728         return USetStringIterator(uset, 0, count);
1729     }
1730 
1731     /** @draft ICU 76 */
end()1732     USetStringIterator end() const {
1733         return USetStringIterator(uset, count, count);
1734     }
1735 
1736 private:
1737     const USet *uset;
1738     int32_t count;
1739 };
1740 
1741 /**
1742  * Iterator returned by USetElements.
1743  * @draft ICU 76
1744  */
1745 class USetElementIterator {
1746 public:
1747     /** @draft ICU 76 */
1748     USetElementIterator(const USetElementIterator &other) = default;
1749 
1750     /** @draft ICU 76 */
1751     bool operator==(const USetElementIterator &other) const {
1752         // No need to compare rangeCount & end given private constructor
1753         // and assuming we don't compare iterators across the set being modified.
1754         // We might even skip comparing uset.
1755         // Unless we want operator==() to be "correct" for more than iteration.
1756         return uset == other.uset && c == other.c && index == other.index;
1757     }
1758 
1759     /** @draft ICU 76 */
1760     bool operator!=(const USetElementIterator &other) const { return !operator==(other); }
1761 
1762     /** @draft ICU 76 */
1763     UnicodeString operator*() const {
1764         if (c >= 0) {
1765             return UnicodeString(c);
1766         } else if (index < totalCount) {
1767             int32_t length;
1768             const UChar *uchars = uset_getString(uset, index - rangeCount, &length);
1769             // assert uchars != nullptr;
1770             return UnicodeString(uchars, length);
1771         } else {
1772             return UnicodeString();
1773         }
1774     }
1775 
1776     /**
1777      * Pre-increment.
1778      * @draft ICU 76
1779      */
1780     USetElementIterator &operator++() {
1781         if (c < end) {
1782             ++c;
1783         } else if (index < rangeCount) {
1784             UErrorCode errorCode = U_ZERO_ERROR;
1785             int32_t result = uset_getItem(uset, index, &c, &end, nullptr, 0, &errorCode);
1786             if (U_SUCCESS(errorCode) && result == 0) {
1787                 ++index;
1788             } else {
1789                 c = end = U_SENTINEL;
1790             }
1791         } else if (c >= 0) {
1792             // assert index == rangeCount;
1793             // Switch from the last range to the first string.
1794             c = end = U_SENTINEL;
1795         } else {
1796             ++index;
1797         }
1798         return *this;
1799     }
1800 
1801     /**
1802      * Post-increment.
1803      * @draft ICU 76
1804      */
1805     USetElementIterator operator++(int) {
1806         USetElementIterator result(*this);
1807         operator++();
1808         return result;
1809     }
1810 
1811 private:
1812     friend class USetElements;
1813 
USetElementIterator(const USet * uset,int32_t index,int32_t rangeCount,int32_t totalCount)1814     USetElementIterator(const USet *uset, int32_t index, int32_t rangeCount, int32_t totalCount)
1815             : uset(uset), index(index), rangeCount(rangeCount), totalCount(totalCount),
1816                 c(U_SENTINEL), end(U_SENTINEL) {
1817         if (index < rangeCount) {
1818             // Fetch the first range.
1819             operator++();
1820         }
1821         // Otherwise don't move beyond the (index - rangeCount)-th string.
1822     }
1823 
1824     const USet *uset;
1825     int32_t index;
1826     /** Number of UnicodeSet/USet code point ranges. */
1827     int32_t rangeCount;
1828     /**
1829      * Number of code point ranges plus number of strings.
1830      * index starts from 0, counts ranges while less than rangeCount,
1831      * then counts strings while at least rangeCount and less than totalCount.
1832      *
1833      * Note that totalCount is the same as uset_getItemCount(), but usually
1834      * smaller than the number of elements returned by this iterator
1835      * because we return each code point of each range.
1836      */
1837     int32_t totalCount;
1838     UChar32 c, end;
1839 };
1840 
1841 /**
1842  * A C++ "range" for iterating over all of the elements of a USet.
1843  * Convenient all-in one iteration, but creates a UnicodeString for each
1844  * code point or string.
1845  *
1846  * Code points are returned first, then empty and multi-character strings.
1847  *
1848  * \code
1849  * using U_HEADER_NESTED_NAMESPACE::USetElements;
1850  * LocalUSetPointer uset(uset_openPattern(u"[abcçカ��{}{abc}{de}]", -1, &errorCode));
1851  * for (auto el : USetElements(uset.getAlias())) {
1852  *     std::string u8;
1853  *     printf("uset.string length %ld \"%s\"\n", (long)el.length(), el.toUTF8String(u8).c_str());
1854  * }
1855  * \endcode
1856  *
1857  * C++ UnicodeSet has member functions for iteration, including begin() and end().
1858  *
1859  * @return an all-elements iterator.
1860  * @draft ICU 76
1861  * @see USetCodePoints
1862  * @see USetRanges
1863  * @see USetStrings
1864  */
1865 class USetElements {
1866 public:
1867     /**
1868      * Constructs a C++ "range" object over all of the elements of the USet.
1869      * @draft ICU 76
1870      */
USetElements(const USet * uset)1871     USetElements(const USet *uset)
1872         : uset(uset), rangeCount(uset_getRangeCount(uset)),
1873             stringCount(uset_getStringCount(uset)) {}
1874 
1875     /** @draft ICU 76 */
1876     USetElements(const USetElements &other) = default;
1877 
1878     /** @draft ICU 76 */
begin()1879     USetElementIterator begin() const {
1880         return USetElementIterator(uset, 0, rangeCount, rangeCount + stringCount);
1881     }
1882 
1883     /** @draft ICU 76 */
end()1884     USetElementIterator end() const {
1885         return USetElementIterator(uset, rangeCount + stringCount, rangeCount, rangeCount + stringCount);
1886     }
1887 
1888 private:
1889     const USet *uset;
1890     int32_t rangeCount, stringCount;
1891 };
1892 
1893 }  // namespace U_HEADER_ONLY_NAMESPACE
1894 
1895 #endif  // U_HIDE_DRAFT_API
1896 #endif  // U_SHOW_CPLUSPLUS_API || U_SHOW_CPLUSPLUS_HEADER_API
1897 
1898 #endif  // __USET_H__
1899