1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2002-2014, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: uset.h 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2002mar07 16 * created by: Markus W. Scherer 17 * 18 * C version of UnicodeSet. 19 */ 20 21 22 /** 23 * \file 24 * \brief C API: Unicode Set 25 * 26 * <p>This is a C wrapper around the C++ UnicodeSet class.</p> 27 */ 28 29 #ifndef __USET_H__ 30 #define __USET_H__ 31 32 #include "unicode/utypes.h" 33 #include "unicode/uchar.h" 34 35 #if U_SHOW_CPLUSPLUS_API 36 #include "unicode/localpointer.h" 37 #endif // U_SHOW_CPLUSPLUS_API 38 39 #ifndef USET_DEFINED 40 41 #ifndef U_IN_DOXYGEN 42 #define USET_DEFINED 43 #endif 44 /** 45 * USet is the C API type corresponding to C++ class UnicodeSet. 46 * Use the uset_* API to manipulate. Create with 47 * uset_open*, and destroy with uset_close. 48 * @stable ICU 2.4 49 */ 50 typedef struct USet USet; 51 #endif 52 53 /** 54 * Bitmask values to be passed to uset_openPatternOptions() or 55 * uset_applyPattern() taking an option parameter. 56 * @stable ICU 2.4 57 */ 58 enum { 59 /** 60 * Ignore white space within patterns unless quoted or escaped. 61 * @stable ICU 2.4 62 */ 63 USET_IGNORE_SPACE = 1, 64 65 /** 66 * Enable case insensitive matching. E.g., "[ab]" with this flag 67 * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will 68 * match all except 'a', 'A', 'b', and 'B'. This performs a full 69 * closure over case mappings, e.g. U+017F for s. 70 * 71 * The resulting set is a superset of the input for the code points but 72 * not for the strings. 
73 * It performs a case mapping closure of the code points and adds 74 * full case folding strings for the code points, and reduces strings of 75 * the original set to their full case folding equivalents. 76 * 77 * This is designed for case-insensitive matches, for example 78 * in regular expressions. The full code point case closure allows checking of 79 * an input character directly against the closure set. 80 * Strings are matched by comparing the case-folded form from the closure 81 * set with an incremental case folding of the string in question. 82 * 83 * The closure set will also contain single code points if the original 84 * set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.). 85 * This is not necessary (that is, redundant) for the above matching method 86 * but results in the same closure sets regardless of whether the original 87 * set contained the code point or a string. 88 * 89 * @stable ICU 2.4 90 */ 91 USET_CASE_INSENSITIVE = 2, 92 93 /** 94 * Enable case insensitive matching. E.g., "[ab]" with this flag 95 * will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will 96 * match all except 'a', 'A', 'b', and 'B'. This adds the lower-, 97 * title-, and uppercase mappings as well as the case folding 98 * of each existing element in the set. 99 * @stable ICU 3.2 100 */ 101 USET_ADD_CASE_MAPPINGS = 4 102 }; 103 104 /** 105 * Argument values for whether span() and similar functions continue while 106 * the current character is contained vs. not contained in the set. 107 * 108 * The functionality is straightforward for sets with only single code points, 109 * without strings (which is the common case): 110 * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE work the same. 111 * - USET_SPAN_CONTAINED and USET_SPAN_SIMPLE are inverses of USET_SPAN_NOT_CONTAINED. 112 * - span() and spanBack() partition any string the same way when 113 * alternating between span(USET_SPAN_NOT_CONTAINED) and 114 * span(either "contained" condition). 
115 * - Using a complemented (inverted) set and the opposite span conditions 116 * yields the same results. 117 * 118 * When a set contains multi-code point strings, then these statements may not 119 * be true, depending on the strings in the set (for example, whether they 120 * overlap with each other) and the string that is processed. 121 * For a set with strings: 122 * - The complement of the set contains the opposite set of code points, 123 * but the same set of strings. 124 * Therefore, complementing both the set and the span conditions 125 * may yield different results. 126 * - When starting spans at different positions in a string 127 * (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different 128 * because a set string may start before the later position. 129 * - span(USET_SPAN_SIMPLE) may be shorter than 130 * span(USET_SPAN_CONTAINED) because it will not recursively try 131 * all possible paths. 132 * For example, with a set which contains the three strings "xy", "xya" and "ax", 133 * span("xyax", USET_SPAN_CONTAINED) will return 4 but 134 * span("xyax", USET_SPAN_SIMPLE) will return 3. 135 * span(USET_SPAN_SIMPLE) will never be longer than 136 * span(USET_SPAN_CONTAINED). 137 * - With either "contained" condition, span() and spanBack() may partition 138 * a string in different ways. 139 * For example, with a set which contains the two strings "ab" and "ba", 140 * and when processing the string "aba", 141 * span() will yield contained/not-contained boundaries of { 0, 2, 3 } 142 * while spanBack() will yield boundaries of { 0, 1, 3 }. 143 * 144 * Note: If it is important to get the same boundaries whether iterating forward 145 * or backward through a string, then either only span() should be used and 146 * the boundaries cached for backward operation, or an ICU BreakIterator 147 * could be used. 148 * 149 * Note: Unpaired surrogates are treated like surrogate code points. 
150 * Similarly, set strings match only on code point boundaries, 151 * never in the middle of a surrogate pair. 152 * Illegal UTF-8 sequences are treated like U+FFFD. 153 * When processing UTF-8 strings, malformed set strings 154 * (strings with unpaired surrogates which cannot be converted to UTF-8) 155 * are ignored. 156 * 157 * @stable ICU 3.8 158 */ 159 typedef enum USetSpanCondition { 160 /** 161 * Continues a span() while there is no set element at the current position. 162 * Increments by one code point at a time. 163 * Stops before the first set element (character or string). 164 * (For code points only, this is like while contains(current)==false). 165 * 166 * When span() returns, the substring between where it started and the position 167 * it returned consists only of characters that are not in the set, 168 * and none of its strings overlap with the span. 169 * 170 * @stable ICU 3.8 171 */ 172 USET_SPAN_NOT_CONTAINED = 0, 173 /** 174 * Spans the longest substring that is a concatenation of set elements (characters or strings). 175 * (For characters only, this is like while contains(current)==true). 176 * 177 * When span() returns, the substring between where it started and the position 178 * it returned consists only of set elements (characters or strings) that are in the set. 179 * 180 * If a set contains strings, then the span will be the longest substring for which there 181 * exists at least one non-overlapping concatenation of set elements (characters or strings). 182 * This is equivalent to a POSIX regular expression for <code>(OR of each set element)*</code>. 183 * (Java/ICU/Perl regex stops at the first match of an OR.) 184 * 185 * @stable ICU 3.8 186 */ 187 USET_SPAN_CONTAINED = 1, 188 /** 189 * Continues a span() while there is a set element at the current position. 190 * Increments by the longest matching element at each position. 191 * (For characters only, this is like while contains(current)==true). 
192 * 193 * When span() returns, the substring between where it started and the position 194 * it returned consists only of set elements (characters or strings) that are in the set. 195 * 196 * If a set only contains single characters, then this is the same 197 * as USET_SPAN_CONTAINED. 198 * 199 * If a set contains strings, then the span will be the longest substring 200 * with a match at each position with the longest single set element (character or string). 201 * 202 * Use this span condition together with other longest-match algorithms, 203 * such as ICU converters (ucnv_getUnicodeSet()). 204 * 205 * @stable ICU 3.8 206 */ 207 USET_SPAN_SIMPLE = 2, 208 #ifndef U_HIDE_DEPRECATED_API 209 /** 210 * One more than the last span condition. 211 * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420. 212 */ 213 USET_SPAN_CONDITION_COUNT 214 #endif // U_HIDE_DEPRECATED_API 215 } USetSpanCondition; 216 217 enum { 218 /** 219 * Capacity of USerializedSet::staticArray. 220 * Enough for any single-code point set. 221 * Also provides padding for nice sizeof(USerializedSet). 222 * @stable ICU 2.4 223 */ 224 USET_SERIALIZED_STATIC_ARRAY_CAPACITY=8 225 }; 226 227 /** 228 * A serialized form of a Unicode set. Limited manipulations are 229 * possible directly on a serialized set. See below. 230 * @stable ICU 2.4 231 */ 232 typedef struct USerializedSet { 233 /** 234 * The serialized Unicode Set. 235 * @stable ICU 2.4 236 */ 237 const uint16_t *array; 238 /** 239 * The length of the array that contains BMP characters. 240 * @stable ICU 2.4 241 */ 242 int32_t bmpLength; 243 /** 244 * The total length of the array. 245 * @stable ICU 2.4 246 */ 247 int32_t length; 248 /** 249 * A small buffer for the array to reduce memory allocations. 
250 * @stable ICU 2.4 251 */ 252 uint16_t staticArray[USET_SERIALIZED_STATIC_ARRAY_CAPACITY]; 253 } USerializedSet; 254 255 /********************************************************************* 256 * USet API 257 *********************************************************************/ 258 259 /** 260 * Create an empty USet object. 261 * Equivalent to uset_open(1, 0). 262 * @return a newly created USet. The caller must call uset_close() on 263 * it when done. 264 * @stable ICU 4.2 265 */ 266 U_CAPI USet* U_EXPORT2 267 uset_openEmpty(void); 268 269 /** 270 * Creates a USet object that contains the range of characters 271 * start..end, inclusive. If <code>start > end</code> 272 * then an empty set is created (same as using uset_openEmpty()). 273 * @param start first character of the range, inclusive 274 * @param end last character of the range, inclusive 275 * @return a newly created USet. The caller must call uset_close() on 276 * it when done. 277 * @stable ICU 2.4 278 */ 279 U_CAPI USet* U_EXPORT2 280 uset_open(UChar32 start, UChar32 end); 281 282 /** 283 * Creates a set from the given pattern. See the UnicodeSet class 284 * description for the syntax of the pattern language. 285 * @param pattern a string specifying what characters are in the set 286 * @param patternLength the length of the pattern, or -1 if null 287 * terminated 288 * @param ec the error code 289 * @stable ICU 2.4 290 */ 291 U_CAPI USet* U_EXPORT2 292 uset_openPattern(const UChar* pattern, int32_t patternLength, 293 UErrorCode* ec); 294 295 /** 296 * Creates a set from the given pattern. See the UnicodeSet class 297 * description for the syntax of the pattern language. 298 * @param pattern a string specifying what characters are in the set 299 * @param patternLength the length of the pattern, or -1 if null 300 * terminated 301 * @param options bitmask for options to apply to the pattern. 302 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. 
303 * @param ec the error code 304 * @stable ICU 2.4 305 */ 306 U_CAPI USet* U_EXPORT2 307 uset_openPatternOptions(const UChar* pattern, int32_t patternLength, 308 uint32_t options, 309 UErrorCode* ec); 310 311 /** 312 * Disposes of the storage used by a USet object. This function should 313 * be called exactly once for objects returned by uset_open(). 314 * @param set the object to dispose of 315 * @stable ICU 2.4 316 */ 317 U_CAPI void U_EXPORT2 318 uset_close(USet* set); 319 320 #if U_SHOW_CPLUSPLUS_API 321 322 U_NAMESPACE_BEGIN 323 324 /** 325 * \class LocalUSetPointer 326 * "Smart pointer" class, closes a USet via uset_close(). 327 * For most methods see the LocalPointerBase base class. 328 * 329 * @see LocalPointerBase 330 * @see LocalPointer 331 * @stable ICU 4.4 332 */ 333 U_DEFINE_LOCAL_OPEN_POINTER(LocalUSetPointer, USet, uset_close); 334 335 U_NAMESPACE_END 336 337 #endif 338 339 /** 340 * Returns a copy of this object. 341 * If this set is frozen, then the clone will be frozen as well. 342 * Use uset_cloneAsThawed() for a mutable clone of a frozen set. 343 * @param set the original set 344 * @return the newly allocated copy of the set 345 * @see uset_cloneAsThawed 346 * @stable ICU 3.8 347 */ 348 U_CAPI USet * U_EXPORT2 349 uset_clone(const USet *set); 350 351 /** 352 * Determines whether the set has been frozen (made immutable) or not. 353 * See the ICU4J Freezable interface for details. 354 * @param set the set 355 * @return true/false for whether the set has been frozen 356 * @see uset_freeze 357 * @see uset_cloneAsThawed 358 * @stable ICU 3.8 359 */ 360 U_CAPI UBool U_EXPORT2 361 uset_isFrozen(const USet *set); 362 363 /** 364 * Freeze the set (make it immutable). 365 * Once frozen, it cannot be unfrozen and is therefore thread-safe 366 * until it is deleted. 367 * See the ICU4J Freezable interface for details. 368 * Freezing the set may also make some operations faster, for example 369 * uset_contains() and uset_span(). 
370 * A frozen set will not be modified. (It remains frozen.) 371 * @param set the set 372 * This function returns no value; the set passed in is frozen in place. 373 * @see uset_isFrozen 374 * @see uset_cloneAsThawed 375 * @stable ICU 3.8 376 */ 377 U_CAPI void U_EXPORT2 378 uset_freeze(USet *set); 379 380 /** 381 * Clone the set and make the clone mutable. 382 * See the ICU4J Freezable interface for details. 383 * @param set the set 384 * @return the mutable clone 385 * @see uset_freeze 386 * @see uset_isFrozen 387 * @see uset_clone 388 * @stable ICU 3.8 389 */ 390 U_CAPI USet * U_EXPORT2 391 uset_cloneAsThawed(const USet *set); 392 393 /** 394 * Causes the USet object to represent the range <code>start - end</code>. 395 * If <code>start > end</code> then this USet is set to an empty range. 396 * A frozen set will not be modified. 397 * @param set the object to set to the given range 398 * @param start first character in the set, inclusive 399 * @param end last character in the set, inclusive 400 * @stable ICU 3.2 401 */ 402 U_CAPI void U_EXPORT2 403 uset_set(USet* set, 404 UChar32 start, UChar32 end); 405 406 /** 407 * Modifies the set to represent the set specified by the given 408 * pattern. See the UnicodeSet class description for the syntax of 409 * the pattern language. See also the User Guide chapter about UnicodeSet. 410 * <em>Empties the set passed before applying the pattern.</em> 411 * A frozen set will not be modified. 412 * @param set The set to which the pattern is to be applied. 413 * @param pattern A pointer to UChar string specifying what characters are in the set. 414 * The character at pattern[0] must be a '['. 415 * @param patternLength The length of the UChar string. -1 if NUL terminated. 416 * @param options A bitmask for options to apply to the pattern. 417 * Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE. 418 * @param status Returns an error if the pattern cannot be parsed. 
419 * @return Upon successful parse, the value is 420 * the index of the character after the closing ']' 421 * of the parsed pattern. 422 * If the status code indicates failure, then the return value 423 * is the index of the error in the source. 424 * 425 * @stable ICU 2.8 426 */ 427 U_CAPI int32_t U_EXPORT2 428 uset_applyPattern(USet *set, 429 const UChar *pattern, int32_t patternLength, 430 uint32_t options, 431 UErrorCode *status); 432 433 /** 434 * Modifies the set to contain those code points which have the given value 435 * for the given binary or enumerated property, as returned by 436 * u_getIntPropertyValue. Prior contents of this set are lost. 437 * A frozen set will not be modified. 438 * 439 * @param set the object to contain the code points defined by the property 440 * 441 * @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1 442 * or UCHAR_INT_START..UCHAR_INT_LIMIT-1 443 * or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1. 444 * 445 * @param value a value in the range u_getIntPropertyMinValue(prop).. 446 * u_getIntPropertyMaxValue(prop), with one exception. If prop is 447 * UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but 448 * rather a mask value produced by U_GET_GC_MASK(). This allows grouped 449 * categories such as [:L:] to be represented. 450 * 451 * @param ec error code input/output parameter 452 * 453 * @stable ICU 3.2 454 */ 455 U_CAPI void U_EXPORT2 456 uset_applyIntPropertyValue(USet* set, 457 UProperty prop, int32_t value, UErrorCode* ec); 458 459 /** 460 * Modifies the set to contain those code points which have the 461 * given value for the given property. Prior contents of this 462 * set are lost. 463 * A frozen set will not be modified. 464 * 465 * @param set the object to contain the code points defined by the given 466 * property and value alias 467 * 468 * @param prop a string specifying a property alias, either short or long. 469 * The name is matched loosely. 
See PropertyAliases.txt for names and a 470 * description of loose matching. If the value string is empty, then this 471 * string is interpreted as either a General_Category value alias, a Script 472 * value alias, a binary property alias, or a special ID. Special IDs are 473 * matched loosely and correspond to the following sets: 474 * 475 * "ANY" = [\\u0000-\\U0010FFFF], 476 * "ASCII" = [\\u0000-\\u007F], 477 * "Assigned" = [:^Cn:]. 478 * 479 * @param propLength the length of the prop, or -1 if NUL-terminated 480 * 481 * @param value a string specifying a value alias, either short or long. 482 * The name is matched loosely. See PropertyValueAliases.txt for names 483 * and a description of loose matching. In addition to aliases listed, 484 * numeric values and canonical combining classes may be expressed 485 * numerically, e.g., ("nv", "0.5") or ("ccc", "220"). The value string 486 * may also be empty. 487 * 488 * @param valueLength the length of the value, or -1 if NUL-terminated 489 * 490 * @param ec error code input/output parameter 491 * 492 * @stable ICU 3.2 493 */ 494 U_CAPI void U_EXPORT2 495 uset_applyPropertyAlias(USet* set, 496 const UChar *prop, int32_t propLength, 497 const UChar *value, int32_t valueLength, 498 UErrorCode* ec); 499 500 /** 501 * Return true if the given position, in the given pattern, appears 502 * to be the start of a UnicodeSet pattern. 503 * 504 * @param pattern a string specifying the pattern 505 * @param patternLength the length of the pattern, or -1 if NUL-terminated 506 * @param pos the given position 507 * @stable ICU 3.2 508 */ 509 U_CAPI UBool U_EXPORT2 510 uset_resemblesPattern(const UChar *pattern, int32_t patternLength, 511 int32_t pos); 512 513 /** 514 * Returns a string representation of this set. If the result of 515 * calling this function is passed to a uset_openPattern(), it 516 * will produce another set that is equal to this one. 
517 * @param set the set 518 * @param result the string to receive the rules, may be NULL 519 * @param resultCapacity the capacity of result, may be 0 if result is NULL 520 * @param escapeUnprintable if true then convert unprintable 521 * characters to their hex escape representations, \\uxxxx or 522 * \\Uxxxxxxxx. Unprintable characters are those other than 523 * U+000A, U+0020..U+007E. 524 * @param ec error code. 525 * @return length of string, possibly larger than resultCapacity 526 * @stable ICU 2.4 527 */ 528 U_CAPI int32_t U_EXPORT2 529 uset_toPattern(const USet* set, 530 UChar* result, int32_t resultCapacity, 531 UBool escapeUnprintable, 532 UErrorCode* ec); 533 534 /** 535 * Adds the given character to the given USet. After this call, 536 * uset_contains(set, c) will return true. 537 * A frozen set will not be modified. 538 * @param set the object to which to add the character 539 * @param c the character to add 540 * @stable ICU 2.4 541 */ 542 U_CAPI void U_EXPORT2 543 uset_add(USet* set, UChar32 c); 544 545 /** 546 * Adds all of the elements in the specified set to this set if 547 * they're not already present. This operation effectively 548 * modifies this set so that its value is the <i>union</i> of the two 549 * sets. The behavior of this operation is unspecified if the specified 550 * collection is modified while the operation is in progress. 551 * A frozen set will not be modified. 552 * 553 * @param set the object to which to add the set 554 * @param additionalSet the source set whose elements are to be added to this set. 555 * @stable ICU 2.6 556 */ 557 U_CAPI void U_EXPORT2 558 uset_addAll(USet* set, const USet *additionalSet); 559 560 /** 561 * Adds the given range of characters to the given USet. After this call, 562 * uset_containsRange(set, start, end) will return true. 563 * A frozen set will not be modified. 
564 * @param set the object to which to add the range 565 * @param start the first character of the range to add, inclusive 566 * @param end the last character of the range to add, inclusive 567 * @stable ICU 2.2 568 */ 569 U_CAPI void U_EXPORT2 570 uset_addRange(USet* set, UChar32 start, UChar32 end); 571 572 /** 573 * Adds the given string to the given USet. After this call, 574 * uset_containsString(set, str, strLen) will return true. 575 * A frozen set will not be modified. 576 * @param set the object to which to add the string 577 * @param str the string to add 578 * @param strLen the length of the string or -1 if null terminated. 579 * @stable ICU 2.4 580 */ 581 U_CAPI void U_EXPORT2 582 uset_addString(USet* set, const UChar* str, int32_t strLen); 583 584 /** 585 * Adds each of the characters in this string to the set. Note: "ch" => {"c", "h"} 586 * If this set already contains any particular character, it has no effect on that character. 587 * A frozen set will not be modified. 588 * @param set the object to which to add the characters 589 * @param str the source string 590 * @param strLen the length of the string or -1 if null terminated. 591 * @stable ICU 3.4 592 */ 593 U_CAPI void U_EXPORT2 594 uset_addAllCodePoints(USet* set, const UChar *str, int32_t strLen); 595 596 /** 597 * Removes the given character from the given USet. After this call, 598 * uset_contains(set, c) will return false. 599 * A frozen set will not be modified. 600 * @param set the object from which to remove the character 601 * @param c the character to remove 602 * @stable ICU 2.4 603 */ 604 U_CAPI void U_EXPORT2 605 uset_remove(USet* set, UChar32 c); 606 607 /** 608 * Removes the given range of characters from the given USet. After this call, 609 * uset_containsRange(set, start, end) will return false. 610 * A frozen set will not be modified. 
611 * @param set the object from which to remove the range 612 * @param start the first character of the range to remove, inclusive 613 * @param end the last character of the range to remove, inclusive 614 * @stable ICU 2.2 615 */ 616 U_CAPI void U_EXPORT2 617 uset_removeRange(USet* set, UChar32 start, UChar32 end); 618 619 /** 620 * Removes the given string from the given USet. After this call, 621 * uset_containsString(set, str, strLen) will return false. 622 * A frozen set will not be modified. 623 * @param set the object from which to remove the string 624 * @param str the string to remove 625 * @param strLen the length of the string or -1 if null terminated. 626 * @stable ICU 2.4 627 */ 628 U_CAPI void U_EXPORT2 629 uset_removeString(USet* set, const UChar* str, int32_t strLen); 630 631 #ifndef U_HIDE_DRAFT_API 632 /** 633 * Removes EACH of the characters in this string. Note: "ch" == {"c", "h"} 634 * A frozen set will not be modified. 635 * 636 * @param set the object to be modified 637 * @param str the string 638 * @param length the length of the string, or -1 if NUL-terminated 639 * @draft ICU 69 640 */ 641 U_CAPI void U_EXPORT2 642 uset_removeAllCodePoints(USet *set, const UChar *str, int32_t length); 643 #endif // U_HIDE_DRAFT_API 644 645 /** 646 * Removes from this set all of its elements that are contained in the 647 * specified set. This operation effectively modifies this 648 * set so that its value is the <i>asymmetric set difference</i> of 649 * the two sets. 650 * A frozen set will not be modified. 651 * @param set the object from which the elements are to be removed 652 * @param removeSet the object that defines which elements will be 653 * removed from this set 654 * @stable ICU 3.2 655 */ 656 U_CAPI void U_EXPORT2 657 uset_removeAll(USet* set, const USet* removeSet); 658 659 /** 660 * Retain only the elements in this set that are contained in the 661 * specified range. 
If <code>start > end</code> then an empty range is 662 * retained, leaving the set empty. This is equivalent to 663 * a boolean logic AND, or a set INTERSECTION. 664 * A frozen set will not be modified. 665 * 666 * @param set the object for which to retain only the specified range 667 * @param start first character, inclusive, of range 668 * @param end last character, inclusive, of range 669 * @stable ICU 3.2 670 */ 671 U_CAPI void U_EXPORT2 672 uset_retain(USet* set, UChar32 start, UChar32 end); 673 674 #ifndef U_HIDE_DRAFT_API 675 /** 676 * Retains only the specified string from this set if it is present. 677 * Upon return this set will be empty if it did not contain str, or 678 * will only contain str if it did contain str. 679 * A frozen set will not be modified. 680 * 681 * @param set the object to be modified 682 * @param str the string 683 * @param length the length of the string, or -1 if NUL-terminated 684 * @draft ICU 69 685 */ 686 U_CAPI void U_EXPORT2 687 uset_retainString(USet *set, const UChar *str, int32_t length); 688 689 /** 690 * Retains EACH of the characters in this string. Note: "ch" == {"c", "h"} 691 * A frozen set will not be modified. 692 * 693 * @param set the object to be modified 694 * @param str the string 695 * @param length the length of the string, or -1 if NUL-terminated 696 * @draft ICU 69 697 */ 698 U_CAPI void U_EXPORT2 699 uset_retainAllCodePoints(USet *set, const UChar *str, int32_t length); 700 #endif // U_HIDE_DRAFT_API 701 702 /** 703 * Retains only the elements in this set that are contained in the 704 * specified set. In other words, removes from this set all of 705 * its elements that are not contained in the specified set. This 706 * operation effectively modifies this set so that its value is 707 * the <i>intersection</i> of the two sets. 708 * A frozen set will not be modified. 
709 * 710 * @param set the object on which to perform the retain 711 * @param retain set that defines which elements this set will retain 712 * @stable ICU 3.2 713 */ 714 U_CAPI void U_EXPORT2 715 uset_retainAll(USet* set, const USet* retain); 716 717 /** 718 * Reallocate this object's internal structures to take up the least 719 * possible space, without changing this object's value. 720 * A frozen set will not be modified. 721 * 722 * @param set the object on which to perform the compact 723 * @stable ICU 3.2 724 */ 725 U_CAPI void U_EXPORT2 726 uset_compact(USet* set); 727 728 /** 729 * This is equivalent to 730 * <code>uset_complementRange(set, 0, 0x10FFFF)</code>. 731 * 732 * <strong>Note:</strong> This performs a symmetric difference with all code points 733 * <em>and thus retains all multicharacter strings</em>. 734 * In order to achieve a “code point complement” (all code points minus this set), 735 * the easiest is to <code>uset_complement(set); uset_removeAllStrings(set);</code>. 736 * 737 * A frozen set will not be modified. 738 * @param set the set 739 * @stable ICU 2.4 740 */ 741 U_CAPI void U_EXPORT2 742 uset_complement(USet* set); 743 744 #ifndef U_HIDE_DRAFT_API 745 /** 746 * Complements the specified range in this set. Any character in 747 * the range will be removed if it is in this set, or will be 748 * added if it is not in this set. If <code>start > end</code> 749 * then an empty range is complemented, leaving the set unchanged. 750 * This is equivalent to a boolean logic XOR. 751 * A frozen set will not be modified. 752 * 753 * @param set the object to be modified 754 * @param start first character, inclusive, of range 755 * @param end last character, inclusive, of range 756 * @draft ICU 69 757 */ 758 U_CAPI void U_EXPORT2 759 uset_complementRange(USet *set, UChar32 start, UChar32 end); 760 761 /** 762 * Complements the specified string in this set. 
763 * The string will be removed if it is in this set, or will be added if it is not in this set. 764 * A frozen set will not be modified. 765 * 766 * @param set the object to be modified 767 * @param str the string 768 * @param length the length of the string, or -1 if NUL-terminated 769 * @draft ICU 69 770 */ 771 U_CAPI void U_EXPORT2 772 uset_complementString(USet *set, const UChar *str, int32_t length); 773 774 /** 775 * Complements EACH of the characters in this string. Note: "ch" == {"c", "h"} 776 * A frozen set will not be modified. 777 * 778 * @param set the object to be modified 779 * @param str the string 780 * @param length the length of the string, or -1 if NUL-terminated 781 * @draft ICU 69 782 */ 783 U_CAPI void U_EXPORT2 784 uset_complementAllCodePoints(USet *set, const UChar *str, int32_t length); 785 #endif // U_HIDE_DRAFT_API 786 787 /** 788 * Complements in this set all elements contained in the specified 789 * set. Any character in the other set will be removed if it is 790 * in this set, or will be added if it is not in this set. 791 * A frozen set will not be modified. 792 * 793 * @param set the set with which to complement 794 * @param complement set that defines which elements will be xor'ed 795 * from this set. 796 * @stable ICU 3.2 797 */ 798 U_CAPI void U_EXPORT2 799 uset_complementAll(USet* set, const USet* complement); 800 801 /** 802 * Removes all of the elements from this set. This set will be 803 * empty after this call returns. 804 * A frozen set will not be modified. 805 * @param set the set 806 * @stable ICU 2.4 807 */ 808 U_CAPI void U_EXPORT2 809 uset_clear(USet* set); 810 811 /** 812 * Close this set over the given attribute. For the attribute 813 * USET_CASE_INSENSITIVE, the result is to modify this set so that: 814 * 815 * 1. For each character or string 'a' in this set, all strings or 816 * characters 'b' such that foldCase(a) == foldCase(b) are added 817 * to this set. 818 * 819 * 2. 
For each string 'e' in the resulting set, if e != 820 * foldCase(e), 'e' will be removed. 821 * 822 * Example: [aq\\u00DF{Bc}{bC}{Fi}] => [aAqQ\\u00DF\\uFB01{ss}{bc}{fi}] 823 * 824 * (Here foldCase(x) refers to the operation u_strFoldCase, and a 825 * == b denotes that the contents are the same, not pointer 826 * comparison.) 827 * 828 * A frozen set will not be modified. 829 * 830 * @param set the set 831 * 832 * @param attributes bitmask for attributes to close over. 833 * Currently only the USET_CASE_INSENSITIVE bit is supported. Any undefined bits 834 * are ignored. 835 * @stable ICU 4.2 836 */ 837 U_CAPI void U_EXPORT2 838 uset_closeOver(USet* set, int32_t attributes); 839 840 /** 841 * Remove all strings from this set. 842 * 843 * @param set the set 844 * @stable ICU 4.2 845 */ 846 U_CAPI void U_EXPORT2 847 uset_removeAllStrings(USet* set); 848 849 /** 850 * Returns true if the given USet contains no characters and no 851 * strings. 852 * @param set the set 853 * @return true if set is empty 854 * @stable ICU 2.4 855 */ 856 U_CAPI UBool U_EXPORT2 857 uset_isEmpty(const USet* set); 858 859 #ifndef U_HIDE_DRAFT_API 860 /** 861 * @param set the set 862 * @return true if this set contains multi-character strings or the empty string. 863 * @draft ICU 70 864 */ 865 U_CAPI UBool U_EXPORT2 866 uset_hasStrings(const USet *set); 867 #endif // U_HIDE_DRAFT_API 868 869 /** 870 * Returns true if the given USet contains the given character. 871 * This function works faster with a frozen set. 872 * @param set the set 873 * @param c The codepoint to check for within the set 874 * @return true if set contains c 875 * @stable ICU 2.4 876 */ 877 U_CAPI UBool U_EXPORT2 878 uset_contains(const USet* set, UChar32 c); 879 880 /** 881 * Returns true if the given USet contains all characters c 882 * where start <= c && c <= end. 
883 * @param set the set 884 * @param start the first character of the range to test, inclusive 885 * @param end the last character of the range to test, inclusive 886 * @return true if set contains the range 887 * @stable ICU 2.2 888 */ 889 U_CAPI UBool U_EXPORT2 890 uset_containsRange(const USet* set, UChar32 start, UChar32 end); 891 892 /** 893 * Returns true if the given USet contains the given string. 894 * @param set the set 895 * @param str the string 896 * @param strLen the length of the string or -1 if null terminated. 897 * @return true if set contains str 898 * @stable ICU 2.4 899 */ 900 U_CAPI UBool U_EXPORT2 901 uset_containsString(const USet* set, const UChar* str, int32_t strLen); 902 903 /** 904 * Returns the index of the given character within this set, where 905 * the set is ordered by ascending code point. If the character 906 * is not in this set, return -1. The inverse of this method is 907 * <code>charAt()</code>. 908 * @param set the set 909 * @param c the character to obtain the index for 910 * @return an index from 0..size()-1, or -1 911 * @stable ICU 3.2 912 */ 913 U_CAPI int32_t U_EXPORT2 914 uset_indexOf(const USet* set, UChar32 c); 915 916 /** 917 * Returns the character at the given index within this set, where 918 * the set is ordered by ascending code point. If the index is 919 * out of range for characters, returns (UChar32)-1. 920 * The inverse of this method is <code>indexOf()</code>. 921 * 922 * For iteration, this is slower than uset_getRangeCount()/uset_getItemCount() 923 * with uset_getItem(), because for each call it skips linearly over <code>index</code> 924 * characters in the ranges. 925 * 926 * @param set the set 927 * @param charIndex an index from 0..size()-1 to obtain the char for 928 * @return the character at the given index, or (UChar32)-1. 
929 * @stable ICU 3.2 930 */ 931 U_CAPI UChar32 U_EXPORT2 932 uset_charAt(const USet* set, int32_t charIndex); 933 934 /** 935 * Returns the number of characters and strings contained in this set. 936 * The last (uset_getItemCount() - uset_getRangeCount()) items are strings. 937 * 938 * This is slower than uset_getRangeCount() and uset_getItemCount() because 939 * it counts the code points of all ranges. 940 * 941 * @param set the set 942 * @return a non-negative integer counting the characters and strings 943 * contained in set 944 * @stable ICU 2.4 945 * @see uset_getRangeCount 946 */ 947 U_CAPI int32_t U_EXPORT2 948 uset_size(const USet* set); 949 950 #ifndef U_HIDE_DRAFT_API 951 /** 952 * @param set the set 953 * @return the number of ranges in this set. 954 * @draft ICU 70 955 * @see uset_getItemCount 956 * @see uset_getItem 957 * @see uset_size 958 */ 959 U_CAPI int32_t U_EXPORT2 960 uset_getRangeCount(const USet *set); 961 #endif // U_HIDE_DRAFT_API 962 963 /** 964 * Returns the number of items in this set. An item is either a range 965 * of characters or a single multicharacter string. 966 * @param set the set 967 * @return a non-negative integer counting the character ranges 968 * and/or strings contained in set 969 * @stable ICU 2.4 970 */ 971 U_CAPI int32_t U_EXPORT2 972 uset_getItemCount(const USet* set); 973 974 /** 975 * Returns an item of this set. An item is either a range of 976 * characters or a single multicharacter string (which can be the empty string). 977 * 978 * If <code>itemIndex</code> is less than uset_getRangeCount(), then this function returns 0, 979 * and the range is <code>*start</code>..<code>*end</code>. 980 * 981 * If <code>itemIndex</code> is at least uset_getRangeCount() and less than uset_getItemCount(), then 982 * this function copies the string into <code>str[strCapacity]</code> and 983 * returns the length of the string (0 for the empty string). 
984 * 985 * If <code>itemIndex</code> is out of range, then this function returns -1. 986 * 987 * Note that 0 is returned for each range as well as for the empty string. 988 * 989 * @param set the set 990 * @param itemIndex a non-negative integer in the range 0..uset_getItemCount(set)-1 991 * @param start pointer to variable to receive first character in range, inclusive; 992 * can be NULL for a string item 993 * @param end pointer to variable to receive last character in range, inclusive; 994 * can be NULL for a string item 995 * @param str buffer to receive the string, may be NULL 996 * @param strCapacity capacity of str, or 0 if str is NULL 997 * @param ec error code; U_INDEX_OUTOFBOUNDS_ERROR if the itemIndex is out of range 998 * @return the length of the string (0 or >= 2), or 0 if the item is a range, 999 * or -1 if the itemIndex is out of range 1000 * @stable ICU 2.4 1001 */ 1002 U_CAPI int32_t U_EXPORT2 1003 uset_getItem(const USet* set, int32_t itemIndex, 1004 UChar32* start, UChar32* end, 1005 UChar* str, int32_t strCapacity, 1006 UErrorCode* ec); 1007 1008 /** 1009 * Returns true if set1 contains all the characters and strings 1010 * of set2. It answers the question, 'Is set1 a superset of set2?' 1011 * @param set1 set to be checked for containment 1012 * @param set2 set to be checked for containment 1013 * @return true if the test condition is met 1014 * @stable ICU 3.2 1015 */ 1016 U_CAPI UBool U_EXPORT2 1017 uset_containsAll(const USet* set1, const USet* set2); 1018 1019 /** 1020 * Returns true if this set contains all the characters 1021 * of the given string. This does not check containment of grapheme 1022 * clusters, like uset_containsString. 1023 * @param set set of characters to be checked for containment 1024 * @param str string containing codepoints to be checked for containment 1025 * @param strLen the length of the string or -1 if null terminated. 
1026 * @return true if the test condition is met 1027 * @stable ICU 3.4 1028 */ 1029 U_CAPI UBool U_EXPORT2 1030 uset_containsAllCodePoints(const USet* set, const UChar *str, int32_t strLen); 1031 1032 /** 1033 * Returns true if set1 contains none of the characters and strings 1034 * of set2. It answers the question, 'Is set1 disjoint from set2?' 1035 * @param set1 set to be checked for containment 1036 * @param set2 set to be checked for containment 1037 * @return true if the test condition is met 1038 * @stable ICU 3.2 1039 */ 1040 U_CAPI UBool U_EXPORT2 1041 uset_containsNone(const USet* set1, const USet* set2); 1042 1043 /** 1044 * Returns true if set1 contains some of the characters and strings 1045 * of set2. It answers the question, 'Do set1 and set2 have an intersection?' 1046 * @param set1 set to be checked for containment 1047 * @param set2 set to be checked for containment 1048 * @return true if the test condition is met 1049 * @stable ICU 3.2 1050 */ 1051 U_CAPI UBool U_EXPORT2 1052 uset_containsSome(const USet* set1, const USet* set2); 1053 1054 /** 1055 * Returns the length of the initial substring of the input string which 1056 * consists only of characters and strings that are contained in this set 1057 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 1058 * or only of characters and strings that are not contained 1059 * in this set (USET_SPAN_NOT_CONTAINED). 1060 * See USetSpanCondition for details. 1061 * Similar to the strspn() C library function. 1062 * Unpaired surrogates are treated according to contains() of their surrogate code points. 1063 * This function works faster with a frozen set and with a non-negative string length argument. 
1064 * @param set the set 1065 * @param s start of the string 1066 * @param length of the string; can be -1 for NUL-terminated 1067 * @param spanCondition specifies the containment condition 1068 * @return the length of the initial substring according to the spanCondition; 1069 * 0 if the start of the string does not fit the spanCondition 1070 * @stable ICU 3.8 1071 * @see USetSpanCondition 1072 */ 1073 U_CAPI int32_t U_EXPORT2 1074 uset_span(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition); 1075 1076 /** 1077 * Returns the start of the trailing substring of the input string which 1078 * consists only of characters and strings that are contained in this set 1079 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 1080 * or only of characters and strings that are not contained 1081 * in this set (USET_SPAN_NOT_CONTAINED). 1082 * See USetSpanCondition for details. 1083 * Unpaired surrogates are treated according to contains() of their surrogate code points. 1084 * This function works faster with a frozen set and with a non-negative string length argument. 1085 * @param set the set 1086 * @param s start of the string 1087 * @param length of the string; can be -1 for NUL-terminated 1088 * @param spanCondition specifies the containment condition 1089 * @return the start of the trailing substring according to the spanCondition; 1090 * the string length if the end of the string does not fit the spanCondition 1091 * @stable ICU 3.8 1092 * @see USetSpanCondition 1093 */ 1094 U_CAPI int32_t U_EXPORT2 1095 uset_spanBack(const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition); 1096 1097 /** 1098 * Returns the length of the initial substring of the input string which 1099 * consists only of characters and strings that are contained in this set 1100 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 1101 * or only of characters and strings that are not contained 1102 * in this set (USET_SPAN_NOT_CONTAINED). 
1103 * See USetSpanCondition for details. 1104 * Similar to the strspn() C library function. 1105 * Malformed byte sequences are treated according to contains(0xfffd). 1106 * This function works faster with a frozen set and with a non-negative string length argument. 1107 * @param set the set 1108 * @param s start of the string (UTF-8) 1109 * @param length of the string; can be -1 for NUL-terminated 1110 * @param spanCondition specifies the containment condition 1111 * @return the length of the initial substring according to the spanCondition; 1112 * 0 if the start of the string does not fit the spanCondition 1113 * @stable ICU 3.8 1114 * @see USetSpanCondition 1115 */ 1116 U_CAPI int32_t U_EXPORT2 1117 uset_spanUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition); 1118 1119 /** 1120 * Returns the start of the trailing substring of the input string which 1121 * consists only of characters and strings that are contained in this set 1122 * (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), 1123 * or only of characters and strings that are not contained 1124 * in this set (USET_SPAN_NOT_CONTAINED). 1125 * See USetSpanCondition for details. 1126 * Malformed byte sequences are treated according to contains(0xfffd). 1127 * This function works faster with a frozen set and with a non-negative string length argument. 
1128 * @param set the set 1129 * @param s start of the string (UTF-8) 1130 * @param length of the string; can be -1 for NUL-terminated 1131 * @param spanCondition specifies the containment condition 1132 * @return the start of the trailing substring according to the spanCondition; 1133 * the string length if the end of the string does not fit the spanCondition 1134 * @stable ICU 3.8 1135 * @see USetSpanCondition 1136 */ 1137 U_CAPI int32_t U_EXPORT2 1138 uset_spanBackUTF8(const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition); 1139 1140 /** 1141 * Returns true if set1 contains all of the characters and strings 1142 * of set2, and vice versa. It answers the question, 'Is set1 equal to set2?' 1143 * @param set1 set to be checked for containment 1144 * @param set2 set to be checked for containment 1145 * @return true if the test condition is met 1146 * @stable ICU 3.2 1147 */ 1148 U_CAPI UBool U_EXPORT2 1149 uset_equals(const USet* set1, const USet* set2); 1150 1151 /********************************************************************* 1152 * Serialized set API 1153 *********************************************************************/ 1154 1155 /** 1156 * Serializes this set into an array of 16-bit integers. Serialization 1157 * (currently) only records the characters in the set; multicharacter 1158 * strings are ignored. 1159 * 1160 * The array 1161 * has the following format (each line is one 16-bit integer): 1162 * 1163 * length = (n+2*m) | (m!=0?0x8000:0) 1164 * bmpLength = n; present if m!=0 1165 * bmp[0] 1166 * bmp[1] 1167 * ... 1168 * bmp[n-1] 1169 * supp-high[0] 1170 * supp-low[0] 1171 * supp-high[1] 1172 * supp-low[1] 1173 * ... 1174 * supp-high[m-1] 1175 * supp-low[m-1] 1176 * 1177 * The array starts with a header. After the header are n bmp 1178 * code points, then m supplementary code points. Either n or m 1179 * or both may be zero. n+2*m is always <= 0x7FFF. 
1180 * 1181 * If there are no supplementary characters (if m==0) then the 1182 * header is one 16-bit integer, 'length', with value n. 1183 * 1184 * If there are supplementary characters (if m!=0) then the header 1185 * is two 16-bit integers. The first, 'length', has value 1186 * (n+2*m)|0x8000. The second, 'bmpLength', has value n. 1187 * 1188 * After the header the code points are stored in ascending order. 1189 * Supplementary code points are stored as most significant 16 1190 * bits followed by least significant 16 bits. 1191 * 1192 * @param set the set 1193 * @param dest pointer to buffer of destCapacity 16-bit integers. 1194 * May be NULL only if destCapacity is zero. 1195 * @param destCapacity size of dest, or zero. Must not be negative. 1196 * @param pErrorCode pointer to the error code. Will be set to 1197 * U_INDEX_OUTOFBOUNDS_ERROR if n+2*m > 0x7FFF. Will be set to 1198 * U_BUFFER_OVERFLOW_ERROR if n+2*m+(m!=0?2:1) > destCapacity. 1199 * @return the total length of the serialized format, including 1200 * the header, that is, n+2*m+(m!=0?2:1), or 0 on error other 1201 * than U_BUFFER_OVERFLOW_ERROR. 1202 * @stable ICU 2.4 1203 */ 1204 U_CAPI int32_t U_EXPORT2 1205 uset_serialize(const USet* set, uint16_t* dest, int32_t destCapacity, UErrorCode* pErrorCode); 1206 1207 /** 1208 * Given a serialized array, fill in the given serialized set object. 1209 * @param fillSet pointer to result 1210 * @param src pointer to start of array 1211 * @param srcLength length of array 1212 * @return true if the given array is valid, otherwise false 1213 * @stable ICU 2.4 1214 */ 1215 U_CAPI UBool U_EXPORT2 1216 uset_getSerializedSet(USerializedSet* fillSet, const uint16_t* src, int32_t srcLength); 1217 1218 /** 1219 * Set the USerializedSet to contain the given character (and nothing 1220 * else). 
1221 * @param fillSet pointer to result 1222 * @param c The codepoint to set 1223 * @stable ICU 2.4 1224 */ 1225 U_CAPI void U_EXPORT2 1226 uset_setSerializedToOne(USerializedSet* fillSet, UChar32 c); 1227 1228 /** 1229 * Returns true if the given USerializedSet contains the given 1230 * character. 1231 * @param set the serialized set 1232 * @param c The codepoint to check for within the set 1233 * @return true if set contains c 1234 * @stable ICU 2.4 1235 */ 1236 U_CAPI UBool U_EXPORT2 1237 uset_serializedContains(const USerializedSet* set, UChar32 c); 1238 1239 /** 1240 * Returns the number of disjoint ranges of characters contained in 1241 * the given serialized set. Ignores any strings contained in the 1242 * set. 1243 * @param set the serialized set 1244 * @return a non-negative integer counting the character ranges 1245 * contained in set 1246 * @stable ICU 2.4 1247 */ 1248 U_CAPI int32_t U_EXPORT2 1249 uset_getSerializedRangeCount(const USerializedSet* set); 1250 1251 /** 1252 * Returns a range of characters contained in the given serialized 1253 * set. 1254 * @param set the serialized set 1255 * @param rangeIndex a non-negative integer in the range 0.. 1256 * uset_getSerializedRangeCount(set)-1 1257 * @param pStart pointer to variable to receive first character 1258 * in range, inclusive 1259 * @param pEnd pointer to variable to receive last character in range, 1260 * inclusive 1261 * @return true if rangeIndex is valid, otherwise false 1262 * @stable ICU 2.4 1263 */ 1264 U_CAPI UBool U_EXPORT2 1265 uset_getSerializedRange(const USerializedSet* set, int32_t rangeIndex, 1266 UChar32* pStart, UChar32* pEnd); 1267 1268 #endif 1269