1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2002-2004, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: uiter.h 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2002jan18 14 * created by: Markus W. Scherer 15 */ 16 17 #ifndef __UITER_H__ 18 #define __UITER_H__ 19 20 /** 21 * \file 22 * \brief C API: Unicode Character Iteration 23 * 24 * @see UCharIterator 25 */ 26 27 #include "unicode/utypes.h" 28 29 #ifdef XP_CPLUSPLUS 30 U_NAMESPACE_BEGIN 31 32 class CharacterIterator; 33 class Replaceable; 34 35 U_NAMESPACE_END 36 #endif 37 38 U_CDECL_BEGIN 39 40 struct UCharIterator; 41 typedef struct UCharIterator UCharIterator; /**< C typedef for struct UCharIterator. @stable ICU 2.1 */ 42 43 /** 44 * Origin constants for UCharIterator.getIndex() and UCharIterator.move(). 45 * @see UCharIteratorMove 46 * @see UCharIterator 47 * @stable ICU 2.1 48 */ 49 typedef enum UCharIteratorOrigin { 50 UITER_START, UITER_CURRENT, UITER_LIMIT, UITER_ZERO, UITER_LENGTH 51 } UCharIteratorOrigin; 52 53 /** Constants for UCharIterator. @stable ICU 2.6 */ 54 enum { 55 /** 56 * Constant value that may be returned by UCharIteratorMove 57 * indicating that the final UTF-16 index is not known, but that the move succeeded. 58 * This can occur when moving relative to limit or length, or 59 * when moving relative to the current index after a setState() 60 * when the current UTF-16 index is not known. 61 * 62 * It would be very inefficient to have to count from the beginning of the text 63 * just to get the current/limit/length index after moving relative to it. 64 * The actual index can be determined with getIndex(UITER_CURRENT) 65 * which will count the UChars if necessary. 66 * 67 * @stable ICU 2.6 68 */ 69 UITER_UNKNOWN_INDEX=-2 70 }; 71 72 73 /** 74 * Constant for UCharIterator getState() indicating an error or 75 * an unknown state. 76 * Returned by uiter_getState()/UCharIteratorGetState 77 * when an error occurs. 78 * Also, some UCharIterator implementations may not be able to return 79 * a valid state for each position. This will be clearly documented 80 * for each such iterator (none of the public ones here). 81 * 82 * @stable ICU 2.6 83 */ 84 #define UITER_NO_STATE ((uint32_t)0xffffffff) 85 86 /** 87 * Function type declaration for UCharIterator.getIndex(). 88 * 89 * Gets the current position, or the start or limit of the 90 * iteration range. 91 * 92 * This function may perform slowly for UITER_CURRENT after setState() was called, 93 * or for UITER_LENGTH, because an iterator implementation may have to count 94 * UChars if the underlying storage is not UTF-16. 95 * 96 * @param iter the UCharIterator structure ("this pointer") 97 * @param origin get the 0, start, limit, length, or current index 98 * @return the requested index, or U_SENTINEL in an error condition 99 * 100 * @see UCharIteratorOrigin 101 * @see UCharIterator 102 * @stable ICU 2.1 103 */ 104 typedef int32_t U_CALLCONV 105 UCharIteratorGetIndex(UCharIterator *iter, UCharIteratorOrigin origin); 106 107 /** 108 * Function type declaration for UCharIterator.move(). 109 * 110 * Use iter->move(iter, index, UITER_ZERO) like CharacterIterator::setIndex(index). 111 * 112 * Moves the current position relative to the start or limit of the 113 * iteration range, or relative to the current position itself. 114 * The movement is expressed in numbers of code units forward 115 * or backward by specifying a positive or negative delta. 116 * Out of bounds movement will be pinned to the start or limit. 117 * 118 * This function may perform slowly for moving relative to UITER_LENGTH 119 * because an iterator implementation may have to count the rest of the 120 * UChars if the native storage is not UTF-16. 121 * 122 * When moving relative to the limit or length, or 123 * relative to the current position after setState() was called, 124 * move() may return UITER_UNKNOWN_INDEX (-2) to avoid an inefficient 125 * determination of the actual UTF-16 index. 126 * The actual index can be determined with getIndex(UITER_CURRENT) 127 * which will count the UChars if necessary. 128 * See UITER_UNKNOWN_INDEX for details. 129 * 130 * @param iter the UCharIterator structure ("this pointer") 131 * @param delta can be positive, zero, or negative 132 * @param origin move relative to the 0, start, limit, length, or current index 133 * @return the new index, or U_SENTINEL on an error condition, 134 * or UITER_UNKNOWN_INDEX when the index is not known. 135 * 136 * @see UCharIteratorOrigin 137 * @see UCharIterator 138 * @see UITER_UNKNOWN_INDEX 139 * @stable ICU 2.1 140 */ 141 typedef int32_t U_CALLCONV 142 UCharIteratorMove(UCharIterator *iter, int32_t delta, UCharIteratorOrigin origin); 143 144 /** 145 * Function type declaration for UCharIterator.hasNext(). 146 * 147 * Check if current() and next() can still 148 * return another code unit. 149 * 150 * @param iter the UCharIterator structure ("this pointer") 151 * @return boolean value for whether current() and next() can still return another code unit 152 * 153 * @see UCharIterator 154 * @stable ICU 2.1 155 */ 156 typedef UBool U_CALLCONV 157 UCharIteratorHasNext(UCharIterator *iter); 158 159 /** 160 * Function type declaration for UCharIterator.hasPrevious(). 161 * 162 * Check if previous() can still return another code unit. 163 * 164 * @param iter the UCharIterator structure ("this pointer") 165 * @return boolean value for whether previous() can still return another code unit 166 * 167 * @see UCharIterator 168 * @stable ICU 2.1 169 */ 170 typedef UBool U_CALLCONV 171 UCharIteratorHasPrevious(UCharIterator *iter); 172 173 /** 174 * Function type declaration for UCharIterator.current(). 175 * 176 * Return the code unit at the current position, 177 * or U_SENTINEL if there is none (index is at the limit). 178 * 179 * @param iter the UCharIterator structure ("this pointer") 180 * @return the current code unit 181 * 182 * @see UCharIterator 183 * @stable ICU 2.1 184 */ 185 typedef UChar32 U_CALLCONV 186 UCharIteratorCurrent(UCharIterator *iter); 187 188 /** 189 * Function type declaration for UCharIterator.next(). 190 * 191 * Return the code unit at the current index and increment 192 * the index (post-increment, like s[i++]), 193 * or return U_SENTINEL if there is none (index is at the limit). 194 * 195 * @param iter the UCharIterator structure ("this pointer") 196 * @return the current code unit (and post-increment the current index) 197 * 198 * @see UCharIterator 199 * @stable ICU 2.1 200 */ 201 typedef UChar32 U_CALLCONV 202 UCharIteratorNext(UCharIterator *iter); 203 204 /** 205 * Function type declaration for UCharIterator.previous(). 206 * 207 * Decrement the index and return the code unit from there 208 * (pre-decrement, like s[--i]), 209 * or return U_SENTINEL if there is none (index is at the start). 210 * 211 * @param iter the UCharIterator structure ("this pointer") 212 * @return the previous code unit (after pre-decrementing the current index) 213 * 214 * @see UCharIterator 215 * @stable ICU 2.1 216 */ 217 typedef UChar32 U_CALLCONV 218 UCharIteratorPrevious(UCharIterator *iter); 219 220 /** 221 * Function type declaration for UCharIterator.reservedFn(). 222 * Reserved for future use. 223 * 224 * @param iter the UCharIterator structure ("this pointer") 225 * @param something some integer argument 226 * @return some integer 227 * 228 * @see UCharIterator 229 * @stable ICU 2.1 230 */ 231 typedef int32_t U_CALLCONV 232 UCharIteratorReserved(UCharIterator *iter, int32_t something); 233 234 /** 235 * Function type declaration for UCharIterator.getState(). 236 * 237 * Get the "state" of the iterator in the form of a single 32-bit word. 238 * It is recommended that the state value be calculated to be as small as 239 * is feasible. For strings with limited lengths, fewer than 32 bits may 240 * be sufficient. 241 * 242 * This is used together with setState()/UCharIteratorSetState 243 * to save and restore the iterator position more efficiently than with 244 * getIndex()/move(). 245 * 246 * The iterator state is defined as a uint32_t value because it is designed 247 * for use in ucol_nextSortKeyPart() which provides 32 bits to store the state 248 * of the character iterator. 249 * 250 * With some UCharIterator implementations (e.g., UTF-8), 251 * getting and setting the UTF-16 index with existing functions 252 * (getIndex(UITER_CURRENT) followed by move(pos, UITER_ZERO)) is possible but 253 * relatively slow because the iterator has to "walk" from a known index 254 * to the requested one. 255 * This takes more time the farther it needs to go. 256 * 257 * An opaque state value allows an iterator implementation to provide 258 * an internal index (UTF-8: the source byte array index) for 259 * fast, constant-time restoration. 260 * 261 * After calling setState(), a getIndex(UITER_CURRENT) may be slow because 262 * the UTF-16 index may not be restored as well, but the iterator can deliver 263 * the correct text contents and move relative to the current position 264 * without performance degradation. 265 * 266 * Some UCharIterator implementations may not be able to return 267 * a valid state for each position, in which case they return UITER_NO_STATE instead. 268 * This will be clearly documented for each such iterator (none of the public ones here). 269 * 270 * @param iter the UCharIterator structure ("this pointer") 271 * @return the state word 272 * 273 * @see UCharIterator 274 * @see UCharIteratorSetState 275 * @see UITER_NO_STATE 276 * @stable ICU 2.6 277 */ 278 typedef uint32_t U_CALLCONV 279 UCharIteratorGetState(const UCharIterator *iter); 280 281 /** 282 * Function type declaration for UCharIterator.setState(). 283 * 284 * Restore the "state" of the iterator using a state word from a getState() call. 285 * The iterator object need not be the same one as for which getState() was called, 286 * but it must be of the same type (set up using the same uiter_setXYZ function) 287 * and it must iterate over the same string 288 * (binary identical regardless of memory address). 289 * For more about the state word see UCharIteratorGetState. 290 * 291 * After calling setState(), a getIndex(UITER_CURRENT) may be slow because 292 * the UTF-16 index may not be restored as well, but the iterator can deliver 293 * the correct text contents and move relative to the current position 294 * without performance degradation. 295 * 296 * @param iter the UCharIterator structure ("this pointer") 297 * @param state the state word from a getState() call 298 * on a same-type, same-string iterator 299 * @param pErrorCode Must be a valid pointer to an error code value, 300 * which must not indicate a failure before the function call. 301 * 302 * @see UCharIterator 303 * @see UCharIteratorGetState 304 * @stable ICU 2.6 305 */ 306 typedef void U_CALLCONV 307 UCharIteratorSetState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode); 308 309 310 /** 311 * C API for code unit iteration. 312 * This can be used as a C wrapper around 313 * CharacterIterator, Replaceable, or implemented using simple strings, etc. 314 * 315 * There are two roles for using UCharIterator: 316 * 317 * A "provider" sets the necessary function pointers and controls the "protected" 318 * fields of the UCharIterator structure. A "provider" passes a UCharIterator 319 * into C APIs that need a UCharIterator as an abstract, flexible string interface. 320 * 321 * Implementations of such C APIs are "callers" of UCharIterator functions; 322 * they only use the "public" function pointers and never access the "protected" 323 * fields directly. 324 * 325 * The current() and next() functions only check the current index against the 326 * limit, and previous() only checks the current index against the start, 327 * to see if the iterator already reached the end of the iteration range. 328 * 329 * The assumption - in all iterators - is that the index is moved via the API, 330 * which means it won't go out of bounds, or the index is modified by 331 * user code that knows enough about the iterator implementation to set valid 332 * index values. 333 * 334 * UCharIterator functions return code unit values 0..0xffff, 335 * or U_SENTINEL if the iteration bounds are reached. 336 * 337 * @stable ICU 2.1 338 */ 339 struct UCharIterator { 340 /** 341 * (protected) Pointer to string or wrapped object or similar. 342 * Not used by caller. 343 * @stable ICU 2.1 344 */ 345 const void *context; 346 347 /** 348 * (protected) Length of string or similar. 349 * Not used by caller. 350 * @stable ICU 2.1 351 */ 352 int32_t length; 353 354 /** 355 * (protected) Start index or similar. 356 * Not used by caller. 357 * @stable ICU 2.1 358 */ 359 int32_t start; 360 361 /** 362 * (protected) Current index or similar. 363 * Not used by caller. 364 * @stable ICU 2.1 365 */ 366 int32_t index; 367 368 /** 369 * (protected) Limit index or similar. 370 * Not used by caller. 371 * @stable ICU 2.1 372 */ 373 int32_t limit; 374 375 /** 376 * (protected) Used by UTF-8 iterators and possibly others. 377 * @stable ICU 2.1 378 */ 379 int32_t reservedField; 380 381 /** 382 * (public) Returns the current position or the 383 * start or limit index of the iteration range. 384 * 385 * @see UCharIteratorGetIndex 386 * @stable ICU 2.1 387 */ 388 UCharIteratorGetIndex *getIndex; 389 390 /** 391 * (public) Moves the current position relative to the start or limit of the 392 * iteration range, or relative to the current position itself. 393 * The movement is expressed in numbers of code units forward 394 * or backward by specifying a positive or negative delta. 395 * 396 * @see UCharIteratorMove 397 * @stable ICU 2.1 398 */ 399 UCharIteratorMove *move; 400 401 /** 402 * (public) Check if current() and next() can still 403 * return another code unit. 404 * 405 * @see UCharIteratorHasNext 406 * @stable ICU 2.1 407 */ 408 UCharIteratorHasNext *hasNext; 409 410 /** 411 * (public) Check if previous() can still return another code unit. 412 * 413 * @see UCharIteratorHasPrevious 414 * @stable ICU 2.1 415 */ 416 UCharIteratorHasPrevious *hasPrevious; 417 418 /** 419 * (public) Return the code unit at the current position, 420 * or U_SENTINEL if there is none (index is at the limit). 421 * 422 * @see UCharIteratorCurrent 423 * @stable ICU 2.1 424 */ 425 UCharIteratorCurrent *current; 426 427 /** 428 * (public) Return the code unit at the current index and increment 429 * the index (post-increment, like s[i++]), 430 * or return U_SENTINEL if there is none (index is at the limit). 431 * 432 * @see UCharIteratorNext 433 * @stable ICU 2.1 434 */ 435 UCharIteratorNext *next; 436 437 /** 438 * (public) Decrement the index and return the code unit from there 439 * (pre-decrement, like s[--i]), 440 * or return U_SENTINEL if there is none (index is at the start). 441 * 442 * @see UCharIteratorPrevious 443 * @stable ICU 2.1 444 */ 445 UCharIteratorPrevious *previous; 446 447 /** 448 * (public) Reserved for future use. Currently NULL. 449 * 450 * @see UCharIteratorReserved 451 * @stable ICU 2.1 452 */ 453 UCharIteratorReserved *reservedFn; 454 455 /** 456 * (public) Return the state of the iterator, to be restored later with setState(). 457 * This function pointer is NULL if the iterator does not implement it. 458 * 459 * @see UCharIteratorGet 460 * @stable ICU 2.6 461 */ 462 UCharIteratorGetState *getState; 463 464 /** 465 * (public) Restore the iterator state from the state word from a call 466 * to getState(). 467 * This function pointer is NULL if the iterator does not implement it. 468 * 469 * @see UCharIteratorSet 470 * @stable ICU 2.6 471 */ 472 UCharIteratorSetState *setState; 473 }; 474 475 /** 476 * Helper function for UCharIterator to get the code point 477 * at the current index. 478 * 479 * Return the code point that includes the code unit at the current position, 480 * or U_SENTINEL if there is none (index is at the limit). 481 * If the current code unit is a lead or trail surrogate, 482 * then the following or preceding surrogate is used to form 483 * the code point value. 484 * 485 * @param iter the UCharIterator structure ("this pointer") 486 * @return the current code point 487 * 488 * @see UCharIterator 489 * @see U16_GET 490 * @see UnicodeString::char32At() 491 * @stable ICU 2.1 492 */ 493 U_STABLE UChar32 U_EXPORT2 494 uiter_current32(UCharIterator *iter); 495 496 /** 497 * Helper function for UCharIterator to get the next code point. 498 * 499 * Return the code point at the current index and increment 500 * the index (post-increment, like s[i++]), 501 * or return U_SENTINEL if there is none (index is at the limit). 502 * 503 * @param iter the UCharIterator structure ("this pointer") 504 * @return the current code point (and post-increment the current index) 505 * 506 * @see UCharIterator 507 * @see U16_NEXT 508 * @stable ICU 2.1 509 */ 510 U_STABLE UChar32 U_EXPORT2 511 uiter_next32(UCharIterator *iter); 512 513 /** 514 * Helper function for UCharIterator to get the previous code point. 515 * 516 * Decrement the index and return the code point from there 517 * (pre-decrement, like s[--i]), 518 * or return U_SENTINEL if there is none (index is at the start). 519 * 520 * @param iter the UCharIterator structure ("this pointer") 521 * @return the previous code point (after pre-decrementing the current index) 522 * 523 * @see UCharIterator 524 * @see U16_PREV 525 * @stable ICU 2.1 526 */ 527 U_STABLE UChar32 U_EXPORT2 528 uiter_previous32(UCharIterator *iter); 529 530 /** 531 * Get the "state" of the iterator in the form of a single 32-bit word. 532 * This is a convenience function that calls iter->getState(iter) 533 * if iter->getState is not NULL; 534 * if it is NULL or any other error occurs, then UITER_NO_STATE is returned. 535 * 536 * Some UCharIterator implementations may not be able to return 537 * a valid state for each position, in which case they return UITER_NO_STATE instead. 538 * This will be clearly documented for each such iterator (none of the public ones here). 539 * 540 * @param iter the UCharIterator structure ("this pointer") 541 * @return the state word 542 * 543 * @see UCharIterator 544 * @see UCharIteratorGetState 545 * @see UITER_NO_STATE 546 * @stable ICU 2.6 547 */ 548 U_STABLE uint32_t U_EXPORT2 549 uiter_getState(const UCharIterator *iter); 550 551 /** 552 * Restore the "state" of the iterator using a state word from a getState() call. 553 * This is a convenience function that calls iter->setState(iter, state, pErrorCode) 554 * if iter->setState is not NULL; if it is NULL, then U_UNSUPPORTED_ERROR is set. 555 * 556 * @param iter the UCharIterator structure ("this pointer") 557 * @param state the state word from a getState() call 558 * on a same-type, same-string iterator 559 * @param pErrorCode Must be a valid pointer to an error code value, 560 * which must not indicate a failure before the function call. 561 * 562 * @see UCharIterator 563 * @see UCharIteratorSetState 564 * @stable ICU 2.6 565 */ 566 U_STABLE void U_EXPORT2 567 uiter_setState(UCharIterator *iter, uint32_t state, UErrorCode *pErrorCode); 568 569 /** 570 * Set up a UCharIterator to iterate over a string. 571 * 572 * Sets the UCharIterator function pointers for iteration over the string s 573 * with iteration boundaries start=index=0 and length=limit=string length. 574 * The "provider" may set the start, index, and limit values at any time 575 * within the range 0..length. 576 * The length field will be ignored. 577 * 578 * The string pointer s is set into UCharIterator.context without copying 579 * or reallocating the string contents. 580 * 581 * getState() simply returns the current index. 582 * move() will always return the final index. 583 * 584 * @param iter UCharIterator structure to be set for iteration 585 * @param s String to iterate over 586 * @param length Length of s, or -1 if NUL-terminated 587 * 588 * @see UCharIterator 589 * @stable ICU 2.1 590 */ 591 U_STABLE void U_EXPORT2 592 uiter_setString(UCharIterator *iter, const UChar *s, int32_t length); 593 594 /** 595 * Set up a UCharIterator to iterate over a UTF-16BE string 596 * (byte vector with a big-endian pair of bytes per UChar). 597 * 598 * Everything works just like with a normal UChar iterator (uiter_setString), 599 * except that UChars are assembled from byte pairs, 600 * and that the length argument here indicates an even number of bytes. 601 * 602 * getState() simply returns the current index. 603 * move() will always return the final index. 604 * 605 * @param iter UCharIterator structure to be set for iteration 606 * @param s UTF-16BE string to iterate over 607 * @param length Length of s as an even number of bytes, or -1 if NUL-terminated 608 * (NUL means pair of 0 bytes at even index from s) 609 * 610 * @see UCharIterator 611 * @see uiter_setString 612 * @stable ICU 2.6 613 */ 614 U_STABLE void U_EXPORT2 615 uiter_setUTF16BE(UCharIterator *iter, const char *s, int32_t length); 616 617 /** 618 * Set up a UCharIterator to iterate over a UTF-8 string. 619 * 620 * Sets the UCharIterator function pointers for iteration over the UTF-8 string s 621 * with UTF-8 iteration boundaries 0 and length. 622 * The implementation counts the UTF-16 index on the fly and 623 * lazily evaluates the UTF-16 length of the text. 624 * 625 * The start field is used as the UTF-8 offset, the limit field as the UTF-8 length. 626 * When the reservedField is not 0, then it contains a supplementary code point 627 * and the UTF-16 index is between the two corresponding surrogates. 628 * At that point, the UTF-8 index is behind that code point. 629 * 630 * The UTF-8 string pointer s is set into UCharIterator.context without copying 631 * or reallocating the string contents. 632 * 633 * getState() returns a state value consisting of 634 * - the current UTF-8 source byte index (bits 31..1) 635 * - a flag (bit 0) that indicates whether the UChar position is in the middle 636 * of a surrogate pair 637 * (from a 4-byte UTF-8 sequence for the corresponding supplementary code point) 638 * 639 * getState() cannot also encode the UTF-16 index in the state value. 640 * move(relative to limit or length), or 641 * move(relative to current) after setState(), may return UITER_UNKNOWN_INDEX. 642 * 643 * @param iter UCharIterator structure to be set for iteration 644 * @param s UTF-8 string to iterate over 645 * @param length Length of s in bytes, or -1 if NUL-terminated 646 * 647 * @see UCharIterator 648 * @stable ICU 2.6 649 */ 650 U_STABLE void U_EXPORT2 651 uiter_setUTF8(UCharIterator *iter, const char *s, int32_t length); 652 653 #ifdef XP_CPLUSPLUS 654 655 /** 656 * Set up a UCharIterator to wrap around a C++ CharacterIterator. 657 * 658 * Sets the UCharIterator function pointers for iteration using the 659 * CharacterIterator charIter. 660 * 661 * The CharacterIterator pointer charIter is set into UCharIterator.context 662 * without copying or cloning the CharacterIterator object. 663 * The other "protected" UCharIterator fields are set to 0 and will be ignored. 664 * The iteration index and boundaries are controlled by the CharacterIterator. 665 * 666 * getState() simply returns the current index. 667 * move() will always return the final index. 668 * 669 * @param iter UCharIterator structure to be set for iteration 670 * @param charIter CharacterIterator to wrap 671 * 672 * @see UCharIterator 673 * @stable ICU 2.1 674 */ 675 U_STABLE void U_EXPORT2 676 uiter_setCharacterIterator(UCharIterator *iter, CharacterIterator *charIter); 677 678 /** 679 * Set up a UCharIterator to iterate over a C++ Replaceable. 680 * 681 * Sets the UCharIterator function pointers for iteration over the 682 * Replaceable rep with iteration boundaries start=index=0 and 683 * length=limit=rep->length(). 684 * The "provider" may set the start, index, and limit values at any time 685 * within the range 0..length=rep->length(). 686 * The length field will be ignored. 687 * 688 * The Replaceable pointer rep is set into UCharIterator.context without copying 689 * or cloning/reallocating the Replaceable object. 690 * 691 * getState() simply returns the current index. 692 * move() will always return the final index. 693 * 694 * @param iter UCharIterator structure to be set for iteration 695 * @param rep Replaceable to iterate over 696 * 697 * @see UCharIterator 698 * @stable ICU 2.1 699 */ 700 U_STABLE void U_EXPORT2 701 uiter_setReplaceable(UCharIterator *iter, const Replaceable *rep); 702 703 #endif 704 705 U_CDECL_END 706 707 #endif 708