// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 2004-2012, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  utext.h
*   encoding:   UTF-8
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2004oct06
*   created by: Markus W. Scherer
*/

#ifndef __UTEXT_H__
#define __UTEXT_H__

/**
 * \file
 * \brief C API: Abstract Unicode Text API
 *
 * The Text Access API provides a means to allow text that is stored in alternative
 * formats to work with ICU services.  ICU normally operates on text that is
 * stored in UTF-16 format, in (UChar *) arrays for the C APIs or as type
 * UnicodeString for C++ APIs.
 *
 * ICU Text Access allows other formats, such as UTF-8 or non-contiguous
 * UTF-16 strings, to be placed in a UText wrapper and then passed to ICU services.
 *
 * There are three general classes of usage for UText:
 *
 *     Application Level Use.  This is the simplest usage - applications would
 *     use one of the utext_open() functions on their input text, and pass
 *     the resulting UText to the desired ICU service.
 *
 *     Second is usage in ICU Services, such as break iteration, that will need to
 *     operate on input presented to them as a UText.  These implementations
 *     will need to use the iteration and related UText functions to gain
 *     access to the actual text.
 *
 *     The third class of UText users are "text providers."  These are the
 *     UText implementations for the various text storage formats.
*     An application or system with a unique text storage format can implement
 *     a set of UText provider functions for that format, which will then allow
 *     ICU services to operate on that format.
 *
 *
 * <em>Iterating over text</em>
 *
 * Here is sample code for a forward iteration over the contents of a UText
 *
 * \code
 *    UChar32  c;
 *    UText    *ut = whatever();
 *
 *    for (c=utext_next32From(ut, 0); c>=0; c=utext_next32(ut)) {
 *       // do whatever with the codepoint c here.
 *    }
 * \endcode
 *
 * And here is similar code to iterate in the reverse direction, from the end
 * of the text towards the beginning.
 *
 * \code
 *    UChar32  c;
 *    UText    *ut = whatever();
 *    int64_t  textLength = utext_nativeLength(ut);
 *    for (c=utext_previous32From(ut, textLength); c>=0; c=utext_previous32(ut)) {
 *       // do whatever with the codepoint c here.
 *    }
 * \endcode
 *
 * <em>Characters and Indexing</em>
 *
 * Indexing into text by UText functions is nearly always in terms of the native
 * indexing of the underlying text storage.  The storage format could be UTF-8
 * or UTF-32, for example.  When coding to the UText access API, no assumptions
 * can be made regarding the size of characters, or how far an index
 * may move when iterating between characters.
 *
 * All indices supplied to UText functions are pinned to the length of the
 * text.  An out-of-bounds index is not considered to be an error, but is
 * adjusted to be in the range 0 <= index <= length of input text.
 *
 *
 * When an index position is returned from a UText function, it will be
 * a native index to the underlying text.  In the case of multi-unit characters,
 * it will always refer to the first position of the character,
 * never to the interior.  This is essentially the same thing as saying that
 * a returned index will always point to a boundary between characters.
*
 * When a native index is supplied to a UText function, all indices that
 * refer to any part of a multi-unit character representation are considered
 * to be equivalent.  In the case of multi-unit characters, an incoming index
 * will be logically normalized to refer to the start of the character.
 *
 * It is possible to test whether a native index is on a code point boundary
 * by doing a utext_setNativeIndex() followed by a utext_getNativeIndex().
 * If the index is returned unchanged, it was on a code point boundary.  If
 * an adjusted index is returned, the original index referred to the
 * interior of a character.
 *
 * <em>Conventions for calling UText functions</em>
 *
 * Most UText access functions have as their first parameter a (UText *) pointer,
 * which specifies the UText to be used.  Unless otherwise noted, the
 * pointer must refer to a valid, open UText.  Attempting to
 * use a closed UText or passing a NULL pointer is a programming error and
 * will produce undefined results or NULL pointer exceptions.
 *
 * The utext_open family of functions can either open an existing (closed)
 * UText, or heap allocate a new UText.  Here is sample code for creating
 * a stack-allocated UText.
 *
 * \code
 *    char       *s = whatever();  // A utf-8 string
 *    UErrorCode status = U_ZERO_ERROR;
 *    UText      ut = UTEXT_INITIALIZER;
 *    utext_openUTF8(&ut, s, -1, &status);
 *    if (U_FAILURE(status)) {
 *        // error handling
 *    } else {
 *        // work with the UText
 *    }
 * \endcode
 *
 * Any existing UText passed to an open function _must_ have been initialized,
 * either by the UTEXT_INITIALIZER, or by having been originally heap-allocated
 * by an open function.  Passing NULL will cause the open function to
 * heap-allocate and fully initialize a new UText.
*
 */



#include "unicode/utypes.h"
#include "unicode/uchar.h"
#if U_SHOW_CPLUSPLUS_API
#include "unicode/localpointer.h"
#include "unicode/rep.h"
#include "unicode/unistr.h"
#include "unicode/chariter.h"
#endif


U_CDECL_BEGIN

struct UText;
typedef struct UText UText; /**< C typedef for struct UText. @stable ICU 3.6 */


/***************************************************************************************
 *
 *   C Functions for creating UText wrappers around various kinds of text strings.
 *
 ****************************************************************************************/


/**
 * Close function for UText instances.
 * Cleans up, releases any resources being held by an open UText.
 * <p>
 * If the UText was originally allocated by one of the utext_open functions,
 * the storage associated with the utext will also be freed.
 * If the UText storage originated with the application, as it would with
 * a local or static instance, the storage will not be deleted.
 *
 * An open UText can be reset to refer to a new string by using one of the
 * utext_open() functions without first closing the UText.
 *
 * @param ut  The UText to be closed.
 * @return    NULL if the UText struct was deleted by the close.  If the UText struct
 *            was originally provided by the caller to the open function, it is
 *            returned by this function, and may be safely used again in
 *            a subsequent utext_open.
 *
 * @stable ICU 3.4
 */
U_STABLE UText * U_EXPORT2
utext_close(UText *ut);

#if U_SHOW_CPLUSPLUS_API

U_NAMESPACE_BEGIN

/**
 * \class LocalUTextPointer
 * "Smart pointer" class, closes a UText via utext_close().
 * For most methods see the LocalPointerBase base class.
*
 * @see LocalPointerBase
 * @see LocalPointer
 * @stable ICU 4.4
 */
U_DEFINE_LOCAL_OPEN_POINTER(LocalUTextPointer, UText, utext_close);

U_NAMESPACE_END

#endif

/**
 * Open a read-only UText implementation for UTF-8 strings.
 *
 * \htmlonly
 * Any invalid UTF-8 in the input will be handled in this way:
 * a sequence of bytes that has the form of a truncated, but otherwise valid,
 * UTF-8 sequence will be replaced by a single Unicode replacement character, \uFFFD.
 * Any other illegal bytes will each be replaced by a \uFFFD.
 * \endhtmlonly
 *
 * @param ut     Pointer to a UText struct.  If NULL, a new UText will be created.
 *               If non-NULL, must refer to an initialized UText struct, which will then
 *               be reset to reference the specified UTF-8 string.
 * @param s      A UTF-8 string.  Must not be NULL.
 * @param length The length of the UTF-8 string in bytes, or -1 if the string is
 *               zero terminated.
 * @param status Errors are returned here.
 * @return       A pointer to the UText.  If a pre-allocated UText was provided, it
 *               will always be used and returned.
 * @stable ICU 3.4
 */
U_STABLE UText * U_EXPORT2
utext_openUTF8(UText *ut, const char *s, int64_t length, UErrorCode *status);


/**
 * Open a read-only UText for a UChar * string.
 *
 * @param ut     Pointer to a UText struct.  If NULL, a new UText will be created.
 *               If non-NULL, must refer to an initialized UText struct, which will then
 *               be reset to reference the specified UChar string.
 * @param s      A UChar (UTF-16) string
 * @param length The number of UChars in the input string, or -1 if the string is
 *               zero terminated.
 * @param status Errors are returned here.
 * @return       A pointer to the UText.  If a pre-allocated UText was provided, it
 *               will always be used and returned.
* @stable ICU 3.4
 */
U_STABLE UText * U_EXPORT2
utext_openUChars(UText *ut, const UChar *s, int64_t length, UErrorCode *status);


#if U_SHOW_CPLUSPLUS_API
/**
 * Open a writable UText for a non-const UnicodeString.
 *
 * @param ut      Pointer to a UText struct.  If NULL, a new UText will be created.
 *                If non-NULL, must refer to an initialized UText struct, which will then
 *                be reset to reference the specified input string.
 * @param s       A UnicodeString to be wrapped.
 * @param status  Errors are returned here.
 * @return        Pointer to the UText.  If a UText was supplied as input, this
 *                will always be used and returned.
 * @stable ICU 3.4
 */
U_STABLE UText * U_EXPORT2
utext_openUnicodeString(UText *ut, icu::UnicodeString *s, UErrorCode *status);


/**
 * Open a UText for a const UnicodeString.  The resulting UText will not be writable.
 *
 * @param ut      Pointer to a UText struct.  If NULL, a new UText will be created.
 *                If non-NULL, must refer to an initialized UText struct, which will then
 *                be reset to reference the specified input string.
 * @param s       A const UnicodeString to be wrapped.
 * @param status  Errors are returned here.
 * @return        Pointer to the UText.  If a UText was supplied as input, this
 *                will always be used and returned.
 * @stable ICU 3.4
 */
U_STABLE UText * U_EXPORT2
utext_openConstUnicodeString(UText *ut, const icu::UnicodeString *s, UErrorCode *status);


/**
 * Open a writable UText implementation for an ICU Replaceable object.
 * @param ut      Pointer to a UText struct.  If NULL, a new UText will be created.
 *                If non-NULL, must refer to an already existing UText, which will then
 *                be reset to reference the specified replaceable text.
 * @param rep     A Replaceable text object.
 * @param status  Errors are returned here.
 * @return        Pointer to the UText.
*                If a UText was supplied as input, this
 *                will always be used and returned.
 * @see Replaceable
 * @stable ICU 3.4
 */
U_STABLE UText * U_EXPORT2
utext_openReplaceable(UText *ut, icu::Replaceable *rep, UErrorCode *status);

/**
 * Open a UText implementation over an ICU CharacterIterator.
 * @param ut      Pointer to a UText struct.  If NULL, a new UText will be created.
 *                If non-NULL, must refer to an already existing UText, which will then
 *                be reset to reference the specified character iterator.
 * @param ci      A Character Iterator.
 * @param status  Errors are returned here.
 * @return        Pointer to the UText.  If a UText was supplied as input, this
 *                will always be used and returned.
 * @see CharacterIterator
 * @stable ICU 3.4
 */
U_STABLE UText * U_EXPORT2
utext_openCharacterIterator(UText *ut, icu::CharacterIterator *ci, UErrorCode *status);

#endif


/**
 * Clone a UText.  This is much like opening a UText where the source text is itself
 * another UText.
 *
 * A deep clone will copy both the UText data structures and the underlying text.
 * The original and cloned UText will operate completely independently; modifications
 * made to the text in one will not affect the other.  Text providers are not
 * required to support deep clones.  The user of clone() must check the status return
 * and be prepared to handle failures.
 *
 * The standard UText implementations for UTF8, UChar *, UnicodeString and
 * Replaceable all support deep cloning.
 *
 * The UText returned from a deep clone will be writable, assuming that the text
 * provider is able to support writing, even if the source UText had been made
 * non-writable by means of utext_freeze().
 *
 * A shallow clone replicates only the UText data structures; it does not make
 * a copy of the underlying text.
* Shallow clones can be used as an efficient way to
 * have multiple iterators active in a single text string that is not being
 * modified.
 *
 * A shallow clone operation will not fail, barring truly exceptional conditions such
 * as memory allocation failures.
 *
 * Shallow UText clones should be avoided if the UText functions that modify the
 * text are expected to be used, either on the original or the cloned UText.
 * Any such modifications can cause unpredictable behavior.  Read Only
 * shallow clones provide some protection against errors of this type by
 * disabling text modification via the cloned UText.
 *
 * A shallow clone made with the readOnly parameter == FALSE will preserve the
 * utext_isWritable() state of the source object.  Note, however, that
 * write operations must be avoided while more than one UText exists that refers
 * to the same underlying text.
 *
 * A UText and its clone may be safely concurrently accessed by separate threads.
 * This is true for read access only with shallow clones, and for both read and
 * write access with deep clones.
 * It is the responsibility of the Text Provider to ensure that this thread safety
 * constraint is met.
 *
 * @param dest     A UText struct to be filled in with the result of the clone operation,
 *                 or NULL if the clone function should heap-allocate a new UText struct.
 *                 If non-NULL, must refer to an already existing UText, which will then
 *                 be reset to become the clone.
 * @param src      The UText to be cloned.
 * @param deep     TRUE to request a deep clone, FALSE for a shallow clone.
 * @param readOnly TRUE to request that the cloned UText have read only access to the
 *                 underlying text.
 * @param status   Errors are returned here.  For deep clones, U_UNSUPPORTED_ERROR
 *                 will be returned if the text provider is unable to clone the
 *                 original text.
* @return         The newly created clone, or NULL if the clone operation failed.
 * @stable ICU 3.4
 */
U_STABLE UText * U_EXPORT2
utext_clone(UText *dest, const UText *src, UBool deep, UBool readOnly, UErrorCode *status);


/**
 * Compare two UText objects for equality.
 * UTexts are equal if they are iterating over the same text, and
 * have the same iteration position within the text.
 * If either or both of the parameters are NULL, the comparison is FALSE.
 *
 * @param a   The first of the two UTexts to compare.
 * @param b   The other UText to be compared.
 * @return    TRUE if the two UTexts are equal.
 * @stable ICU 3.6
 */
U_STABLE UBool U_EXPORT2
utext_equals(const UText *a, const UText *b);


/*****************************************************************************
 *
 *   Functions to work with the text represented by a UText wrapper
 *
 *****************************************************************************/

/**
 * Get the length of the text.  Depending on the characteristics
 * of the underlying text representation, this may be expensive.
 * @see  utext_isLengthExpensive()
 *
 *
 * @param ut  the text to be accessed.
 * @return the length of the text, expressed in native units.
 *
 * @stable ICU 3.4
 */
U_STABLE int64_t U_EXPORT2
utext_nativeLength(UText *ut);

/**
 * Return TRUE if calculating the length of the text could be expensive.
 * Finding the length of NUL terminated strings is considered to be expensive.
 *
 * Note that the value of this function may change
 * as the result of other operations on a UText.
 * Once the length of a string has been discovered, it will no longer
 * be expensive to report it.
 *
 * @param ut the text to be accessed.
 * @return TRUE if determining the length of the text could be time consuming.
* @stable ICU 3.4
 */
U_STABLE UBool U_EXPORT2
utext_isLengthExpensive(const UText *ut);

/**
 * Returns the code point at the requested index,
 * or U_SENTINEL (-1) if it is out of bounds.
 *
 * If the specified index points to the interior of a multi-unit
 * character - one of the trail bytes of a UTF-8 sequence, for example -
 * the complete code point will be returned.
 *
 * The iteration position will be set to the start of the returned code point.
 *
 * This function is roughly equivalent to the sequence
 * \code
 *    utext_setNativeIndex(ut, nativeIndex);
 *    utext_current32(ut);
 * \endcode
 * (There is a subtle difference if the index is out of bounds by being less than zero -
 * utext_setNativeIndex(negative value) sets the index to zero, after which utext_current32()
 * will return the char at zero.  utext_char32At(negative index), on the other hand, will
 * return the U_SENTINEL value of -1.)
 *
 * @param ut the text to be accessed
 * @param nativeIndex the native index of the character to be accessed.  If the index points
 *        to other than the first unit of a multi-unit character, it will be adjusted
 *        to the start of the character.
 * @return the code point at the specified index.
 * @stable ICU 3.4
 */
U_STABLE UChar32 U_EXPORT2
utext_char32At(UText *ut, int64_t nativeIndex);


/**
 *
 * Get the code point at the current iteration position,
 * or U_SENTINEL (-1) if the iteration has reached the end of
 * the input text.
 *
 * @param ut the text to be accessed.
 * @return the Unicode code point at the current iterator position.
 * @stable ICU 3.4
 */
U_STABLE UChar32 U_EXPORT2
utext_current32(UText *ut);


/**
 * Get the code point at the current iteration position of the UText, and
 * advance the position to the first index following the character.
*
 * If the position is at the end of the text (the index following
 * the last character, which is also the length of the text),
 * return U_SENTINEL (-1) and do not advance the index.
 *
 * This is a post-increment operation.
 *
 * An inline macro version of this function, UTEXT_NEXT32(),
 * is available for performance critical use.
 *
 * @param ut the text to be accessed.
 * @return the Unicode code point at the iteration position.
 * @see UTEXT_NEXT32
 * @stable ICU 3.4
 */
U_STABLE UChar32 U_EXPORT2
utext_next32(UText *ut);


/**
 * Move the iterator position to the character (code point) whose
 * index precedes the current position, and return that character.
 * This is a pre-decrement operation.
 *
 * If the initial position is at the start of the text (index of 0)
 * return U_SENTINEL (-1), and leave the position unchanged.
 *
 * An inline macro version of this function, UTEXT_PREVIOUS32(),
 * is available for performance critical use.
 *
 * @param ut the text to be accessed.
 * @return the previous UChar32 code point, or U_SENTINEL (-1)
 *         if the iteration has reached the start of the text.
 * @see UTEXT_PREVIOUS32
 * @stable ICU 3.4
 */
U_STABLE UChar32 U_EXPORT2
utext_previous32(UText *ut);


/**
 * Set the iteration index and return the code point at that index.
 * Leave the iteration index at the start of the following code point.
 *
 * This function is the most efficient and convenient way to
 * begin a forward iteration.  The results are identical to those
 * from the sequence
 * \code
 *    utext_setNativeIndex(ut, nativeIndex);
 *    utext_next32(ut);
 * \endcode
 *
 * @param ut the text to be accessed.
 * @param nativeIndex Iteration index, in the native units of the text provider.
 * @return Code point which starts at or before index,
 *         or U_SENTINEL (-1) if it is out of bounds.
* @stable ICU 3.4
 */
U_STABLE UChar32 U_EXPORT2
utext_next32From(UText *ut, int64_t nativeIndex);



/**
 * Set the iteration index, and return the code point preceding the
 * one specified by the initial index.  Leave the iteration position
 * at the start of the returned code point.
 *
 * This function is the most efficient and convenient way to
 * begin a backwards iteration.
 *
 * @param ut the text to be accessed.
 * @param nativeIndex Iteration index in the native units of the text provider.
 * @return Code point preceding the one at the initial index,
 *         or U_SENTINEL (-1) if it is out of bounds.
 *
 * @stable ICU 3.4
 */
U_STABLE UChar32 U_EXPORT2
utext_previous32From(UText *ut, int64_t nativeIndex);

/**
 * Get the current iterator position, which can range from 0 to
 * the length of the text.
 * The position is a native index into the input text, in whatever format it
 * may have (possibly UTF-8 for example), and may not always be the same as
 * the corresponding UChar (UTF-16) index.
 * The returned position will always be aligned to a code point boundary.
 *
 * @param ut the text to be accessed.
 * @return the current index position, in the native units of the text provider.
 * @stable ICU 3.4
 */
U_STABLE int64_t U_EXPORT2
utext_getNativeIndex(const UText *ut);

/**
 * Set the current iteration position to the nearest code point
 * boundary at or preceding the specified index.
 * The index is in the native units of the original input text.
 * If the index is out of range, it will be pinned to be within
 * the range of the input text.
 * <p>
 * It will usually be more efficient to begin an iteration
 * using the functions utext_next32From() or utext_previous32From()
 * rather than utext_setNativeIndex().
* <p>
 * Moving the index position to an adjacent character is best done
 * with utext_next32(), utext_previous32() or utext_moveIndex32().
 * Attempting to do direct arithmetic on the index position is
 * complicated by the fact that the size (in native units) of a
 * character depends on the underlying representation of the character
 * (UTF-8, UTF-16, UTF-32, arbitrary codepage), and is not
 * easily knowable.
 *
 * @param ut the text to be accessed.
 * @param nativeIndex the native unit index of the new iteration position.
 * @stable ICU 3.4
 */
U_STABLE void U_EXPORT2
utext_setNativeIndex(UText *ut, int64_t nativeIndex);

/**
 * Move the iterator position by delta code points.  The number of code points
 * is a signed number; a negative delta will move the iterator backwards,
 * towards the start of the text.
 * <p>
 * The index is moved by <code>delta</code> code points
 * forward or backward, but no further backward than to 0 and
 * no further forward than to utext_nativeLength().
 * The resulting index value will be in between 0 and length, inclusive.
 *
 * @param ut the text to be accessed.
 * @param delta the signed number of code points to move the iteration position.
 * @return TRUE if the position could be moved the requested number of positions while
 *         staying within the range [0, text length].
 * @stable ICU 3.4
 */
U_STABLE UBool U_EXPORT2
utext_moveIndex32(UText *ut, int32_t delta);

/**
 * Get the native index of the character preceding the current position.
 * If the iteration position is already at the start of the text, zero
 * is returned.
 * The value returned is the same as that obtained from the following sequence,
 * but without the side effect of changing the iteration position.
 *
 * \code
 *    UText  *ut = whatever;
 *      ...
*    utext_previous32(ut);
 *    utext_getNativeIndex(ut);
 * \endcode
 *
 * This function is most useful during forwards iteration, where it will get the
 * native index of the character most recently returned from utext_next32().
 *
 * @param ut the text to be accessed
 * @return the native index of the character preceding the current index position,
 *         or zero if the current position is at the start of the text.
 * @stable ICU 3.6
 */
U_STABLE int64_t U_EXPORT2
utext_getPreviousNativeIndex(UText *ut);


/**
 *
 * Extract text from a UText into a UChar buffer.  The range of text to be extracted
 * is specified in the native indices of the UText provider.  These may not necessarily
 * be UTF-16 indices.
 * <p>
 * The size (number of 16 bit UChars) of the data to be extracted is returned.  The
 * full number of UChars is returned, even when the extracted text is truncated
 * because the specified buffer size is too small.
 * <p>
 * The extracted string will (if you are a user) / must (if you are a text provider)
 * be NUL-terminated if there is sufficient space in the destination buffer.  This
 * terminating NUL is not included in the returned length.
 * <p>
 * The iteration index is left at the position following the last extracted character.
 *
 * @param  ut    the UText from which to extract data.
 * @param  nativeStart the native index of the first character to extract.
 *           If the specified index is out of range,
 *           it will be pinned to be within 0 <= index <= textLength.
 * @param  nativeLimit the native string index of the position following the last
 *           character to extract.  If the specified index is out of range,
 *           it will be pinned to be within 0 <= index <= textLength.
 *           nativeLimit must be >= nativeStart.
663 * @param dest the UChar (UTF-16) buffer into which the extracted text is placed 664 * @param destCapacity The size, in UChars, of the destination buffer. May be zero 665 * for precomputing the required size. 666 * @param status receives any error status. 667 * U_BUFFER_OVERFLOW_ERROR: the extracted text was truncated because the 668 * buffer was too small. Returns number of UChars for preflighting. 669 * @return Number of UChars in the data to be extracted. Does not include a trailing NUL. 670 * 671 * @stable ICU 3.4 672 */ 673 U_STABLE int32_t U_EXPORT2 674 utext_extract(UText *ut, 675 int64_t nativeStart, int64_t nativeLimit, 676 UChar *dest, int32_t destCapacity, 677 UErrorCode *status); 678 679 680 681 /************************************************************************************ 682 * 683 * #define inline versions of selected performance-critical text access functions 684 * Caution: do not use auto increment++ or decrement-- expressions 685 * as parameters to these macros. 686 * 687 * For most use, where there is no extreme performance constraint, the 688 * normal, non-inline functions are a better choice. The resulting code 689 * will be smaller, and, if the need ever arises, easier to debug. 690 * 691 * These are implemented as #defines rather than real functions 692 * because there is no fully portable way to do inline functions in plain C. 693 * 694 ************************************************************************************/ 695 696 #ifndef U_HIDE_INTERNAL_API 697 /** 698 * inline version of utext_current32(), for performance-critical situations. 699 * 700 * Get the code point at the current iteration position of the UText. 701 * Returns U_SENTINEL (-1) if the position is at the end of the 702 * text. 703 * 704 * @internal ICU 4.4 technology preview 705 */ 706 #define UTEXT_CURRENT32(ut) \ 707 ((ut)->chunkOffset < (ut)->chunkLength && ((ut)->chunkContents)[(ut)->chunkOffset]<0xd800 ? 
\ 708 ((ut)->chunkContents)[((ut)->chunkOffset)] : utext_current32(ut)) 709 #endif /* U_HIDE_INTERNAL_API */ 710 711 /** 712 * inline version of utext_next32(), for performance-critical situations. 713 * 714 * Get the code point at the current iteration position of the UText, and 715 * advance the position to the first index following the character. 716 * This is a post-increment operation. 717 * Returns U_SENTINEL (-1) if the position is at the end of the 718 * text. 719 * 720 * @stable ICU 3.4 721 */ 722 #define UTEXT_NEXT32(ut) \ 723 ((ut)->chunkOffset < (ut)->chunkLength && ((ut)->chunkContents)[(ut)->chunkOffset]<0xd800 ? \ 724 ((ut)->chunkContents)[((ut)->chunkOffset)++] : utext_next32(ut)) 725 726 /** 727 * inline version of utext_previous32(), for performance-critical situations. 728 * 729 * Move the iterator position to the character (code point) whose 730 * index precedes the current position, and return that character. 731 * This is a pre-decrement operation. 732 * Returns U_SENTINEL (-1) if the position is at the start of the text. 733 * 734 * @stable ICU 3.4 735 */ 736 #define UTEXT_PREVIOUS32(ut) \ 737 ((ut)->chunkOffset > 0 && \ 738 (ut)->chunkContents[(ut)->chunkOffset-1] < 0xd800 ? \ 739 (ut)->chunkContents[--((ut)->chunkOffset)] : utext_previous32(ut)) 740 741 /** 742 * inline version of utext_getNativeIndex(), for performance-critical situations. 743 * 744 * Get the current iterator position, which can range from 0 to 745 * the length of the text. 746 * The position is a native index into the input text, in whatever format it 747 * may have (possibly UTF-8 for example), and may not always be the same as 748 * the corresponding UChar (UTF-16) index. 749 * The returned position will always be aligned to a code point boundary. 750 * 751 * @stable ICU 3.6 752 */ 753 #define UTEXT_GETNATIVEINDEX(ut) \ 754 ((ut)->chunkOffset <= (ut)->nativeIndexingLimit? 
\ 755 (ut)->chunkNativeStart+(ut)->chunkOffset : \ 756 (ut)->pFuncs->mapOffsetToNative(ut)) 757 758 /** 759 * inline version of utext_setNativeIndex(), for performance-critical situations. 760 * 761 * Set the current iteration position to the nearest code point 762 * boundary at or preceding the specified index. 763 * The index is in the native units of the original input text. 764 * If the index is out of range, it will be pinned to be within 765 * the range of the input text. 766 * 767 * @stable ICU 3.8 768 */ 769 #define UTEXT_SETNATIVEINDEX(ut, ix) UPRV_BLOCK_MACRO_BEGIN { \ 770 int64_t __offset = (ix) - (ut)->chunkNativeStart; \ 771 if (__offset>=0 && __offset<(int64_t)(ut)->nativeIndexingLimit && (ut)->chunkContents[__offset]<0xdc00) { \ 772 (ut)->chunkOffset=(int32_t)__offset; \ 773 } else { \ 774 utext_setNativeIndex((ut), (ix)); \ 775 } \ 776 } UPRV_BLOCK_MACRO_END 777 778 779 780 /************************************************************************************ 781 * 782 * Functions related to writing or modifying the text. 783 * These will work only with modifiable UTexts. Attempting to 784 * modify a read-only UText will return an error status. 785 * 786 ************************************************************************************/ 787 788 789 /** 790 * Return TRUE if the text can be written (modified) with utext_replace() or 791 * utext_copy(). For the text to be writable, the text provider must 792 * be of a type that supports writing and the UText must not be frozen. 793 * 794 * Attempting to modify text when utext_isWriteable() is FALSE will fail - 795 * the text will not be modified, and an error will be returned from the function 796 * that attempted the modification. 797 * 798 * @param ut the UText to be tested. 799 * @return TRUE if the text is modifiable. 
800 * 801 * @see utext_freeze() 802 * @see utext_replace() 803 * @see utext_copy() 804 * @stable ICU 3.4 805 * 806 */ 807 U_STABLE UBool U_EXPORT2 808 utext_isWritable(const UText *ut); 809 810 811 /** 812 * Test whether there is meta data associated with the text. 813 * @see Replaceable::hasMetaData() 814 * 815 * @param ut The UText to be tested 816 * @return TRUE if the underlying text includes meta data. 817 * @stable ICU 3.4 818 */ 819 U_STABLE UBool U_EXPORT2 820 utext_hasMetaData(const UText *ut); 821 822 823 /** 824 * Replace a range of the original text with a replacement text. 825 * 826 * Leaves the current iteration position at the position following the 827 * newly inserted replacement text. 828 * 829 * This function is only available on UText types that support writing, 830 * that is, ones where utext_isWritable() returns TRUE. 831 * 832 * When using this function, there should be only a single UText opened onto the 833 * underlying native text string. Behavior after a replace operation 834 * on a UText is undefined for any other additional UTexts that refer to the 835 * modified string. 836 * 837 * @param ut the UText representing the text to be operated on. 838 * @param nativeStart the native index of the start of the region to be replaced 839 * @param nativeLimit the native index of the character following the region to be replaced. 840 * @param replacementText pointer to the replacement text 841 * @param replacementLength length of the replacement text, or -1 if the text is NUL terminated. 842 * @param status receives any error status. Possible errors include 843 * U_NO_WRITE_PERMISSION 844 * 845 * @return The signed number of (native) storage units by which 846 * the length of the text expanded or contracted. 
847 * 848 * @stable ICU 3.4 849 */ 850 U_STABLE int32_t U_EXPORT2 851 utext_replace(UText *ut, 852 int64_t nativeStart, int64_t nativeLimit, 853 const UChar *replacementText, int32_t replacementLength, 854 UErrorCode *status); 855 856 857 858 /** 859 * 860 * Copy or move a substring from one position to another within the text, 861 * while retaining any metadata associated with the text. 862 * This function is used to duplicate or reorder substrings. 863 * The destination index must not overlap the source range. 864 * 865 * The text to be copied or moved is inserted at destIndex; 866 * it does not replace or overwrite any existing text. 867 * 868 * The iteration position is left following the newly inserted text 869 * at the destination position. 870 * 871 * This function is only available on UText types that support writing, 872 * that is, ones where utext_isWritable() returns TRUE. 873 * 874 * When using this function, there should be only a single UText opened onto the 875 * underlying native text string. Behavior after a copy operation 876 * on a UText is undefined in any other additional UTexts that refer to the 877 * modified string. 878 * 879 * @param ut The UText representing the text to be operated on. 880 * @param nativeStart The native index of the start of the region to be copied or moved 881 * @param nativeLimit The native index of the character position following the region 882 * to be copied. 883 * @param destIndex The native destination index to which the source substring is 884 * copied or moved. 885 * @param move If TRUE, then the substring is moved, not copied/duplicated. 886 * @param status receives any error status. Possible errors include U_NO_WRITE_PERMISSION 887 * 888 * @stable ICU 3.4 889 */ 890 U_STABLE void U_EXPORT2 891 utext_copy(UText *ut, 892 int64_t nativeStart, int64_t nativeLimit, 893 int64_t destIndex, 894 UBool move, 895 UErrorCode *status); 896 897 898 /** 899 * <p> 900 * Freeze a UText. 
This prevents any modification to the underlying text itself 901 * by means of functions operating on this UText. 902 * </p> 903 * <p> 904 * Once frozen, a UText can not be unfrozen. The intent is to ensure 905 * that a the text underlying a frozen UText wrapper cannot be modified via that UText. 906 * </p> 907 * <p> 908 * Caution: freezing a UText will disable changes made via the specific 909 * frozen UText wrapper only; it will not have any effect on the ability to 910 * directly modify the text by bypassing the UText. Any such backdoor modifications 911 * are always an error while UText access is occurring because the underlying 912 * text can get out of sync with UText's buffering. 913 * </p> 914 * 915 * @param ut The UText to be frozen. 916 * @see utext_isWritable() 917 * @stable ICU 3.6 918 */ 919 U_STABLE void U_EXPORT2 920 utext_freeze(UText *ut); 921 922 923 /** 924 * UText provider properties (bit field indexes). 925 * 926 * @see UText 927 * @stable ICU 3.4 928 */ 929 enum { 930 /** 931 * It is potentially time consuming for the provider to determine the length of the text. 932 * @stable ICU 3.4 933 */ 934 UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE = 1, 935 /** 936 * Text chunks remain valid and usable until the text object is modified or 937 * deleted, not just until the next time the access() function is called 938 * (which is the default). 939 * @stable ICU 3.4 940 */ 941 UTEXT_PROVIDER_STABLE_CHUNKS = 2, 942 /** 943 * The provider supports modifying the text via the replace() and copy() 944 * functions. 945 * @see Replaceable 946 * @stable ICU 3.4 947 */ 948 UTEXT_PROVIDER_WRITABLE = 3, 949 /** 950 * There is meta data associated with the text. 951 * @see Replaceable::hasMetaData() 952 * @stable ICU 3.4 953 */ 954 UTEXT_PROVIDER_HAS_META_DATA = 4, 955 /** 956 * Text provider owns the text storage. 957 * Generally occurs as the result of a deep clone of the UText. 
958 * When closing the UText, the associated text must 959 * also be closed/deleted/freed/ whatever is appropriate. 960 * @stable ICU 3.6 961 */ 962 UTEXT_PROVIDER_OWNS_TEXT = 5 963 }; 964 965 /** 966 * Function type declaration for UText.clone(). 967 * 968 * clone a UText. Much like opening a UText where the source text is itself 969 * another UText. 970 * 971 * A deep clone will copy both the UText data structures and the underlying text. 972 * The original and cloned UText will operate completely independently; modifications 973 * made to the text in one will not effect the other. Text providers are not 974 * required to support deep clones. The user of clone() must check the status return 975 * and be prepared to handle failures. 976 * 977 * A shallow clone replicates only the UText data structures; it does not make 978 * a copy of the underlying text. Shallow clones can be used as an efficient way to 979 * have multiple iterators active in a single text string that is not being 980 * modified. 981 * 982 * A shallow clone operation must not fail except for truly exceptional conditions such 983 * as memory allocation failures. 984 * 985 * A UText and its clone may be safely concurrently accessed by separate threads. 986 * This is true for both shallow and deep clones. 987 * It is the responsibility of the Text Provider to ensure that this thread safety 988 * constraint is met. 989 990 * 991 * @param dest A UText struct to be filled in with the result of the clone operation, 992 * or NULL if the clone function should heap-allocate a new UText struct. 993 * @param src The UText to be cloned. 994 * @param deep TRUE to request a deep clone, FALSE for a shallow clone. 995 * @param status Errors are returned here. For deep clones, U_UNSUPPORTED_ERROR 996 * should be returned if the text provider is unable to clone the 997 * original text. 998 * @return The newly created clone, or NULL if the clone operation failed. 
999 * 1000 * @stable ICU 3.4 1001 */ 1002 typedef UText * U_CALLCONV 1003 UTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status); 1004 1005 1006 /** 1007 * Function type declaration for UText.nativeLength(). 1008 * 1009 * @param ut the UText to get the length of. 1010 * @return the length, in the native units of the original text string. 1011 * @see UText 1012 * @stable ICU 3.4 1013 */ 1014 typedef int64_t U_CALLCONV 1015 UTextNativeLength(UText *ut); 1016 1017 /** 1018 * Function type declaration for UText.access(). Get the description of the text chunk 1019 * containing the text at a requested native index. The UText's iteration 1020 * position will be left at the requested index. If the index is out 1021 * of bounds, the iteration position will be left at the start or end 1022 * of the string, as appropriate. 1023 * 1024 * Chunks must begin and end on code point boundaries. A single code point 1025 * comprised of multiple storage units must never span a chunk boundary. 1026 * 1027 * 1028 * @param ut the UText being accessed. 1029 * @param nativeIndex Requested index of the text to be accessed. 1030 * @param forward If TRUE, then the returned chunk must contain text 1031 * starting from the index, so that start<=index<limit. 1032 * If FALSE, then the returned chunk must contain text 1033 * before the index, so that start<index<=limit. 1034 * @return True if the requested index could be accessed. The chunk 1035 * will contain the requested text. 1036 * False value if a chunk cannot be accessed 1037 * (the requested index is out of bounds). 1038 * 1039 * @see UText 1040 * @stable ICU 3.4 1041 */ 1042 typedef UBool U_CALLCONV 1043 UTextAccess(UText *ut, int64_t nativeIndex, UBool forward); 1044 1045 /** 1046 * Function type declaration for UText.extract(). 1047 * 1048 * Extract text from a UText into a UChar buffer. The range of text to be extracted 1049 * is specified in the native indices of the UText provider. 
These may not necessarily 1050 * be UTF-16 indices. 1051 * <p> 1052 * The size (number of 16 bit UChars) in the data to be extracted is returned. The 1053 * full amount is returned, even when the specified buffer size is smaller. 1054 * <p> 1055 * The extracted string will (if you are a user) / must (if you are a text provider) 1056 * be NUL-terminated if there is sufficient space in the destination buffer. 1057 * 1058 * @param ut the UText from which to extract data. 1059 * @param nativeStart the native index of the first character to extract. 1060 * @param nativeLimit the native string index of the position following the last 1061 * character to extract. 1062 * @param dest the UChar (UTF-16) buffer into which the extracted text is placed 1063 * @param destCapacity The size, in UChars, of the destination buffer. May be zero 1064 * for precomputing the required size. 1065 * @param status receives any error status. 1066 * If U_BUFFER_OVERFLOW_ERROR: Returns number of UChars for 1067 * preflighting. 1068 * @return Number of UChars in the data. Does not include a trailing NUL. 1069 * 1070 * @stable ICU 3.4 1071 */ 1072 typedef int32_t U_CALLCONV 1073 UTextExtract(UText *ut, 1074 int64_t nativeStart, int64_t nativeLimit, 1075 UChar *dest, int32_t destCapacity, 1076 UErrorCode *status); 1077 1078 /** 1079 * Function type declaration for UText.replace(). 1080 * 1081 * Replace a range of the original text with a replacement text. 1082 * 1083 * Leaves the current iteration position at the position following the 1084 * newly inserted replacement text. 1085 * 1086 * This function need only be implemented on UText types that support writing. 1087 * 1088 * When using this function, there should be only a single UText opened onto the 1089 * underlying native text string. 
The function is responsible for updating the 1090 * text chunk within the UText to reflect the updated iteration position, 1091 * taking into account any changes to the underlying string's structure caused 1092 * by the replace operation. 1093 * 1094 * @param ut the UText representing the text to be operated on. 1095 * @param nativeStart the index of the start of the region to be replaced 1096 * @param nativeLimit the index of the character following the region to be replaced. 1097 * @param replacementText pointer to the replacement text 1098 * @param replacmentLength length of the replacement text in UChars, or -1 if the text is NUL terminated. 1099 * @param status receives any error status. Possible errors include 1100 * U_NO_WRITE_PERMISSION 1101 * 1102 * @return The signed number of (native) storage units by which 1103 * the length of the text expanded or contracted. 1104 * 1105 * @stable ICU 3.4 1106 */ 1107 typedef int32_t U_CALLCONV 1108 UTextReplace(UText *ut, 1109 int64_t nativeStart, int64_t nativeLimit, 1110 const UChar *replacementText, int32_t replacmentLength, 1111 UErrorCode *status); 1112 1113 /** 1114 * Function type declaration for UText.copy(). 1115 * 1116 * Copy or move a substring from one position to another within the text, 1117 * while retaining any metadata associated with the text. 1118 * This function is used to duplicate or reorder substrings. 1119 * The destination index must not overlap the source range. 1120 * 1121 * The text to be copied or moved is inserted at destIndex; 1122 * it does not replace or overwrite any existing text. 1123 * 1124 * This function need only be implemented for UText types that support writing. 1125 * 1126 * When using this function, there should be only a single UText opened onto the 1127 * underlying native text string. 
The function is responsible for updating the 1128 * text chunk within the UText to reflect the updated iteration position, 1129 * taking into account any changes to the underlying string's structure caused 1130 * by the replace operation. 1131 * 1132 * @param ut The UText representing the text to be operated on. 1133 * @param nativeStart The index of the start of the region to be copied or moved 1134 * @param nativeLimit The index of the character following the region to be replaced. 1135 * @param nativeDest The destination index to which the source substring is copied or moved. 1136 * @param move If TRUE, then the substring is moved, not copied/duplicated. 1137 * @param status receives any error status. Possible errors include U_NO_WRITE_PERMISSION 1138 * 1139 * @stable ICU 3.4 1140 */ 1141 typedef void U_CALLCONV 1142 UTextCopy(UText *ut, 1143 int64_t nativeStart, int64_t nativeLimit, 1144 int64_t nativeDest, 1145 UBool move, 1146 UErrorCode *status); 1147 1148 /** 1149 * Function type declaration for UText.mapOffsetToNative(). 1150 * Map from the current UChar offset within the current text chunk to 1151 * the corresponding native index in the original source text. 1152 * 1153 * This is required only for text providers that do not use native UTF-16 indexes. 1154 * 1155 * @param ut the UText. 1156 * @return Absolute (native) index corresponding to chunkOffset in the current chunk. 1157 * The returned native index should always be to a code point boundary. 1158 * 1159 * @stable ICU 3.4 1160 */ 1161 typedef int64_t U_CALLCONV 1162 UTextMapOffsetToNative(const UText *ut); 1163 1164 /** 1165 * Function type declaration for UText.mapIndexToUTF16(). 1166 * Map from a native index to a UChar offset within a text chunk. 1167 * Behavior is undefined if the native index does not fall within the 1168 * current chunk. 1169 * 1170 * This function is required only for text providers that do not use native UTF-16 indexes. 
1171 * 1172 * @param ut The UText containing the text chunk. 1173 * @param nativeIndex Absolute (native) text index, chunk->start<=index<=chunk->limit. 1174 * @return Chunk-relative UTF-16 offset corresponding to the specified native 1175 * index. 1176 * 1177 * @stable ICU 3.4 1178 */ 1179 typedef int32_t U_CALLCONV 1180 UTextMapNativeIndexToUTF16(const UText *ut, int64_t nativeIndex); 1181 1182 1183 /** 1184 * Function type declaration for UText.utextClose(). 1185 * 1186 * A Text Provider close function is only required for provider types that make 1187 * allocations in their open function (or other functions) that must be 1188 * cleaned when the UText is closed. 1189 * 1190 * The allocation of the UText struct itself and any "extra" storage 1191 * associated with the UText is handled by the common UText implementation 1192 * and does not require provider specific cleanup in a close function. 1193 * 1194 * Most UText provider implementations do not need to implement this function. 1195 * 1196 * @param ut A UText object to be closed. 1197 * 1198 * @stable ICU 3.4 1199 */ 1200 typedef void U_CALLCONV 1201 UTextClose(UText *ut); 1202 1203 1204 /** 1205 * (public) Function dispatch table for UText. 1206 * Conceptually very much like a C++ Virtual Function Table. 1207 * This struct defines the organization of the table. 1208 * Each text provider implementation must provide an 1209 * actual table that is initialized with the appropriate functions 1210 * for the type of text being handled. 1211 * @stable ICU 3.6 1212 */ 1213 struct UTextFuncs { 1214 /** 1215 * (public) Function table size, sizeof(UTextFuncs) 1216 * Intended for use should the table grow to accommodate added 1217 * functions in the future, to allow tests for older format 1218 * function tables that do not contain the extensions. 
1219 * 1220 * Fields are placed for optimal alignment on 1221 * 32/64/128-bit-pointer machines, by normally grouping together 1222 * 4 32-bit fields, 1223 * 4 pointers, 1224 * 2 64-bit fields 1225 * in sequence. 1226 * @stable ICU 3.6 1227 */ 1228 int32_t tableSize; 1229 1230 /** 1231 * (private) Alignment padding. 1232 * Do not use, reserved for use by the UText framework only. 1233 * @internal 1234 */ 1235 int32_t reserved1, /** @internal */ reserved2, /** @internal */ reserved3; 1236 1237 1238 /** 1239 * (public) Function pointer for UTextClone 1240 * 1241 * @see UTextClone 1242 * @stable ICU 3.6 1243 */ 1244 UTextClone *clone; 1245 1246 /** 1247 * (public) function pointer for UTextLength 1248 * May be expensive to compute! 1249 * 1250 * @see UTextLength 1251 * @stable ICU 3.6 1252 */ 1253 UTextNativeLength *nativeLength; 1254 1255 /** 1256 * (public) Function pointer for UTextAccess. 1257 * 1258 * @see UTextAccess 1259 * @stable ICU 3.6 1260 */ 1261 UTextAccess *access; 1262 1263 /** 1264 * (public) Function pointer for UTextExtract. 1265 * 1266 * @see UTextExtract 1267 * @stable ICU 3.6 1268 */ 1269 UTextExtract *extract; 1270 1271 /** 1272 * (public) Function pointer for UTextReplace. 1273 * 1274 * @see UTextReplace 1275 * @stable ICU 3.6 1276 */ 1277 UTextReplace *replace; 1278 1279 /** 1280 * (public) Function pointer for UTextCopy. 1281 * 1282 * @see UTextCopy 1283 * @stable ICU 3.6 1284 */ 1285 UTextCopy *copy; 1286 1287 /** 1288 * (public) Function pointer for UTextMapOffsetToNative. 1289 * 1290 * @see UTextMapOffsetToNative 1291 * @stable ICU 3.6 1292 */ 1293 UTextMapOffsetToNative *mapOffsetToNative; 1294 1295 /** 1296 * (public) Function pointer for UTextMapNativeIndexToUTF16. 1297 * 1298 * @see UTextMapNativeIndexToUTF16 1299 * @stable ICU 3.6 1300 */ 1301 UTextMapNativeIndexToUTF16 *mapNativeIndexToUTF16; 1302 1303 /** 1304 * (public) Function pointer for UTextClose. 
1305 * 1306 * @see UTextClose 1307 * @stable ICU 3.6 1308 */ 1309 UTextClose *close; 1310 1311 /** 1312 * (private) Spare function pointer 1313 * @internal 1314 */ 1315 UTextClose *spare1; 1316 1317 /** 1318 * (private) Spare function pointer 1319 * @internal 1320 */ 1321 UTextClose *spare2; 1322 1323 /** 1324 * (private) Spare function pointer 1325 * @internal 1326 */ 1327 UTextClose *spare3; 1328 1329 }; 1330 /** 1331 * Function dispatch table for UText 1332 * @see UTextFuncs 1333 */ 1334 typedef struct UTextFuncs UTextFuncs; 1335 1336 /** 1337 * UText struct. Provides the interface between the generic UText access code 1338 * and the UText provider code that works on specific kinds of 1339 * text (UTF-8, noncontiguous UTF-16, whatever.) 1340 * 1341 * Applications that are using predefined types of text providers 1342 * to pass text data to ICU services will have no need to view the 1343 * internals of the UText structs that they open. 1344 * 1345 * @stable ICU 3.6 1346 */ 1347 struct UText { 1348 /** 1349 * (private) Magic. Used to help detect when UText functions are handed 1350 * invalid or uninitialized UText structs. 1351 * utext_openXYZ() functions take an initialized, 1352 * but not necessarily open, UText struct as an 1353 * optional fill-in parameter. This magic field 1354 * is used to check for that initialization. 1355 * Text provider close functions must NOT clear 1356 * the magic field because that would prevent 1357 * reuse of the UText struct. 1358 * @internal 1359 */ 1360 uint32_t magic; 1361 1362 1363 /** 1364 * (private) Flags for managing the allocation and freeing of 1365 * memory associated with this UText. 1366 * @internal 1367 */ 1368 int32_t flags; 1369 1370 1371 /** 1372 * Text provider properties. This set of flags is maintained by the 1373 * text provider implementation. 
1374 * @stable ICU 3.4 1375 */ 1376 int32_t providerProperties; 1377 1378 /** 1379 * (public) sizeOfStruct=sizeof(UText) 1380 * Allows possible backward compatible extension. 1381 * 1382 * @stable ICU 3.4 1383 */ 1384 int32_t sizeOfStruct; 1385 1386 /* ------ 16 byte alignment boundary ----------- */ 1387 1388 1389 /** 1390 * (protected) Native index of the first character position following 1391 * the current chunk. 1392 * @stable ICU 3.6 1393 */ 1394 int64_t chunkNativeLimit; 1395 1396 /** 1397 * (protected) Size in bytes of the extra space (pExtra). 1398 * @stable ICU 3.4 1399 */ 1400 int32_t extraSize; 1401 1402 /** 1403 * (protected) The highest chunk offset where native indexing and 1404 * chunk (UTF-16) indexing correspond. For UTF-16 sources, value 1405 * will be equal to chunkLength. 1406 * 1407 * @stable ICU 3.6 1408 */ 1409 int32_t nativeIndexingLimit; 1410 1411 /* ---- 16 byte alignment boundary------ */ 1412 1413 /** 1414 * (protected) Native index of the first character in the text chunk. 1415 * @stable ICU 3.6 1416 */ 1417 int64_t chunkNativeStart; 1418 1419 /** 1420 * (protected) Current iteration position within the text chunk (UTF-16 buffer). 1421 * This is the index to the character that will be returned by utext_next32(). 1422 * @stable ICU 3.6 1423 */ 1424 int32_t chunkOffset; 1425 1426 /** 1427 * (protected) Length the text chunk (UTF-16 buffer), in UChars. 1428 * @stable ICU 3.6 1429 */ 1430 int32_t chunkLength; 1431 1432 /* ---- 16 byte alignment boundary-- */ 1433 1434 1435 /** 1436 * (protected) pointer to a chunk of text in UTF-16 format. 1437 * May refer either to original storage of the source of the text, or 1438 * if conversion was required, to a buffer owned by the UText. 1439 * @stable ICU 3.6 1440 */ 1441 const UChar *chunkContents; 1442 1443 /** 1444 * (public) Pointer to Dispatch table for accessing functions for this UText. 
1445 * @stable ICU 3.6 1446 */ 1447 const UTextFuncs *pFuncs; 1448 1449 /** 1450 * (protected) Pointer to additional space requested by the 1451 * text provider during the utext_open operation. 1452 * @stable ICU 3.4 1453 */ 1454 void *pExtra; 1455 1456 /** 1457 * (protected) Pointer to string or text-containing object or similar. 1458 * This is the source of the text that this UText is wrapping, in a format 1459 * that is known to the text provider functions. 1460 * @stable ICU 3.4 1461 */ 1462 const void *context; 1463 1464 /* --- 16 byte alignment boundary--- */ 1465 1466 /** 1467 * (protected) Pointer fields available for use by the text provider. 1468 * Not used by UText common code. 1469 * @stable ICU 3.6 1470 */ 1471 const void *p; 1472 /** 1473 * (protected) Pointer fields available for use by the text provider. 1474 * Not used by UText common code. 1475 * @stable ICU 3.6 1476 */ 1477 const void *q; 1478 /** 1479 * (protected) Pointer fields available for use by the text provider. 1480 * Not used by UText common code. 1481 * @stable ICU 3.6 1482 */ 1483 const void *r; 1484 1485 /** 1486 * Private field reserved for future use by the UText framework 1487 * itself. This is not to be touched by the text providers. 1488 * @internal ICU 3.4 1489 */ 1490 void *privP; 1491 1492 1493 /* --- 16 byte alignment boundary--- */ 1494 1495 1496 /** 1497 * (protected) Integer field reserved for use by the text provider. 1498 * Not used by the UText framework, or by the client (user) of the UText. 1499 * @stable ICU 3.4 1500 */ 1501 int64_t a; 1502 1503 /** 1504 * (protected) Integer field reserved for use by the text provider. 1505 * Not used by the UText framework, or by the client (user) of the UText. 1506 * @stable ICU 3.4 1507 */ 1508 int32_t b; 1509 1510 /** 1511 * (protected) Integer field reserved for use by the text provider. 1512 * Not used by the UText framework, or by the client (user) of the UText. 
1513 * @stable ICU 3.4 1514 */ 1515 int32_t c; 1516 1517 /* ---- 16 byte alignment boundary---- */ 1518 1519 1520 /** 1521 * Private field reserved for future use by the UText framework 1522 * itself. This is not to be touched by the text providers. 1523 * @internal ICU 3.4 1524 */ 1525 int64_t privA; 1526 /** 1527 * Private field reserved for future use by the UText framework 1528 * itself. This is not to be touched by the text providers. 1529 * @internal ICU 3.4 1530 */ 1531 int32_t privB; 1532 /** 1533 * Private field reserved for future use by the UText framework 1534 * itself. This is not to be touched by the text providers. 1535 * @internal ICU 3.4 1536 */ 1537 int32_t privC; 1538 }; 1539 1540 1541 /** 1542 * Common function for use by Text Provider implementations to allocate and/or initialize 1543 * a new UText struct. To be called in the implementation of utext_open() functions. 1544 * If the supplied UText parameter is null, a new UText struct will be allocated on the heap. 1545 * If the supplied UText is already open, the provider's close function will be called 1546 * so that the struct can be reused by the open that is in progress. 1547 * 1548 * @param ut pointer to a UText struct to be re-used, or null if a new UText 1549 * should be allocated. 1550 * @param extraSpace The amount of additional space to be allocated as part 1551 * of this UText, for use by types of providers that require 1552 * additional storage. 1553 * @param status Errors are returned here. 1554 * @return pointer to the UText, allocated if necessary, with extra space set up if requested. 1555 * @stable ICU 3.4 1556 */ 1557 U_STABLE UText * U_EXPORT2 1558 utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status); 1559 1560 // do not use #ifndef U_HIDE_INTERNAL_API around the following! 1561 /** 1562 * @internal 1563 * Value used to help identify correctly initialized UText structs. 1564 * Note: must be publicly visible so that UTEXT_INITIALIZER can access it. 
1565 */ 1566 enum { 1567 UTEXT_MAGIC = 0x345ad82c 1568 }; 1569 1570 /** 1571 * initializer to be used with local (stack) instances of a UText 1572 * struct. UText structs must be initialized before passing 1573 * them to one of the utext_open functions. 1574 * 1575 * @stable ICU 3.6 1576 */ 1577 #define UTEXT_INITIALIZER { \ 1578 UTEXT_MAGIC, /* magic */ \ 1579 0, /* flags */ \ 1580 0, /* providerProps */ \ 1581 sizeof(UText), /* sizeOfStruct */ \ 1582 0, /* chunkNativeLimit */ \ 1583 0, /* extraSize */ \ 1584 0, /* nativeIndexingLimit */ \ 1585 0, /* chunkNativeStart */ \ 1586 0, /* chunkOffset */ \ 1587 0, /* chunkLength */ \ 1588 NULL, /* chunkContents */ \ 1589 NULL, /* pFuncs */ \ 1590 NULL, /* pExtra */ \ 1591 NULL, /* context */ \ 1592 NULL, NULL, NULL, /* p, q, r */ \ 1593 NULL, /* privP */ \ 1594 0, 0, 0, /* a, b, c */ \ 1595 0, 0, 0 /* privA,B,C, */ \ 1596 } 1597 1598 1599 U_CDECL_END 1600 1601 1602 1603 #endif 1604