1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2004-2012, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: utext.h 11 * encoding: US-ASCII 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2004oct06 16 * created by: Markus W. Scherer 17 */ 18 19 #ifndef __UTEXT_H__ 20 #define __UTEXT_H__ 21 22 /** 23 * \file 24 * \brief C API: Abstract Unicode Text API 25 * 26 * The Text Access API provides a means to allow text that is stored in alternative 27 * formats to work with ICU services. ICU normally operates on text that is 28 * stored in UTF-16 format, in (UChar *) arrays for the C APIs or as type 29 * UnicodeString for C++ APIs. 30 * 31 * ICU Text Access allows other formats, such as UTF-8 or non-contiguous 32 * UTF-16 strings, to be placed in a UText wrapper and then passed to ICU services. 33 * 34 * There are three general classes of usage for UText: 35 * 36 * Application Level Use. This is the simplest usage - applications would 37 * use one of the utext_open() functions on their input text, and pass 38 * the resulting UText to the desired ICU service. 39 * 40 * Second is usage in ICU Services, such as break iteration, that will need to 41 * operate on input presented to them as a UText. These implementations 42 * will need to use the iteration and related UText functions to gain 43 * access to the actual text. 44 * 45 * The third class of UText users are "text providers." These are the 46 * UText implementations for the various text storage formats. 
An application
 * or system with a unique text storage format can implement a set of
 * UText provider functions for that format, which will then allow
 * ICU services to operate on that format.
 *
 *
 * <em>Iterating over text</em>
 *
 * Here is sample code for a forward iteration over the contents of a UText
 *
 * \code
 *    UChar32  c;
 *    UText    *ut = whatever();
 *
 *    for (c=utext_next32From(ut, 0); c>=0; c=utext_next32(ut)) {
 *       // do whatever with the codepoint c here.
 *    }
 * \endcode
 *
 * And here is similar code to iterate in the reverse direction, from the end
 * of the text towards the beginning.
 *
 * \code
 *    UChar32  c;
 *    UText    *ut = whatever();
 *    int64_t  textLength = utext_nativeLength(ut);
 *    for (c=utext_previous32From(ut, textLength); c>=0; c=utext_previous32(ut)) {
 *       // do whatever with the codepoint c here.
 *    }
 * \endcode
 *
 * <em>Characters and Indexing</em>
 *
 * Indexing into text by UText functions is nearly always in terms of the native
 * indexing of the underlying text storage.  The storage format could be UTF-8
 * or UTF-32, for example.  When coding to the UText access API, no assumptions
 * can be made regarding the size of characters, or how far an index
 * may move when iterating between characters.
 *
 * All indices supplied to UText functions are pinned to the length of the
 * text.  An out-of-bounds index is not considered to be an error, but is
 * adjusted to be in the range  0 <= index <= length of input text.
 *
 *
 * When an index position is returned from a UText function, it will be
 * a native index to the underlying text.  In the case of multi-unit characters,
 * it will always refer to the first position of the character,
 * never to the interior.  This is essentially the same thing as saying that
 * a returned index will always point to a boundary between characters.
*
 * When a native index is supplied to a UText function, all indices that
 * refer to any part of a multi-unit character representation are considered
 * to be equivalent.  In the case of multi-unit characters, an incoming index
 * will be logically normalized to refer to the start of the character.
 *
 * It is possible to test whether a native index is on a code point boundary
 * by doing a utext_setNativeIndex() followed by a utext_getNativeIndex().
 * If the index is returned unchanged, it was on a code point boundary.  If
 * an adjusted index is returned, the original index referred to the
 * interior of a character.
 *
 * <em>Conventions for calling UText functions</em>
 *
 * Most UText access functions have as their first parameter a (UText *) pointer,
 * which specifies the UText to be used.  Unless otherwise noted, the
 * pointer must refer to a valid, open UText.  Attempting to
 * use a closed UText or passing a NULL pointer is a programming error and
 * will produce undefined results or NULL pointer exceptions.
 *
 * The utext_open family of functions can either open an existing (closed)
 * UText, or heap allocate a new UText.  Here is sample code for creating
 * a stack-allocated UText.
 *
 * \code
 *    char       *s = whatever();  // A utf-8 string
 *    UErrorCode  status = U_ZERO_ERROR;
 *    UText       ut = UTEXT_INITIALIZER;
 *    utext_openUTF8(&ut, s, -1, &status);
 *    if (U_FAILURE(status)) {
 *        // error handling
 *    } else {
 *        // work with the UText
 *    }
 * \endcode
 *
 * Any existing UText passed to an open function _must_ have been initialized,
 * either by the UTEXT_INITIALIZER, or by having been originally heap-allocated
 * by an open function.  Passing NULL will cause the open function to
 * heap-allocate and fully initialize a new UText.
*
 */



#include "unicode/utypes.h"
#include "unicode/uchar.h"
#if U_SHOW_CPLUSPLUS_API
/* C++-only headers: LocalPointerBase support for LocalUTextPointer, and the
 * Replaceable / UnicodeString / CharacterIterator classes taken by the
 * C++-only utext_open functions declared in this header. */
#include "unicode/localpointer.h"
#include "unicode/rep.h"
#include "unicode/unistr.h"
#include "unicode/chariter.h"
#endif


U_CDECL_BEGIN

struct UText;
typedef struct UText UText; /**< C typedef for struct UText. @stable ICU 3.6 */


/***************************************************************************************
 *
 *   C Functions for creating UText wrappers around various kinds of text strings.
 *
 ****************************************************************************************/


/**
 * Close function for UText instances.
 * Cleans up, releases any resources being held by an open UText.
 * <p>
 *   If the UText was originally allocated by one of the utext_open functions,
 *   the storage associated with the utext will also be freed.
 *   If the UText storage originated with the application, as it would with
 *   a local or static instance, the storage will not be deleted.
 *
 *   An open UText can be reset to refer to a new string by using one of the utext_open()
 *   functions without first closing the UText.
 *
 * @param ut  The UText to be closed.
 * @return    NULL if the UText struct was deleted by the close.  If the UText struct
 *            was originally provided by the caller to the open function, it is
 *            returned by this function, and may be safely used again in
 *            a subsequent utext_open.
 *
 * @stable ICU 3.4
 */
U_STABLE UText * U_EXPORT2
utext_close(UText *ut);

#if U_SHOW_CPLUSPLUS_API

U_NAMESPACE_BEGIN

/**
 * \class LocalUTextPointer
 * "Smart pointer" class, closes a UText via utext_close().
 * For most methods see the LocalPointerBase base class.
 *
 * @see LocalPointerBase
 * @see LocalPointer
 * @stable ICU 4.4
 */
U_DEFINE_LOCAL_OPEN_POINTER(LocalUTextPointer, UText, utext_close);

U_NAMESPACE_END

#endif

/**
 * Open a read-only UText implementation for UTF-8 strings.
 *
 * \htmlonly
 * Any invalid UTF-8 in the input will be handled in this way:
 * a sequence of bytes that has the form of a truncated, but otherwise valid,
 * UTF-8 sequence will be replaced by a single unicode replacement character, \uFFFD.
 * Any other illegal bytes will each be replaced by a \uFFFD.
 * \endhtmlonly
 *
 * @param ut     Pointer to a UText struct.  If NULL, a new UText will be created.
 *               If non-NULL, must refer to an initialized UText struct, which will then
 *               be reset to reference the specified UTF-8 string.
 * @param s      A UTF-8 string.  Must not be NULL.
 * @param length The length of the UTF-8 string in bytes, or -1 if the string is
 *               zero terminated.
 * @param status Errors are returned here.
 * @return       A pointer to the UText.  If a pre-allocated UText was provided, it
 *               will always be used and returned.
 * @stable ICU 3.4
 */
U_STABLE UText * U_EXPORT2
utext_openUTF8(UText *ut, const char *s, int64_t length, UErrorCode *status);


/**
 * Open a read-only UText for UChar * string.
 *
 * @param ut     Pointer to a UText struct.  If NULL, a new UText will be created.
 *               If non-NULL, must refer to an initialized UText struct, which will then
 *               be reset to reference the specified UChar string.
 * @param s      A UChar (UTF-16) string
 * @param length The number of UChars in the input string, or -1 if the string is
 *               zero terminated.
 * @param status Errors are returned here.
 * @return       A pointer to the UText.  If a pre-allocated UText was provided, it
 *               will always be used and returned.
* @stable ICU 3.4
 */
U_STABLE UText * U_EXPORT2
utext_openUChars(UText *ut, const UChar *s, int64_t length, UErrorCode *status);


#if U_SHOW_CPLUSPLUS_API
/**
 * Open a writable UText for a non-const UnicodeString.
 *
 * @param ut      Pointer to a UText struct.  If NULL, a new UText will be created.
 *                If non-NULL, must refer to an initialized UText struct, which will then
 *                be reset to reference the specified input string.
 * @param s       A UnicodeString.
 * @param status  Errors are returned here.
 * @return        Pointer to the UText.  If a UText was supplied as input, this
 *                will always be used and returned.
 * @stable ICU 3.4
 */
U_STABLE UText * U_EXPORT2
utext_openUnicodeString(UText *ut, icu::UnicodeString *s, UErrorCode *status);


/**
 * Open a UText for a const UnicodeString.  The resulting UText will not be writable.
 *
 * @param ut      Pointer to a UText struct.  If NULL, a new UText will be created.
 *                If non-NULL, must refer to an initialized UText struct, which will then
 *                be reset to reference the specified input string.
 * @param s       A const UnicodeString to be wrapped.
 * @param status  Errors are returned here.
 * @return        Pointer to the UText.  If a UText was supplied as input, this
 *                will always be used and returned.
 * @stable ICU 3.4
 */
U_STABLE UText * U_EXPORT2
utext_openConstUnicodeString(UText *ut, const icu::UnicodeString *s, UErrorCode *status);


/**
 * Open a writable UText implementation for an ICU Replaceable object.
 * @param ut      Pointer to a UText struct.  If NULL, a new UText will be created.
 *                If non-NULL, must refer to an already existing UText, which will then
 *                be reset to reference the specified replaceable text.
 * @param rep     A Replaceable text object.
 * @param status  Errors are returned here.
 * @return        Pointer to the UText.  If a UText was supplied as input, this
 *                will always be used and returned.
 * @see Replaceable
 * @stable ICU 3.4
 */
U_STABLE UText * U_EXPORT2
utext_openReplaceable(UText *ut, icu::Replaceable *rep, UErrorCode *status);

/**
 * Open a UText implementation over an ICU CharacterIterator.
 * @param ut      Pointer to a UText struct.  If NULL, a new UText will be created.
 *                If non-NULL, must refer to an already existing UText, which will then
 *                be reset to reference the specified character iterator.
 * @param ci      A Character Iterator.
 * @param status  Errors are returned here.
 * @return        Pointer to the UText.  If a UText was supplied as input, this
 *                will always be used and returned.
 * @see CharacterIterator
 * @stable ICU 3.4
 */
U_STABLE UText * U_EXPORT2
utext_openCharacterIterator(UText *ut, icu::CharacterIterator *ci, UErrorCode *status);

#endif


/**
 * Clone a UText.  This is much like opening a UText where the source text is itself
 * another UText.
 *
 * A deep clone will copy both the UText data structures and the underlying text.
 * The original and cloned UText will operate completely independently; modifications
 * made to the text in one will not affect the other.  Text providers are not
 * required to support deep clones.  The user of clone() must check the status return
 * and be prepared to handle failures.
 *
 * The standard UText implementations for UTF8, UChar *, UnicodeString and
 * Replaceable all support deep cloning.
 *
 * The UText returned from a deep clone will be writable, assuming that the text
 * provider is able to support writing, even if the source UText had been made
 * non-writable by means of utext_freeze().
 *
 * A shallow clone replicates only the UText data structures; it does not make
 * a copy of the underlying text.
Shallow clones can be used as an efficient way to 333 * have multiple iterators active in a single text string that is not being 334 * modified. 335 * 336 * A shallow clone operation will not fail, barring truly exceptional conditions such 337 * as memory allocation failures. 338 * 339 * Shallow UText clones should be avoided if the UText functions that modify the 340 * text are expected to be used, either on the original or the cloned UText. 341 * Any such modifications can cause unpredictable behavior. Read Only 342 * shallow clones provide some protection against errors of this type by 343 * disabling text modification via the cloned UText. 344 * 345 * A shallow clone made with the readOnly parameter == FALSE will preserve the 346 * utext_isWritable() state of the source object. Note, however, that 347 * write operations must be avoided while more than one UText exists that refer 348 * to the same underlying text. 349 * 350 * A UText and its clone may be safely concurrently accessed by separate threads. 351 * This is true for read access only with shallow clones, and for both read and 352 * write access with deep clones. 353 * It is the responsibility of the Text Provider to ensure that this thread safety 354 * constraint is met. 355 * 356 * @param dest A UText struct to be filled in with the result of the clone operation, 357 * or NULL if the clone function should heap-allocate a new UText struct. 358 * If non-NULL, must refer to an already existing UText, which will then 359 * be reset to become the clone. 360 * @param src The UText to be cloned. 361 * @param deep TRUE to request a deep clone, FALSE for a shallow clone. 362 * @param readOnly TRUE to request that the cloned UText have read only access to the 363 * underlying text. 364 365 * @param status Errors are returned here. For deep clones, U_UNSUPPORTED_ERROR 366 * will be returned if the text provider is unable to clone the 367 * original text. 
* @return The newly created clone, or NULL if the clone operation failed.
 * @stable ICU 3.4
 */
U_STABLE UText * U_EXPORT2
utext_clone(UText *dest, const UText *src, UBool deep, UBool readOnly, UErrorCode *status);


/**
 * Compare two UText objects for equality.
 * UTexts are equal if they are iterating over the same text, and
 * have the same iteration position within the text.
 * If either or both of the parameters are NULL, the comparison is FALSE.
 *
 * @param a   The first of the two UTexts to compare.
 * @param b   The other UText to be compared.
 * @return    TRUE if the two UTexts are equal.
 * @stable ICU 3.6
 */
U_STABLE UBool U_EXPORT2
utext_equals(const UText *a, const UText *b);


/*****************************************************************************
 *
 *   Functions to work with the text represented by a UText wrapper
 *
 *****************************************************************************/

/**
 * Get the length of the text.  Depending on the characteristics
 * of the underlying text representation, this may be expensive.
 * @see utext_isLengthExpensive()
 *
 *
 * @param ut  the text to be accessed.
 * @return the length of the text, expressed in native units.
 *
 * @stable ICU 3.4
 */
U_STABLE int64_t U_EXPORT2
utext_nativeLength(UText *ut);

/**
 * Return TRUE if calculating the length of the text could be expensive.
 * Finding the length of NUL terminated strings is considered to be expensive.
 *
 * Note that the value of this function may change
 * as the result of other operations on a UText.
 * Once the length of a string has been discovered, it will no longer
 * be expensive to report it.
 *
 * @param ut the text to be accessed.
 * @return TRUE if determining the length of the text could be time consuming.
 * @stable ICU 3.4
 */
U_STABLE UBool U_EXPORT2
utext_isLengthExpensive(const UText *ut);

/**
 * Returns the code point at the requested index,
 * or U_SENTINEL (-1) if it is out of bounds.
 *
 * If the specified index points to the interior of a multi-unit
 * character - one of the trail bytes of a UTF-8 sequence, for example -
 * the complete code point will be returned.
 *
 * The iteration position will be set to the start of the returned code point.
 *
 * This function is roughly equivalent to the sequence
 *    utext_setNativeIndex(index);
 *    utext_current32();
 * (There is a subtle difference if the index is out of bounds by being less than zero -
 * utext_setNativeIndex(negative value) sets the index to zero, after which utext_current32()
 * will return the char at zero.  utext_char32At(negative index), on the other hand, will
 * return the U_SENTINEL value of -1.)
 *
 * @param ut the text to be accessed
 * @param nativeIndex the native index of the character to be accessed.  If the index points
 *        to other than the first unit of a multi-unit character, it will be adjusted
 *        to the start of the character.
 * @return the code point at the specified index.
 * @stable ICU 3.4
 */
U_STABLE UChar32 U_EXPORT2
utext_char32At(UText *ut, int64_t nativeIndex);


/**
 *
 * Get the code point at the current iteration position,
 * or U_SENTINEL (-1) if the iteration has reached the end of
 * the input text.
 *
 * @param ut the text to be accessed.
 * @return the Unicode code point at the current iterator position.
 * @stable ICU 3.4
 */
U_STABLE UChar32 U_EXPORT2
utext_current32(UText *ut);


/**
 * Get the code point at the current iteration position of the UText, and
 * advance the position to the first index following the character.
*
 * If the position is at the end of the text (the index following
 * the last character, which is also the length of the text),
 * return U_SENTINEL (-1) and do not advance the index.
 *
 * This is a post-increment operation.
 *
 * An inline macro version of this function, UTEXT_NEXT32(),
 * is available for performance critical use.
 *
 * @param ut the text to be accessed.
 * @return the Unicode code point at the iteration position.
 * @see UTEXT_NEXT32
 * @stable ICU 3.4
 */
U_STABLE UChar32 U_EXPORT2
utext_next32(UText *ut);


/**
 * Move the iterator position to the character (code point) whose
 * index precedes the current position, and return that character.
 * This is a pre-decrement operation.
 *
 * If the initial position is at the start of the text (index of 0)
 * return U_SENTINEL (-1), and leave the position unchanged.
 *
 * An inline macro version of this function, UTEXT_PREVIOUS32(),
 * is available for performance critical use.
 *
 * @param ut the text to be accessed.
 * @return the previous UChar32 code point, or U_SENTINEL (-1)
 *         if the iteration has reached the start of the text.
 * @see UTEXT_PREVIOUS32
 * @stable ICU 3.4
 */
U_STABLE UChar32 U_EXPORT2
utext_previous32(UText *ut);


/**
 * Set the iteration index and return the code point at that index.
 * Leave the iteration index at the start of the following code point.
 *
 * This function is the most efficient and convenient way to
 * begin a forward iteration.  The results are identical to those
 * from the sequence
 * \code
 *    utext_setNativeIndex();
 *    utext_next32();
 * \endcode
 *
 * @param ut the text to be accessed.
 * @param nativeIndex Iteration index, in the native units of the text provider.
 * @return Code point which starts at or before index,
 *         or U_SENTINEL (-1) if it is out of bounds.
 * @stable ICU 3.4
 */
U_STABLE UChar32 U_EXPORT2
utext_next32From(UText *ut, int64_t nativeIndex);



/**
 * Set the iteration index, and return the code point preceding the
 * one specified by the initial index.  Leave the iteration position
 * at the start of the returned code point.
 *
 * This function is the most efficient and convenient way to
 * begin a backwards iteration.
 *
 * @param ut the text to be accessed.
 * @param nativeIndex Iteration index in the native units of the text provider.
 * @return Code point preceding the one at the initial index,
 *         or U_SENTINEL (-1) if it is out of bounds.
 *
 * @stable ICU 3.4
 */
U_STABLE UChar32 U_EXPORT2
utext_previous32From(UText *ut, int64_t nativeIndex);

/**
 * Get the current iterator position, which can range from 0 to
 * the length of the text.
 * The position is a native index into the input text, in whatever format it
 * may have (possibly UTF-8 for example), and may not always be the same as
 * the corresponding UChar (UTF-16) index.
 * The returned position will always be aligned to a code point boundary.
 *
 * @param ut the text to be accessed.
 * @return the current index position, in the native units of the text provider.
 * @stable ICU 3.4
 */
U_STABLE int64_t U_EXPORT2
utext_getNativeIndex(const UText *ut);

/**
 * Set the current iteration position to the nearest code point
 * boundary at or preceding the specified index.
 * The index is in the native units of the original input text.
 * If the index is out of range, it will be pinned to be within
 * the range of the input text.
 * <p>
 * It will usually be more efficient to begin an iteration
 * using the functions utext_next32From() or utext_previous32From()
 * rather than utext_setNativeIndex().
* <p>
 * Moving the index position to an adjacent character is best done
 * with utext_next32(), utext_previous32() or utext_moveIndex32().
 * Attempting to do direct arithmetic on the index position is
 * complicated by the fact that the size (in native units) of a
 * character depends on the underlying representation of the character
 * (UTF-8, UTF-16, UTF-32, arbitrary codepage), and is not
 * easily knowable.
 *
 * @param ut the text to be accessed.
 * @param nativeIndex the native unit index of the new iteration position.
 * @stable ICU 3.4
 */
U_STABLE void U_EXPORT2
utext_setNativeIndex(UText *ut, int64_t nativeIndex);

/**
 * Move the iterator position by delta code points.  The number of code points
 * is a signed number; a negative delta will move the iterator backwards,
 * towards the start of the text.
 * <p>
 * The index is moved by <code>delta</code> code points
 * forward or backward, but no further backward than to 0 and
 * no further forward than to utext_nativeLength().
 * The resulting index value will be in between 0 and length, inclusive.
 *
 * @param ut the text to be accessed.
 * @param delta the signed number of code points to move the iteration position.
 * @return TRUE if the position could be moved the requested number of positions while
 *              staying within the range [0 - text length].
 * @stable ICU 3.4
 */
U_STABLE UBool U_EXPORT2
utext_moveIndex32(UText *ut, int32_t delta);

/**
 * Get the native index of the character preceding the current position.
 * If the iteration position is already at the start of the text, zero
 * is returned.
 * The value returned is the same as that obtained from the following sequence,
 * but without the side effect of changing the iteration position.
 *
 * \code
 *    UText  *ut = whatever();
 *      ...
 *    utext_previous32(ut);
 *    utext_getNativeIndex(ut);
 * \endcode
 *
 * This function is most useful during forwards iteration, where it will get the
 * native index of the character most recently returned from utext_next32().
 *
 * @param ut the text to be accessed
 * @return the native index of the character preceding the current index position,
 *         or zero if the current position is at the start of the text.
 * @stable ICU 3.6
 */
U_STABLE int64_t U_EXPORT2
utext_getPreviousNativeIndex(UText *ut);


/**
 *
 * Extract text from a UText into a UChar buffer.  The range of text to be extracted
 * is specified in the native indices of the UText provider.  These may not necessarily
 * be UTF-16 indices.
 * <p>
 * The size (number of 16 bit UChars) of the data to be extracted is returned.  The
 * full number of UChars is returned, even when the extracted text is truncated
 * because the specified buffer size is too small.
 * <p>
 * The extracted string will (if you are a user) / must (if you are a text provider)
 * be NUL-terminated if there is sufficient space in the destination buffer.  This
 * terminating NUL is not included in the returned length.
 * <p>
 * The iteration index is left at the position following the last extracted character.
 *
 * @param  ut    the UText from which to extract data.
 * @param  nativeStart the native index of the first character to extract.
 *               If the specified index is out of range,
 *               it will be pinned to be within 0 <= index <= textLength
 * @param  nativeLimit the native string index of the position following the last
 *               character to extract.  If the specified index is out of range,
 *               it will be pinned to be within 0 <= index <= textLength.
 *               nativeLimit must be >= nativeStart.
663 * @param dest the UChar (UTF-16) buffer into which the extracted text is placed 664 * @param destCapacity The size, in UChars, of the destination buffer. May be zero 665 * for precomputing the required size. 666 * @param status receives any error status. 667 * U_BUFFER_OVERFLOW_ERROR: the extracted text was truncated because the 668 * buffer was too small. Returns number of UChars for preflighting. 669 * @return Number of UChars in the data to be extracted. Does not include a trailing NUL. 670 * 671 * @stable ICU 3.4 672 */ 673 U_STABLE int32_t U_EXPORT2 674 utext_extract(UText *ut, 675 int64_t nativeStart, int64_t nativeLimit, 676 UChar *dest, int32_t destCapacity, 677 UErrorCode *status); 678 679 680 681 /************************************************************************************ 682 * 683 * #define inline versions of selected performance-critical text access functions 684 * Caution: do not use auto increment++ or decrement-- expressions 685 * as parameters to these macros. 686 * 687 * For most use, where there is no extreme performance constraint, the 688 * normal, non-inline functions are a better choice. The resulting code 689 * will be smaller, and, if the need ever arises, easier to debug. 690 * 691 * These are implemented as #defines rather than real functions 692 * because there is no fully portable way to do inline functions in plain C. 693 * 694 ************************************************************************************/ 695 696 #ifndef U_HIDE_INTERNAL_API 697 /** 698 * inline version of utext_current32(), for performance-critical situations. 699 * 700 * Get the code point at the current iteration position of the UText. 701 * Returns U_SENTINEL (-1) if the position is at the end of the 702 * text. 703 * 704 * @internal ICU 4.4 technology preview 705 */ 706 #define UTEXT_CURRENT32(ut) \ 707 ((ut)->chunkOffset < (ut)->chunkLength && ((ut)->chunkContents)[(ut)->chunkOffset]<0xd800 ? 
\ 708 ((ut)->chunkContents)[((ut)->chunkOffset)] : utext_current32(ut)) 709 #endif /* U_HIDE_INTERNAL_API */ 710 711 /** 712 * inline version of utext_next32(), for performance-critical situations. 713 * 714 * Get the code point at the current iteration position of the UText, and 715 * advance the position to the first index following the character. 716 * This is a post-increment operation. 717 * Returns U_SENTINEL (-1) if the position is at the end of the 718 * text. 719 * 720 * @stable ICU 3.4 721 */ 722 #define UTEXT_NEXT32(ut) \ 723 ((ut)->chunkOffset < (ut)->chunkLength && ((ut)->chunkContents)[(ut)->chunkOffset]<0xd800 ? \ 724 ((ut)->chunkContents)[((ut)->chunkOffset)++] : utext_next32(ut)) 725 726 /** 727 * inline version of utext_previous32(), for performance-critical situations. 728 * 729 * Move the iterator position to the character (code point) whose 730 * index precedes the current position, and return that character. 731 * This is a pre-decrement operation. 732 * Returns U_SENTINEL (-1) if the position is at the start of the text. 733 * 734 * @stable ICU 3.4 735 */ 736 #define UTEXT_PREVIOUS32(ut) \ 737 ((ut)->chunkOffset > 0 && \ 738 (ut)->chunkContents[(ut)->chunkOffset-1] < 0xd800 ? \ 739 (ut)->chunkContents[--((ut)->chunkOffset)] : utext_previous32(ut)) 740 741 /** 742 * inline version of utext_getNativeIndex(), for performance-critical situations. 743 * 744 * Get the current iterator position, which can range from 0 to 745 * the length of the text. 746 * The position is a native index into the input text, in whatever format it 747 * may have (possibly UTF-8 for example), and may not always be the same as 748 * the corresponding UChar (UTF-16) index. 749 * The returned position will always be aligned to a code point boundary. 750 * 751 * @stable ICU 3.6 752 */ 753 #define UTEXT_GETNATIVEINDEX(ut) \ 754 ((ut)->chunkOffset <= (ut)->nativeIndexingLimit? 
\ 755 (ut)->chunkNativeStart+(ut)->chunkOffset : \ 756 (ut)->pFuncs->mapOffsetToNative(ut)) 757 758 /** 759 * inline version of utext_setNativeIndex(), for performance-critical situations. 760 * 761 * Set the current iteration position to the nearest code point 762 * boundary at or preceding the specified index. 763 * The index is in the native units of the original input text. 764 * If the index is out of range, it will be pinned to be within 765 * the range of the input text. 766 * 767 * @stable ICU 3.8 768 */ 769 #define UTEXT_SETNATIVEINDEX(ut, ix) \ 770 { int64_t __offset = (ix) - (ut)->chunkNativeStart; \ 771 if (__offset>=0 && __offset<=(int64_t)(ut)->nativeIndexingLimit) { \ 772 (ut)->chunkOffset=(int32_t)__offset; \ 773 } else { \ 774 utext_setNativeIndex((ut), (ix)); } } 775 776 777 778 /************************************************************************************ 779 * 780 * Functions related to writing or modifying the text. 781 * These will work only with modifiable UTexts. Attempting to 782 * modify a read-only UText will return an error status. 783 * 784 ************************************************************************************/ 785 786 787 /** 788 * Return TRUE if the text can be written (modified) with utext_replace() or 789 * utext_copy(). For the text to be writable, the text provider must 790 * be of a type that supports writing and the UText must not be frozen. 791 * 792 * Attempting to modify text when utext_isWriteable() is FALSE will fail - 793 * the text will not be modified, and an error will be returned from the function 794 * that attempted the modification. 795 * 796 * @param ut the UText to be tested. 797 * @return TRUE if the text is modifiable. 798 * 799 * @see utext_freeze() 800 * @see utext_replace() 801 * @see utext_copy() 802 * @stable ICU 3.4 803 * 804 */ 805 U_STABLE UBool U_EXPORT2 806 utext_isWritable(const UText *ut); 807 808 809 /** 810 * Test whether there is meta data associated with the text. 
811 * @see Replaceable::hasMetaData() 812 * 813 * @param ut The UText to be tested 814 * @return TRUE if the underlying text includes meta data. 815 * @stable ICU 3.4 816 */ 817 U_STABLE UBool U_EXPORT2 818 utext_hasMetaData(const UText *ut); 819 820 821 /** 822 * Replace a range of the original text with a replacement text. 823 * 824 * Leaves the current iteration position at the position following the 825 * newly inserted replacement text. 826 * 827 * This function is only available on UText types that support writing, 828 * that is, ones where utext_isWritable() returns TRUE. 829 * 830 * When using this function, there should be only a single UText opened onto the 831 * underlying native text string. Behavior after a replace operation 832 * on a UText is undefined for any other additional UTexts that refer to the 833 * modified string. 834 * 835 * @param ut the UText representing the text to be operated on. 836 * @param nativeStart the native index of the start of the region to be replaced 837 * @param nativeLimit the native index of the character following the region to be replaced. 838 * @param replacementText pointer to the replacement text 839 * @param replacementLength length of the replacement text, or -1 if the text is NUL terminated. 840 * @param status receives any error status. Possible errors include 841 * U_NO_WRITE_PERMISSION 842 * 843 * @return The signed number of (native) storage units by which 844 * the length of the text expanded or contracted. 845 * 846 * @stable ICU 3.4 847 */ 848 U_STABLE int32_t U_EXPORT2 849 utext_replace(UText *ut, 850 int64_t nativeStart, int64_t nativeLimit, 851 const UChar *replacementText, int32_t replacementLength, 852 UErrorCode *status); 853 854 855 856 /** 857 * 858 * Copy or move a substring from one position to another within the text, 859 * while retaining any metadata associated with the text. 860 * This function is used to duplicate or reorder substrings. 
861 * The destination index must not overlap the source range. 862 * 863 * The text to be copied or moved is inserted at destIndex; 864 * it does not replace or overwrite any existing text. 865 * 866 * The iteration position is left following the newly inserted text 867 * at the destination position. 868 * 869 * This function is only available on UText types that support writing, 870 * that is, ones where utext_isWritable() returns TRUE. 871 * 872 * When using this function, there should be only a single UText opened onto the 873 * underlying native text string. Behavior after a copy operation 874 * on a UText is undefined in any other additional UTexts that refer to the 875 * modified string. 876 * 877 * @param ut The UText representing the text to be operated on. 878 * @param nativeStart The native index of the start of the region to be copied or moved 879 * @param nativeLimit The native index of the character position following the region 880 * to be copied. 881 * @param destIndex The native destination index to which the source substring is 882 * copied or moved. 883 * @param move If TRUE, then the substring is moved, not copied/duplicated. 884 * @param status receives any error status. Possible errors include U_NO_WRITE_PERMISSION 885 * 886 * @stable ICU 3.4 887 */ 888 U_STABLE void U_EXPORT2 889 utext_copy(UText *ut, 890 int64_t nativeStart, int64_t nativeLimit, 891 int64_t destIndex, 892 UBool move, 893 UErrorCode *status); 894 895 896 /** 897 * <p> 898 * Freeze a UText. This prevents any modification to the underlying text itself 899 * by means of functions operating on this UText. 900 * </p> 901 * <p> 902 * Once frozen, a UText can not be unfrozen. The intent is to ensure 903 * that a the text underlying a frozen UText wrapper cannot be modified via that UText. 
904 * </p> 905 * <p> 906 * Caution: freezing a UText will disable changes made via the specific 907 * frozen UText wrapper only; it will not have any effect on the ability to 908 * directly modify the text by bypassing the UText. Any such backdoor modifications 909 * are always an error while UText access is occuring because the underlying 910 * text can get out of sync with UText's buffering. 911 * </p> 912 * 913 * @param ut The UText to be frozen. 914 * @see utext_isWritable() 915 * @stable ICU 3.6 916 */ 917 U_STABLE void U_EXPORT2 918 utext_freeze(UText *ut); 919 920 921 /** 922 * UText provider properties (bit field indexes). 923 * 924 * @see UText 925 * @stable ICU 3.4 926 */ 927 enum { 928 /** 929 * It is potentially time consuming for the provider to determine the length of the text. 930 * @stable ICU 3.4 931 */ 932 UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE = 1, 933 /** 934 * Text chunks remain valid and usable until the text object is modified or 935 * deleted, not just until the next time the access() function is called 936 * (which is the default). 937 * @stable ICU 3.4 938 */ 939 UTEXT_PROVIDER_STABLE_CHUNKS = 2, 940 /** 941 * The provider supports modifying the text via the replace() and copy() 942 * functions. 943 * @see Replaceable 944 * @stable ICU 3.4 945 */ 946 UTEXT_PROVIDER_WRITABLE = 3, 947 /** 948 * There is meta data associated with the text. 949 * @see Replaceable::hasMetaData() 950 * @stable ICU 3.4 951 */ 952 UTEXT_PROVIDER_HAS_META_DATA = 4, 953 /** 954 * Text provider owns the text storage. 955 * Generally occurs as the result of a deep clone of the UText. 956 * When closing the UText, the associated text must 957 * also be closed/deleted/freed/ whatever is appropriate. 958 * @stable ICU 3.6 959 */ 960 UTEXT_PROVIDER_OWNS_TEXT = 5 961 }; 962 963 /** 964 * Function type declaration for UText.clone(). 965 * 966 * clone a UText. Much like opening a UText where the source text is itself 967 * another UText. 
968 * 969 * A deep clone will copy both the UText data structures and the underlying text. 970 * The original and cloned UText will operate completely independently; modifications 971 * made to the text in one will not effect the other. Text providers are not 972 * required to support deep clones. The user of clone() must check the status return 973 * and be prepared to handle failures. 974 * 975 * A shallow clone replicates only the UText data structures; it does not make 976 * a copy of the underlying text. Shallow clones can be used as an efficient way to 977 * have multiple iterators active in a single text string that is not being 978 * modified. 979 * 980 * A shallow clone operation must not fail except for truly exceptional conditions such 981 * as memory allocation failures. 982 * 983 * A UText and its clone may be safely concurrently accessed by separate threads. 984 * This is true for both shallow and deep clones. 985 * It is the responsibility of the Text Provider to ensure that this thread safety 986 * constraint is met. 987 988 * 989 * @param dest A UText struct to be filled in with the result of the clone operation, 990 * or NULL if the clone function should heap-allocate a new UText struct. 991 * @param src The UText to be cloned. 992 * @param deep TRUE to request a deep clone, FALSE for a shallow clone. 993 * @param status Errors are returned here. For deep clones, U_UNSUPPORTED_ERROR 994 * should be returned if the text provider is unable to clone the 995 * original text. 996 * @return The newly created clone, or NULL if the clone operation failed. 997 * 998 * @stable ICU 3.4 999 */ 1000 typedef UText * U_CALLCONV 1001 UTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status); 1002 1003 1004 /** 1005 * Function type declaration for UText.nativeLength(). 1006 * 1007 * @param ut the UText to get the length of. 1008 * @return the length, in the native units of the original text string. 
1009 * @see UText 1010 * @stable ICU 3.4 1011 */ 1012 typedef int64_t U_CALLCONV 1013 UTextNativeLength(UText *ut); 1014 1015 /** 1016 * Function type declaration for UText.access(). Get the description of the text chunk 1017 * containing the text at a requested native index. The UText's iteration 1018 * position will be left at the requested index. If the index is out 1019 * of bounds, the iteration position will be left at the start or end 1020 * of the string, as appropriate. 1021 * 1022 * Chunks must begin and end on code point boundaries. A single code point 1023 * comprised of multiple storage units must never span a chunk boundary. 1024 * 1025 * 1026 * @param ut the UText being accessed. 1027 * @param nativeIndex Requested index of the text to be accessed. 1028 * @param forward If TRUE, then the returned chunk must contain text 1029 * starting from the index, so that start<=index<limit. 1030 * If FALSE, then the returned chunk must contain text 1031 * before the index, so that start<index<=limit. 1032 * @return True if the requested index could be accessed. The chunk 1033 * will contain the requested text. 1034 * False value if a chunk cannot be accessed 1035 * (the requested index is out of bounds). 1036 * 1037 * @see UText 1038 * @stable ICU 3.4 1039 */ 1040 typedef UBool U_CALLCONV 1041 UTextAccess(UText *ut, int64_t nativeIndex, UBool forward); 1042 1043 /** 1044 * Function type declaration for UText.extract(). 1045 * 1046 * Extract text from a UText into a UChar buffer. The range of text to be extracted 1047 * is specified in the native indices of the UText provider. These may not necessarily 1048 * be UTF-16 indices. 1049 * <p> 1050 * The size (number of 16 bit UChars) in the data to be extracted is returned. The 1051 * full amount is returned, even when the specified buffer size is smaller. 
1052 * <p> 1053 * The extracted string will (if you are a user) / must (if you are a text provider) 1054 * be NUL-terminated if there is sufficient space in the destination buffer. 1055 * 1056 * @param ut the UText from which to extract data. 1057 * @param nativeStart the native index of the first characer to extract. 1058 * @param nativeLimit the native string index of the position following the last 1059 * character to extract. 1060 * @param dest the UChar (UTF-16) buffer into which the extracted text is placed 1061 * @param destCapacity The size, in UChars, of the destination buffer. May be zero 1062 * for precomputing the required size. 1063 * @param status receives any error status. 1064 * If U_BUFFER_OVERFLOW_ERROR: Returns number of UChars for 1065 * preflighting. 1066 * @return Number of UChars in the data. Does not include a trailing NUL. 1067 * 1068 * @stable ICU 3.4 1069 */ 1070 typedef int32_t U_CALLCONV 1071 UTextExtract(UText *ut, 1072 int64_t nativeStart, int64_t nativeLimit, 1073 UChar *dest, int32_t destCapacity, 1074 UErrorCode *status); 1075 1076 /** 1077 * Function type declaration for UText.replace(). 1078 * 1079 * Replace a range of the original text with a replacement text. 1080 * 1081 * Leaves the current iteration position at the position following the 1082 * newly inserted replacement text. 1083 * 1084 * This function need only be implemented on UText types that support writing. 1085 * 1086 * When using this function, there should be only a single UText opened onto the 1087 * underlying native text string. The function is responsible for updating the 1088 * text chunk within the UText to reflect the updated iteration position, 1089 * taking into account any changes to the underlying string's structure caused 1090 * by the replace operation. 1091 * 1092 * @param ut the UText representing the text to be operated on. 
1093 * @param nativeStart the index of the start of the region to be replaced 1094 * @param nativeLimit the index of the character following the region to be replaced. 1095 * @param replacementText pointer to the replacement text 1096 * @param replacmentLength length of the replacement text in UChars, or -1 if the text is NUL terminated. 1097 * @param status receives any error status. Possible errors include 1098 * U_NO_WRITE_PERMISSION 1099 * 1100 * @return The signed number of (native) storage units by which 1101 * the length of the text expanded or contracted. 1102 * 1103 * @stable ICU 3.4 1104 */ 1105 typedef int32_t U_CALLCONV 1106 UTextReplace(UText *ut, 1107 int64_t nativeStart, int64_t nativeLimit, 1108 const UChar *replacementText, int32_t replacmentLength, 1109 UErrorCode *status); 1110 1111 /** 1112 * Function type declaration for UText.copy(). 1113 * 1114 * Copy or move a substring from one position to another within the text, 1115 * while retaining any metadata associated with the text. 1116 * This function is used to duplicate or reorder substrings. 1117 * The destination index must not overlap the source range. 1118 * 1119 * The text to be copied or moved is inserted at destIndex; 1120 * it does not replace or overwrite any existing text. 1121 * 1122 * This function need only be implemented for UText types that support writing. 1123 * 1124 * When using this function, there should be only a single UText opened onto the 1125 * underlying native text string. The function is responsible for updating the 1126 * text chunk within the UText to reflect the updated iteration position, 1127 * taking into account any changes to the underlying string's structure caused 1128 * by the replace operation. 1129 * 1130 * @param ut The UText representing the text to be operated on. 1131 * @param nativeStart The index of the start of the region to be copied or moved 1132 * @param nativeLimit The index of the character following the region to be replaced. 
1133 * @param nativeDest The destination index to which the source substring is copied or moved. 1134 * @param move If TRUE, then the substring is moved, not copied/duplicated. 1135 * @param status receives any error status. Possible errors include U_NO_WRITE_PERMISSION 1136 * 1137 * @stable ICU 3.4 1138 */ 1139 typedef void U_CALLCONV 1140 UTextCopy(UText *ut, 1141 int64_t nativeStart, int64_t nativeLimit, 1142 int64_t nativeDest, 1143 UBool move, 1144 UErrorCode *status); 1145 1146 /** 1147 * Function type declaration for UText.mapOffsetToNative(). 1148 * Map from the current UChar offset within the current text chunk to 1149 * the corresponding native index in the original source text. 1150 * 1151 * This is required only for text providers that do not use native UTF-16 indexes. 1152 * 1153 * @param ut the UText. 1154 * @return Absolute (native) index corresponding to chunkOffset in the current chunk. 1155 * The returned native index should always be to a code point boundary. 1156 * 1157 * @stable ICU 3.4 1158 */ 1159 typedef int64_t U_CALLCONV 1160 UTextMapOffsetToNative(const UText *ut); 1161 1162 /** 1163 * Function type declaration for UText.mapIndexToUTF16(). 1164 * Map from a native index to a UChar offset within a text chunk. 1165 * Behavior is undefined if the native index does not fall within the 1166 * current chunk. 1167 * 1168 * This function is required only for text providers that do not use native UTF-16 indexes. 1169 * 1170 * @param ut The UText containing the text chunk. 1171 * @param nativeIndex Absolute (native) text index, chunk->start<=index<=chunk->limit. 1172 * @return Chunk-relative UTF-16 offset corresponding to the specified native 1173 * index. 1174 * 1175 * @stable ICU 3.4 1176 */ 1177 typedef int32_t U_CALLCONV 1178 UTextMapNativeIndexToUTF16(const UText *ut, int64_t nativeIndex); 1179 1180 1181 /** 1182 * Function type declaration for UText.utextClose(). 
1183 * 1184 * A Text Provider close function is only required for provider types that make 1185 * allocations in their open function (or other functions) that must be 1186 * cleaned when the UText is closed. 1187 * 1188 * The allocation of the UText struct itself and any "extra" storage 1189 * associated with the UText is handled by the common UText implementation 1190 * and does not require provider specific cleanup in a close function. 1191 * 1192 * Most UText provider implementations do not need to implement this function. 1193 * 1194 * @param ut A UText object to be closed. 1195 * 1196 * @stable ICU 3.4 1197 */ 1198 typedef void U_CALLCONV 1199 UTextClose(UText *ut); 1200 1201 1202 /** 1203 * (public) Function dispatch table for UText. 1204 * Conceptually very much like a C++ Virtual Function Table. 1205 * This struct defines the organization of the table. 1206 * Each text provider implementation must provide an 1207 * actual table that is initialized with the appropriate functions 1208 * for the type of text being handled. 1209 * @stable ICU 3.6 1210 */ 1211 struct UTextFuncs { 1212 /** 1213 * (public) Function table size, sizeof(UTextFuncs) 1214 * Intended for use should the table grow to accomodate added 1215 * functions in the future, to allow tests for older format 1216 * function tables that do not contain the extensions. 1217 * 1218 * Fields are placed for optimal alignment on 1219 * 32/64/128-bit-pointer machines, by normally grouping together 1220 * 4 32-bit fields, 1221 * 4 pointers, 1222 * 2 64-bit fields 1223 * in sequence. 1224 * @stable ICU 3.6 1225 */ 1226 int32_t tableSize; 1227 1228 /** 1229 * (private) Alignment padding. 1230 * Do not use, reserved for use by the UText framework only. 
1231 * @internal 1232 */ 1233 int32_t reserved1, /** @internal */ reserved2, /** @internal */ reserved3; 1234 1235 1236 /** 1237 * (public) Function pointer for UTextClone 1238 * 1239 * @see UTextClone 1240 * @stable ICU 3.6 1241 */ 1242 UTextClone *clone; 1243 1244 /** 1245 * (public) function pointer for UTextLength 1246 * May be expensive to compute! 1247 * 1248 * @see UTextLength 1249 * @stable ICU 3.6 1250 */ 1251 UTextNativeLength *nativeLength; 1252 1253 /** 1254 * (public) Function pointer for UTextAccess. 1255 * 1256 * @see UTextAccess 1257 * @stable ICU 3.6 1258 */ 1259 UTextAccess *access; 1260 1261 /** 1262 * (public) Function pointer for UTextExtract. 1263 * 1264 * @see UTextExtract 1265 * @stable ICU 3.6 1266 */ 1267 UTextExtract *extract; 1268 1269 /** 1270 * (public) Function pointer for UTextReplace. 1271 * 1272 * @see UTextReplace 1273 * @stable ICU 3.6 1274 */ 1275 UTextReplace *replace; 1276 1277 /** 1278 * (public) Function pointer for UTextCopy. 1279 * 1280 * @see UTextCopy 1281 * @stable ICU 3.6 1282 */ 1283 UTextCopy *copy; 1284 1285 /** 1286 * (public) Function pointer for UTextMapOffsetToNative. 1287 * 1288 * @see UTextMapOffsetToNative 1289 * @stable ICU 3.6 1290 */ 1291 UTextMapOffsetToNative *mapOffsetToNative; 1292 1293 /** 1294 * (public) Function pointer for UTextMapNativeIndexToUTF16. 1295 * 1296 * @see UTextMapNativeIndexToUTF16 1297 * @stable ICU 3.6 1298 */ 1299 UTextMapNativeIndexToUTF16 *mapNativeIndexToUTF16; 1300 1301 /** 1302 * (public) Function pointer for UTextClose. 
1303 * 1304 * @see UTextClose 1305 * @stable ICU 3.6 1306 */ 1307 UTextClose *close; 1308 1309 /** 1310 * (private) Spare function pointer 1311 * @internal 1312 */ 1313 UTextClose *spare1; 1314 1315 /** 1316 * (private) Spare function pointer 1317 * @internal 1318 */ 1319 UTextClose *spare2; 1320 1321 /** 1322 * (private) Spare function pointer 1323 * @internal 1324 */ 1325 UTextClose *spare3; 1326 1327 }; 1328 /** 1329 * Function dispatch table for UText 1330 * @see UTextFuncs 1331 */ 1332 typedef struct UTextFuncs UTextFuncs; 1333 1334 /** 1335 * UText struct. Provides the interface between the generic UText access code 1336 * and the UText provider code that works on specific kinds of 1337 * text (UTF-8, noncontiguous UTF-16, whatever.) 1338 * 1339 * Applications that are using predefined types of text providers 1340 * to pass text data to ICU services will have no need to view the 1341 * internals of the UText structs that they open. 1342 * 1343 * @stable ICU 3.6 1344 */ 1345 struct UText { 1346 /** 1347 * (private) Magic. Used to help detect when UText functions are handed 1348 * invalid or unitialized UText structs. 1349 * utext_openXYZ() functions take an initialized, 1350 * but not necessarily open, UText struct as an 1351 * optional fill-in parameter. This magic field 1352 * is used to check for that initialization. 1353 * Text provider close functions must NOT clear 1354 * the magic field because that would prevent 1355 * reuse of the UText struct. 1356 * @internal 1357 */ 1358 uint32_t magic; 1359 1360 1361 /** 1362 * (private) Flags for managing the allocation and freeing of 1363 * memory associated with this UText. 1364 * @internal 1365 */ 1366 int32_t flags; 1367 1368 1369 /** 1370 * Text provider properties. This set of flags is maintainted by the 1371 * text provider implementation. 
1372 * @stable ICU 3.4 1373 */ 1374 int32_t providerProperties; 1375 1376 /** 1377 * (public) sizeOfStruct=sizeof(UText) 1378 * Allows possible backward compatible extension. 1379 * 1380 * @stable ICU 3.4 1381 */ 1382 int32_t sizeOfStruct; 1383 1384 /* ------ 16 byte alignment boundary ----------- */ 1385 1386 1387 /** 1388 * (protected) Native index of the first character position following 1389 * the current chunk. 1390 * @stable ICU 3.6 1391 */ 1392 int64_t chunkNativeLimit; 1393 1394 /** 1395 * (protected) Size in bytes of the extra space (pExtra). 1396 * @stable ICU 3.4 1397 */ 1398 int32_t extraSize; 1399 1400 /** 1401 * (protected) The highest chunk offset where native indexing and 1402 * chunk (UTF-16) indexing correspond. For UTF-16 sources, value 1403 * will be equal to chunkLength. 1404 * 1405 * @stable ICU 3.6 1406 */ 1407 int32_t nativeIndexingLimit; 1408 1409 /* ---- 16 byte alignment boundary------ */ 1410 1411 /** 1412 * (protected) Native index of the first character in the text chunk. 1413 * @stable ICU 3.6 1414 */ 1415 int64_t chunkNativeStart; 1416 1417 /** 1418 * (protected) Current iteration position within the text chunk (UTF-16 buffer). 1419 * This is the index to the character that will be returned by utext_next32(). 1420 * @stable ICU 3.6 1421 */ 1422 int32_t chunkOffset; 1423 1424 /** 1425 * (protected) Length the text chunk (UTF-16 buffer), in UChars. 1426 * @stable ICU 3.6 1427 */ 1428 int32_t chunkLength; 1429 1430 /* ---- 16 byte alignment boundary-- */ 1431 1432 1433 /** 1434 * (protected) pointer to a chunk of text in UTF-16 format. 1435 * May refer either to original storage of the source of the text, or 1436 * if conversion was required, to a buffer owned by the UText. 1437 * @stable ICU 3.6 1438 */ 1439 const UChar *chunkContents; 1440 1441 /** 1442 * (public) Pointer to Dispatch table for accessing functions for this UText. 
1443 * @stable ICU 3.6 1444 */ 1445 const UTextFuncs *pFuncs; 1446 1447 /** 1448 * (protected) Pointer to additional space requested by the 1449 * text provider during the utext_open operation. 1450 * @stable ICU 3.4 1451 */ 1452 void *pExtra; 1453 1454 /** 1455 * (protected) Pointer to string or text-containin object or similar. 1456 * This is the source of the text that this UText is wrapping, in a format 1457 * that is known to the text provider functions. 1458 * @stable ICU 3.4 1459 */ 1460 const void *context; 1461 1462 /* --- 16 byte alignment boundary--- */ 1463 1464 /** 1465 * (protected) Pointer fields available for use by the text provider. 1466 * Not used by UText common code. 1467 * @stable ICU 3.6 1468 */ 1469 const void *p; 1470 /** 1471 * (protected) Pointer fields available for use by the text provider. 1472 * Not used by UText common code. 1473 * @stable ICU 3.6 1474 */ 1475 const void *q; 1476 /** 1477 * (protected) Pointer fields available for use by the text provider. 1478 * Not used by UText common code. 1479 * @stable ICU 3.6 1480 */ 1481 const void *r; 1482 1483 /** 1484 * Private field reserved for future use by the UText framework 1485 * itself. This is not to be touched by the text providers. 1486 * @internal ICU 3.4 1487 */ 1488 void *privP; 1489 1490 1491 /* --- 16 byte alignment boundary--- */ 1492 1493 1494 /** 1495 * (protected) Integer field reserved for use by the text provider. 1496 * Not used by the UText framework, or by the client (user) of the UText. 1497 * @stable ICU 3.4 1498 */ 1499 int64_t a; 1500 1501 /** 1502 * (protected) Integer field reserved for use by the text provider. 1503 * Not used by the UText framework, or by the client (user) of the UText. 1504 * @stable ICU 3.4 1505 */ 1506 int32_t b; 1507 1508 /** 1509 * (protected) Integer field reserved for use by the text provider. 1510 * Not used by the UText framework, or by the client (user) of the UText. 
1511 * @stable ICU 3.4 1512 */ 1513 int32_t c; 1514 1515 /* ---- 16 byte alignment boundary---- */ 1516 1517 1518 /** 1519 * Private field reserved for future use by the UText framework 1520 * itself. This is not to be touched by the text providers. 1521 * @internal ICU 3.4 1522 */ 1523 int64_t privA; 1524 /** 1525 * Private field reserved for future use by the UText framework 1526 * itself. This is not to be touched by the text providers. 1527 * @internal ICU 3.4 1528 */ 1529 int32_t privB; 1530 /** 1531 * Private field reserved for future use by the UText framework 1532 * itself. This is not to be touched by the text providers. 1533 * @internal ICU 3.4 1534 */ 1535 int32_t privC; 1536 }; 1537 1538 1539 /** 1540 * Common function for use by Text Provider implementations to allocate and/or initialize 1541 * a new UText struct. To be called in the implementation of utext_open() functions. 1542 * If the supplied UText parameter is null, a new UText struct will be allocated on the heap. 1543 * If the supplied UText is already open, the provider's close function will be called 1544 * so that the struct can be reused by the open that is in progress. 1545 * 1546 * @param ut pointer to a UText struct to be re-used, or null if a new UText 1547 * should be allocated. 1548 * @param extraSpace The amount of additional space to be allocated as part 1549 * of this UText, for use by types of providers that require 1550 * additional storage. 1551 * @param status Errors are returned here. 1552 * @return pointer to the UText, allocated if necessary, with extra space set up if requested. 1553 * @stable ICU 3.4 1554 */ 1555 U_STABLE UText * U_EXPORT2 1556 utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status); 1557 1558 #ifndef U_HIDE_INTERNAL_API 1559 /** 1560 * @internal 1561 * Value used to help identify correctly initialized UText structs. 1562 * Note: must be publicly visible so that UTEXT_INITIALIZER can access it. 
1563 */ 1564 enum { 1565 UTEXT_MAGIC = 0x345ad82c 1566 }; 1567 #endif /* U_HIDE_INTERNAL_API */ 1568 1569 /** 1570 * initializer to be used with local (stack) instances of a UText 1571 * struct. UText structs must be initialized before passing 1572 * them to one of the utext_open functions. 1573 * 1574 * @stable ICU 3.6 1575 */ 1576 #define UTEXT_INITIALIZER { \ 1577 UTEXT_MAGIC, /* magic */ \ 1578 0, /* flags */ \ 1579 0, /* providerProps */ \ 1580 sizeof(UText), /* sizeOfStruct */ \ 1581 0, /* chunkNativeLimit */ \ 1582 0, /* extraSize */ \ 1583 0, /* nativeIndexingLimit */ \ 1584 0, /* chunkNativeStart */ \ 1585 0, /* chunkOffset */ \ 1586 0, /* chunkLength */ \ 1587 NULL, /* chunkContents */ \ 1588 NULL, /* pFuncs */ \ 1589 NULL, /* pExtra */ \ 1590 NULL, /* context */ \ 1591 NULL, NULL, NULL, /* p, q, r */ \ 1592 NULL, /* privP */ \ 1593 0, 0, 0, /* a, b, c */ \ 1594 0, 0, 0 /* privA,B,C, */ \ 1595 } 1596 1597 1598 U_CDECL_END 1599 1600 1601 1602 #endif 1603