1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2004-2010, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: utext.h 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2004oct06 14 * created by: Markus W. Scherer 15 */ 16 17 #ifndef __UTEXT_H__ 18 #define __UTEXT_H__ 19 20 /** 21 * \file 22 * \brief C API: Abstract Unicode Text API 23 * 24 * The Text Access API provides a means to allow text that is stored in alternative 25 * formats to work with ICU services. ICU normally operates on text that is 26 * stored in UTF-16 format, in (UChar *) arrays for the C APIs or as type 27 * UnicodeString for C++ APIs. 28 * 29 * ICU Text Access allows other formats, such as UTF-8 or non-contiguous 30 * UTF-16 strings, to be placed in a UText wrapper and then passed to ICU services. 31 * 32 * There are three general classes of usage for UText: 33 * 34 * Application Level Use. This is the simplest usage - applications would 35 * use one of the utext_open() functions on their input text, and pass 36 * the resulting UText to the desired ICU service. 37 * 38 * Second is usage in ICU Services, such as break iteration, that will need to 39 * operate on input presented to them as a UText. These implementations 40 * will need to use the iteration and related UText functions to gain 41 * access to the actual text. 42 * 43 * The third class of UText users are "text providers." These are the 44 * UText implementations for the various text storage formats. An application 45 * or system with a unique text storage format can implement a set of 46 * UText provider functions for that format, which will then allow 47 * ICU services to operate on that format. 
48 * 49 * 50 * <em>Iterating over text</em> 51 * 52 * Here is sample code for a forward iteration over the contents of a UText 53 * 54 * \code 55 * UChar32 c; 56 * UText *ut = whatever(); 57 * 58 * for (c=utext_next32From(ut, 0); c>=0; c=utext_next32(ut)) { 59 * // do whatever with the codepoint c here. 60 * } 61 * \endcode 62 * 63 * And here is similar code to iterate in the reverse direction, from the end 64 * of the text towards the beginning. 65 * 66 * \code 67 * UChar32 c; 68 * UText *ut = whatever(); 69 * int64_t textLength = utext_nativeLength(ut); 70 * for (c=utext_previous32From(ut, textLength); c>=0; c=utext_previous32(ut)) { 71 * // do whatever with the codepoint c here. 72 * } 73 * \endcode 74 * 75 * <em>Characters and Indexing</em> 76 * 77 * Indexing into text by UText functions is nearly always in terms of the native 78 * indexing of the underlying text storage. The storage format could be UTF-8 79 * or UTF-32, for example. When coding to the UText access API, no assumptions 80 * can be made regarding the size of characters, or how far an index 81 * may move when iterating between characters. 82 * 83 * All indices supplied to UText functions are pinned to the length of the 84 * text. An out-of-bounds index is not considered to be an error, but is 85 * adjusted to be in the range 0 <= index <= length of input text. 86 * 87 * 88 * When an index position is returned from a UText function, it will be 89 * a native index to the underlying text. In the case of multi-unit characters, 90 * it will always refer to the first position of the character, 91 * never to the interior. This is essentially the same thing as saying that 92 * a returned index will always point to a boundary between characters. 93 * 94 * When a native index is supplied to a UText function, all indices that 95 * refer to any part of a multi-unit character representation are considered 96 * to be equivalent.
In the case of multi-unit characters, an incoming index 97 * will be logically normalized to refer to the start of the character. 98 * 99 * It is possible to test whether a native index is on a code point boundary 100 * by doing a utext_setNativeIndex() followed by a utext_getNativeIndex(). 101 * If the index is returned unchanged, it was on a code point boundary. If 102 * an adjusted index is returned, the original index referred to the 103 * interior of a character. 104 * 105 * <em>Conventions for calling UText functions</em> 106 * 107 * Most UText access functions have as their first parameter a (UText *) pointer, 108 * which specifies the UText to be used. Unless otherwise noted, the 109 * pointer must refer to a valid, open UText. Attempting to 110 * use a closed UText or passing a NULL pointer is a programming error and 111 * will produce undefined results or NULL pointer exceptions. 112 * 113 * The utext_open family of functions can either open an existing (closed) 114 * UText, or heap allocate a new UText. Here is sample code for creating 115 * a stack-allocated UText. 116 * 117 * \code 118 * char *s = whatever(); // A utf-8 string 119 * UErrorCode status = U_ZERO_ERROR; 120 * UText ut = UTEXT_INITIALIZER; 121 * utext_openUTF8(&ut, s, -1, &status); 122 * if (U_FAILURE(status)) { 123 * // error handling 124 * } else { 125 * // work with the UText 126 * } 127 * \endcode 128 * 129 * Any existing UText passed to an open function _must_ have been initialized, 130 * either by the UTEXT_INITIALIZER, or by having been originally heap-allocated 131 * by an open function. Passing NULL will cause the open function to 132 * heap-allocate and fully initialize a new UText.
133 * 134 */ 135 136 137 138 #include "unicode/utypes.h" 139 #include "unicode/uchar.h" 140 #if U_SHOW_CPLUSPLUS_API 141 #include "unicode/localpointer.h" 142 #include "unicode/rep.h" 143 #include "unicode/unistr.h" 144 #include "unicode/chariter.h" 145 #endif 146 147 148 U_CDECL_BEGIN 149 150 struct UText; 151 typedef struct UText UText; /**< C typedef for struct UText. @stable ICU 3.6 */ 152 153 154 /*************************************************************************************** 155 * 156 * C Functions for creating UText wrappers around various kinds of text strings. 157 * 158 ****************************************************************************************/ 159 160 161 /** 162 * Close function for UText instances. 163 * Cleans up, releases any resources being held by an open UText. 164 * <p> 165 * If the UText was originally allocated by one of the utext_open functions, 166 * the storage associated with the utext will also be freed. 167 * If the UText storage originated with the application, as it would with 168 * a local or static instance, the storage will not be deleted. 169 * 170 * An open UText can be reset to refer to a new string by using one of the utext_open() 171 * functions without first closing the UText. 172 * 173 * @param ut The UText to be closed. 174 * @return NULL if the UText struct was deleted by the close. If the UText struct 175 * was originally provided by the caller to the open function, it is 176 * returned by this function, and may be safely used again in 177 * a subsequent utext_open. 178 * 179 * @stable ICU 3.4 180 */ 181 U_STABLE UText * U_EXPORT2 182 utext_close(UText *ut); 183 184 #if U_SHOW_CPLUSPLUS_API 185 186 U_NAMESPACE_BEGIN 187 188 /** 189 * \class LocalUTextPointer 190 * "Smart pointer" class, closes a UText via utext_close(). 191 * For most methods see the LocalPointerBase base class.
192 * 193 * @see LocalPointerBase 194 * @see LocalPointer 195 * @stable ICU 4.4 196 */ 197 U_DEFINE_LOCAL_OPEN_POINTER(LocalUTextPointer, UText, utext_close); 198 199 U_NAMESPACE_END 200 201 #endif 202 203 /** 204 * Open a read-only UText implementation for UTF-8 strings. 205 * 206 * \htmlonly 207 * Any invalid UTF-8 in the input will be handled in this way: 208 * a sequence of bytes that has the form of a truncated, but otherwise valid, 209 * UTF-8 sequence will be replaced by a single Unicode replacement character, \uFFFD. 210 * Any other illegal bytes will each be replaced by a \uFFFD. 211 * \endhtmlonly 212 * 213 * @param ut Pointer to a UText struct. If NULL, a new UText will be created. 214 * If non-NULL, must refer to an initialized UText struct, which will then 215 * be reset to reference the specified UTF-8 string. 216 * @param s A UTF-8 string. Must not be NULL. 217 * @param length The length of the UTF-8 string in bytes, or -1 if the string is 218 * zero terminated. 219 * @param status Errors are returned here. 220 * @return A pointer to the UText. If a pre-allocated UText was provided, it 221 * will always be used and returned. 222 * @stable ICU 3.4 223 */ 224 U_STABLE UText * U_EXPORT2 225 utext_openUTF8(UText *ut, const char *s, int64_t length, UErrorCode *status); 226 227 228 /** 229 * Open a read-only UText for a UChar * string. 230 * 231 * @param ut Pointer to a UText struct. If NULL, a new UText will be created. 232 * If non-NULL, must refer to an initialized UText struct, which will then 233 * be reset to reference the specified UChar string. 234 * @param s A UChar (UTF-16) string 235 * @param length The number of UChars in the input string, or -1 if the string is 236 * zero terminated. 237 * @param status Errors are returned here. 238 * @return A pointer to the UText. If a pre-allocated UText was provided, it 239 * will always be used and returned.
240 * @stable ICU 3.4 241 */ 242 U_STABLE UText * U_EXPORT2 243 utext_openUChars(UText *ut, const UChar *s, int64_t length, UErrorCode *status); 244 245 246 #if U_SHOW_CPLUSPLUS_API 247 /** 248 * Open a writable UText for a non-const UnicodeString. 249 * 250 * @param ut Pointer to a UText struct. If NULL, a new UText will be created. 251 * If non-NULL, must refer to an initialized UText struct, which will then 252 * be reset to reference the specified input string. 253 * @param s A UnicodeString. 254 * @param status Errors are returned here. 255 * @return Pointer to the UText. If a UText was supplied as input, this 256 * will always be used and returned. 257 * @stable ICU 3.4 258 */ 259 U_STABLE UText * U_EXPORT2 260 utext_openUnicodeString(UText *ut, U_NAMESPACE_QUALIFIER UnicodeString *s, UErrorCode *status); 261 262 263 /** 264 * Open a UText for a const UnicodeString. The resulting UText will not be writable. 265 * 266 * @param ut Pointer to a UText struct. If NULL, a new UText will be created. 267 * If non-NULL, must refer to an initialized UText struct, which will then 268 * be reset to reference the specified input string. 269 * @param s A const UnicodeString to be wrapped. 270 * @param status Errors are returned here. 271 * @return Pointer to the UText. If a UText was supplied as input, this 272 * will always be used and returned. 273 * @stable ICU 3.4 274 */ 275 U_STABLE UText * U_EXPORT2 276 utext_openConstUnicodeString(UText *ut, const U_NAMESPACE_QUALIFIER UnicodeString *s, UErrorCode *status); 277 278 279 /** 280 * Open a writable UText implementation for an ICU Replaceable object. 281 * @param ut Pointer to a UText struct. If NULL, a new UText will be created. 282 * If non-NULL, must refer to an initialized UText struct, which will then 283 * be reset to reference the specified replaceable text. 284 * @param rep A Replaceable text object. 285 * @param status Errors are returned here. 286 * @return Pointer to the UText.
If a UText was supplied as input, this 287 * will always be used and returned. 288 * @see Replaceable 289 * @stable ICU 3.4 290 */ 291 U_STABLE UText * U_EXPORT2 292 utext_openReplaceable(UText *ut, U_NAMESPACE_QUALIFIER Replaceable *rep, UErrorCode *status); 293 294 /** 295 * Open a UText implementation over an ICU CharacterIterator. 296 * @param ut Pointer to a UText struct. If NULL, a new UText will be created. 297 * If non-NULL, must refer to an already existing UText, which will then 298 * be reset to reference the specified character iterator. 299 * @param ic A Character Iterator. 300 * @param status Errors are returned here. 301 * @return Pointer to the UText. If a UText was supplied as input, this 302 * will always be used and returned. 303 * @see CharacterIterator 304 * @stable ICU 3.4 305 */ 306 U_STABLE UText * U_EXPORT2 307 utext_openCharacterIterator(UText *ut, U_NAMESPACE_QUALIFIER CharacterIterator *ic, UErrorCode *status); 308 309 #endif 310 311 312 /** 313 * Clone a UText. This is much like opening a UText where the source text is itself 314 * another UText. 315 * 316 * A deep clone will copy both the UText data structures and the underlying text. 317 * The original and cloned UText will operate completely independently; modifications 318 * made to the text in one will not affect the other. Text providers are not 319 * required to support deep clones. The user of clone() must check the status return 320 * and be prepared to handle failures. 321 * 322 * The standard UText implementations for UTF8, UChar *, UnicodeString and 323 * Replaceable all support deep cloning. 324 * 325 * The UText returned from a deep clone will be writable, assuming that the text 326 * provider is able to support writing, even if the source UText had been made 327 * non-writable by means of UText_freeze(). 328 * 329 * A shallow clone replicates only the UText data structures; it does not make 330 * a copy of the underlying text.
Shallow clones can be used as an efficient way to 331 * have multiple iterators active in a single text string that is not being 332 * modified. 333 * 334 * A shallow clone operation will not fail, barring truly exceptional conditions such 335 * as memory allocation failures. 336 * 337 * Shallow UText clones should be avoided if the UText functions that modify the 338 * text are expected to be used, either on the original or the cloned UText. 339 * Any such modifications can cause unpredictable behavior. Read Only 340 * shallow clones provide some protection against errors of this type by 341 * disabling text modification via the cloned UText. 342 * 343 * A shallow clone made with the readOnly parameter == FALSE will preserve the 344 * utext_isWritable() state of the source object. Note, however, that 345 * write operations must be avoided while more than one UText exists that refers 346 * to the same underlying text. 347 * 348 * A UText and its clone may be safely concurrently accessed by separate threads. 349 * This is true for read access only with shallow clones, and for both read and 350 * write access with deep clones. 351 * It is the responsibility of the Text Provider to ensure that this thread safety 352 * constraint is met. 353 * 354 * @param dest A UText struct to be filled in with the result of the clone operation, 355 * or NULL if the clone function should heap-allocate a new UText struct. 356 * If non-NULL, must refer to an already existing UText, which will then 357 * be reset to become the clone. 358 * @param src The UText to be cloned. 359 * @param deep TRUE to request a deep clone, FALSE for a shallow clone. 360 * @param readOnly TRUE to request that the cloned UText have read only access to the 361 * underlying text. 362 * 363 * @param status Errors are returned here. For deep clones, U_UNSUPPORTED_ERROR 364 * will be returned if the text provider is unable to clone the 365 * original text.
366 * @return The newly created clone, or NULL if the clone operation failed. 367 * @stable ICU 3.4 368 */ 369 U_STABLE UText * U_EXPORT2 370 utext_clone(UText *dest, const UText *src, UBool deep, UBool readOnly, UErrorCode *status); 371 372 373 /** 374 * Compare two UText objects for equality. 375 * UTexts are equal if they are iterating over the same text, and 376 * have the same iteration position within the text. 377 * If either or both of the parameters are NULL, the comparison is FALSE. 378 * 379 * @param a The first of the two UTexts to compare. 380 * @param b The other UText to be compared. 381 * @return TRUE if the two UTexts are equal. 382 * @stable ICU 3.6 383 */ 384 U_STABLE UBool U_EXPORT2 385 utext_equals(const UText *a, const UText *b); 386 387 388 /***************************************************************************** 389 * 390 * Functions to work with the text represented by a UText wrapper 391 * 392 *****************************************************************************/ 393 394 /** 395 * Get the length of the text. Depending on the characteristics 396 * of the underlying text representation, this may be expensive. 397 * @see utext_isLengthExpensive() 398 * 399 * 400 * @param ut the text to be accessed. 401 * @return the length of the text, expressed in native units. 402 * 403 * @stable ICU 3.4 404 */ 405 U_STABLE int64_t U_EXPORT2 406 utext_nativeLength(UText *ut); 407 408 /** 409 * Return TRUE if calculating the length of the text could be expensive. 410 * Finding the length of NUL terminated strings is considered to be expensive. 411 * 412 * Note that the value of this function may change 413 * as the result of other operations on a UText. 414 * Once the length of a string has been discovered, it will no longer 415 * be expensive to report it. 416 * 417 * @param ut the text to be accessed. 418 * @return TRUE if determining the length of the text could be time consuming.
419 * @stable ICU 3.4 420 */ 421 U_STABLE UBool U_EXPORT2 422 utext_isLengthExpensive(const UText *ut); 423 424 /** 425 * Returns the code point at the requested index, 426 * or U_SENTINEL (-1) if it is out of bounds. 427 * 428 * If the specified index points to the interior of a multi-unit 429 * character - one of the trail bytes of a UTF-8 sequence, for example - 430 * the complete code point will be returned. 431 * 432 * The iteration position will be set to the start of the returned code point. 433 * 434 * This function is roughly equivalent to the sequence 435 * utext_setNativeIndex(index); 436 * utext_current32(); 437 * (There is a subtle difference if the index is out of bounds by being less than zero - 438 * utext_setNativeIndex(negative value) sets the index to zero, after which utext_current32() 439 * will return the char at zero. utext_char32At(negative index), on the other hand, will 440 * return the U_SENTINEL value of -1.) 441 * 442 * @param ut the text to be accessed 443 * @param nativeIndex the native index of the character to be accessed. If the index points 444 * to other than the first unit of a multi-unit character, it will be adjusted 445 * to the start of the character. 446 * @return the code point at the specified index. 447 * @stable ICU 3.4 448 */ 449 U_STABLE UChar32 U_EXPORT2 450 utext_char32At(UText *ut, int64_t nativeIndex); 451 452 453 /** 454 * 455 * Get the code point at the current iteration position, 456 * or U_SENTINEL (-1) if the iteration has reached the end of 457 * the input text. 458 * 459 * @param ut the text to be accessed. 460 * @return the Unicode code point at the current iterator position. 461 * @stable ICU 3.4 462 */ 463 U_STABLE UChar32 U_EXPORT2 464 utext_current32(UText *ut); 465 466 467 /** 468 * Get the code point at the current iteration position of the UText, and 469 * advance the position to the first index following the character.
470 * 471 * If the position is at the end of the text (the index following 472 * the last character, which is also the length of the text), 473 * return U_SENTINEL (-1) and do not advance the index. 474 * 475 * This is a post-increment operation. 476 * 477 * An inline macro version of this function, UTEXT_NEXT32(), 478 * is available for performance critical use. 479 * 480 * @param ut the text to be accessed. 481 * @return the Unicode code point at the iteration position. 482 * @see UTEXT_NEXT32 483 * @stable ICU 3.4 484 */ 485 U_STABLE UChar32 U_EXPORT2 486 utext_next32(UText *ut); 487 488 489 /** 490 * Move the iterator position to the character (code point) whose 491 * index precedes the current position, and return that character. 492 * This is a pre-decrement operation. 493 * 494 * If the initial position is at the start of the text (index of 0) 495 * return U_SENTINEL (-1), and leave the position unchanged. 496 * 497 * An inline macro version of this function, UTEXT_PREVIOUS32(), 498 * is available for performance critical use. 499 * 500 * @param ut the text to be accessed. 501 * @return the previous UChar32 code point, or U_SENTINEL (-1) 502 * if the iteration has reached the start of the text. 503 * @see UTEXT_PREVIOUS32 504 * @stable ICU 3.4 505 */ 506 U_STABLE UChar32 U_EXPORT2 507 utext_previous32(UText *ut); 508 509 510 /** 511 * Set the iteration index and return the code point at that index. 512 * Leave the iteration index at the start of the following code point. 513 * 514 * This function is the most efficient and convenient way to 515 * begin a forward iteration. The results are identical to those 516 * from the sequence 517 * \code 518 * utext_setNativeIndex(); 519 * utext_next32(); 520 * \endcode 521 * 522 * @param ut the text to be accessed. 523 * @param nativeIndex Iteration index, in the native units of the text provider. 524 * @return Code point which starts at or before index, 525 * or U_SENTINEL (-1) if it is out of bounds.
526 * @stable ICU 3.4 527 */ 528 U_STABLE UChar32 U_EXPORT2 529 utext_next32From(UText *ut, int64_t nativeIndex); 530 531 532 533 /** 534 * Set the iteration index, and return the code point preceding the 535 * one specified by the initial index. Leave the iteration position 536 * at the start of the returned code point. 537 * 538 * This function is the most efficient and convenient way to 539 * begin a backwards iteration. 540 * 541 * @param ut the text to be accessed. 542 * @param nativeIndex Iteration index in the native units of the text provider. 543 * @return Code point preceding the one at the initial index, 544 * or U_SENTINEL (-1) if it is out of bounds. 545 * 546 * @stable ICU 3.4 547 */ 548 U_STABLE UChar32 U_EXPORT2 549 utext_previous32From(UText *ut, int64_t nativeIndex); 550 551 /** 552 * Get the current iterator position, which can range from 0 to 553 * the length of the text. 554 * The position is a native index into the input text, in whatever format it 555 * may have (possibly UTF-8 for example), and may not always be the same as 556 * the corresponding UChar (UTF-16) index. 557 * The returned position will always be aligned to a code point boundary. 558 * 559 * @param ut the text to be accessed. 560 * @return the current index position, in the native units of the text provider. 561 * @stable ICU 3.4 562 */ 563 U_STABLE int64_t U_EXPORT2 564 utext_getNativeIndex(const UText *ut); 565 566 /** 567 * Set the current iteration position to the nearest code point 568 * boundary at or preceding the specified index. 569 * The index is in the native units of the original input text. 570 * If the index is out of range, it will be pinned to be within 571 * the range of the input text. 572 * <p> 573 * It will usually be more efficient to begin an iteration 574 * using the functions utext_next32From() or utext_previous32From() 575 * rather than utext_setNativeIndex().
576 * <p> 577 * Moving the index position to an adjacent character is best done 578 * with utext_next32(), utext_previous32() or utext_moveIndex32(). 579 * Attempting to do direct arithmetic on the index position is 580 * complicated by the fact that the size (in native units) of a 581 * character depends on the underlying representation of the character 582 * (UTF-8, UTF-16, UTF-32, arbitrary codepage), and is not 583 * easily knowable. 584 * 585 * @param ut the text to be accessed. 586 * @param nativeIndex the native unit index of the new iteration position. 587 * @stable ICU 3.4 588 */ 589 U_STABLE void U_EXPORT2 590 utext_setNativeIndex(UText *ut, int64_t nativeIndex); 591 592 /** 593 * Move the iterator position by delta code points. The number of code points 594 * is a signed number; a negative delta will move the iterator backwards, 595 * towards the start of the text. 596 * <p> 597 * The index is moved by <code>delta</code> code points 598 * forward or backward, but no further backward than to 0 and 599 * no further forward than to utext_nativeLength(). 600 * The resulting index value will be in between 0 and length, inclusive. 601 * 602 * @param ut the text to be accessed. 603 * @param delta the signed number of code points to move the iteration position. 604 * @return TRUE if the position could be moved the requested number of positions while 605 * staying within the range [0 - text length]. 606 * @stable ICU 3.4 607 */ 608 U_STABLE UBool U_EXPORT2 609 utext_moveIndex32(UText *ut, int32_t delta); 610 611 /** 612 * Get the native index of the character preceding the current position. 613 * If the iteration position is already at the start of the text, zero 614 * is returned. 615 * The value returned is the same as that obtained from the following sequence, 616 * but without the side effect of changing the iteration position. 617 * 618 * \code 619 * UText *ut = whatever; 620 * ...
621 * utext_previous32(ut); 622 * utext_getNativeIndex(ut); 623 * \endcode 624 * 625 * This function is most useful during forwards iteration, where it will get the 626 * native index of the character most recently returned from utext_next32(). 627 * 628 * @param ut the text to be accessed 629 * @return the native index of the character preceding the current index position, 630 * or zero if the current position is at the start of the text. 631 * @stable ICU 3.6 632 */ 633 U_STABLE int64_t U_EXPORT2 634 utext_getPreviousNativeIndex(UText *ut); 635 636 637 /** 638 * 639 * Extract text from a UText into a UChar buffer. The range of text to be extracted 640 * is specified in the native indices of the UText provider. These may not necessarily 641 * be UTF-16 indices. 642 * <p> 643 * The size (number of 16 bit UChars) of the data to be extracted is returned. The 644 * full number of UChars is returned, even when the extracted text is truncated 645 * because the specified buffer size is too small. 646 * <p> 647 * The extracted string will (if you are a user) / must (if you are a text provider) 648 * be NUL-terminated if there is sufficient space in the destination buffer. This 649 * terminating NUL is not included in the returned length. 650 * <p> 651 * The iteration index is left at the position following the last extracted character. 652 * 653 * @param ut the UText from which to extract data. 654 * @param nativeStart the native index of the first character to extract. 655 * If the specified index is out of range, 656 * it will be pinned to be within 0 <= index <= textLength 657 * @param nativeLimit the native string index of the position following the last 658 * character to extract. If the specified index is out of range, 659 * it will be pinned to be within 0 <= index <= textLength. 660 * nativeLimit must be >= nativeStart.
661 * @param dest the UChar (UTF-16) buffer into which the extracted text is placed 662 * @param destCapacity The size, in UChars, of the destination buffer. May be zero 663 * for precomputing the required size. 664 * @param status receives any error status. 665 * U_BUFFER_OVERFLOW_ERROR: the extracted text was truncated because the 666 * buffer was too small. Returns number of UChars for preflighting. 667 * @return Number of UChars in the data to be extracted. Does not include a trailing NUL. 668 * 669 * @stable ICU 3.4 670 */ 671 U_STABLE int32_t U_EXPORT2 672 utext_extract(UText *ut, 673 int64_t nativeStart, int64_t nativeLimit, 674 UChar *dest, int32_t destCapacity, 675 UErrorCode *status); 676 677 678 /** 679 * Compare two UTexts (binary order). The comparison begins at each source text's 680 * iteration position. The iteration position of each UText will be left following 681 * the last character compared. 682 * 683 * The comparison is done in code point order; unlike u_strCompare, you 684 * cannot choose to use code unit order. This is because the characters 685 * in a UText are accessed one code point at a time, and may not be from a UTF-16 686 * context. 687 * 688 * This function works with strings of different explicitly specified lengths 689 * unlike the ANSI C-like u_strcmp() and u_memcmp() etc. 690 * A length argument of -1 signifies that as much of the string should be used as 691 * is necessary to compare with the other string. If both length arguments are -1, 692 * the entire remaining portions of both strings are used. 693 * 694 * @param s1 First source string. 695 * @param length1 Length of first source string in UTF-32 code points. 696 * 697 * @param s2 Second source string. 698 * @param length2 Length of second source string in UTF-32 code points.
699 * 700 * @return <0 or 0 or >0 as usual for string comparisons 701 * 702 * @internal ICU 4.4 technology preview 703 */ 704 U_INTERNAL int32_t U_EXPORT2 705 utext_compare(UText *s1, int32_t length1, 706 UText *s2, int32_t length2); 707 708 /** 709 * Compare two UTexts (binary order). The comparison begins at each source text's 710 * iteration position. The iteration position of each UText will be left following 711 * the last character compared. This method differs from utext_compare in that 712 * it accepts native limits rather than lengths for each string. 713 * 714 * The comparison is done in code point order; unlike u_strCompare, you 715 * cannot choose to use code unit order. This is because the characters 716 * in a UText are accessed one code point at a time, and may not be from a UTF-16 717 * context. 718 * 719 * This functions works with strings of different explicitly specified lengths 720 * unlike the ANSI C-like u_strcmp() and u_memcmp() etc. 721 * A limit argument of -1 signifies that as much of the string should be used as 722 * is necessary to compare with the other string. If both limit arguments are -1, 723 * the entire remaining portionss of both strings are used. 724 * 725 * @param s1 First source string. 726 * @param limit1 Native index of the last character in the first source string to be considered. 727 * 728 * @param s2 Second source string. 729 * @param limit2 Native index of the last character in the second source string to be considered. 730 * 731 * @return <0 or 0 or >0 as usual for string comparisons 732 * 733 * @internal ICU 4.4 technology preview 734 */ 735 U_INTERNAL int32_t U_EXPORT2 736 utext_compareNativeLimit(UText *s1, int64_t limit1, 737 UText *s2, int64_t limit2); 738 739 /** 740 * Compare two UTexts case-insensitively using full case folding. The comparison 741 * begins at each source text's iteration position. The iteration position of each 742 * UText will be left following the last character compared. 
743 * 744 * The comparison is done in code point order; this is because the characters 745 * in a UText are accessed one code point at a time, and may not be from a UTF-16 746 * context. 747 * 748 * This functions works with strings of different explicitly specified lengths 749 * unlike the ANSI C-like u_strcmp() and u_memcmp() etc. 750 * A length argument of -1 signifies that as much of the string should be used as 751 * is necessary to compare with the other string. If both length arguments are -1, 752 * the entire remaining portionss of both strings are used. 753 * 754 * @param s1 First source string. 755 * @param length1 Length of first source string in UTF-32 code points. 756 * 757 * @param s2 Second source string. 758 * @param length2 Length of second source string in UTF-32 code points. 759 * 760 * @param options A bit set of options: 761 * - U_FOLD_CASE_DEFAULT or 0 is used for default options: 762 * Comparison in code point order with default case folding. 763 * 764 * - U_FOLD_CASE_EXCLUDE_SPECIAL_I 765 * 766 * @param pErrorCode Must be a valid pointer to an error code value, 767 * which must not indicate a failure before the function call. 768 * 769 * @return <0 or 0 or >0 as usual for string comparisons 770 * 771 * @internal ICU 4.4 technology preview 772 */ 773 U_INTERNAL int32_t U_EXPORT2 774 utext_caseCompare(UText *s1, int32_t length1, 775 UText *s2, int32_t length2, 776 uint32_t options, UErrorCode *pErrorCode); 777 778 /** 779 * Compare two UTexts case-insensitively using full case folding. The comparison 780 * begins at each source text's iteration position. The iteration position of each 781 * UText will be left following the last character compared. This method differs from 782 * utext_caseCompare in that it accepts native limits rather than lengths for each 783 * string. 
 *
 * The comparison is done in code point order; this is because the characters
 * in a UText are accessed one code point at a time, and may not be from a UTF-16
 * context.
 *
 * This function works with strings of different explicitly specified lengths
 * unlike the ANSI C-like u_strcmp() and u_memcmp() etc.
 * A limit argument of -1 signifies that as much of the string should be used as
 * is necessary to compare with the other string. If both limit arguments are -1,
 * the entire remaining portions of both strings are used.
 *
 * @param s1 First source string.
 * @param limit1 Native index of the last character in the first source string to be considered.
 *
 * @param s2 Second source string.
 * @param limit2 Native index of the last character in the second source string to be considered.
 *
 * @param options A bit set of options:
 *   - U_FOLD_CASE_DEFAULT or 0 is used for default options:
 *     Comparison in code point order with default case folding.
 *
 *   - U_FOLD_CASE_EXCLUDE_SPECIAL_I
 *
 * @param pErrorCode Must be a valid pointer to an error code value,
 *                   which must not indicate a failure before the function call.
 *
 * @return <0 or 0 or >0 as usual for string comparisons
 *
 * @internal ICU 4.4 technology preview
 */
U_INTERNAL int32_t U_EXPORT2
utext_caseCompareNativeLimit(UText *s1, int64_t limit1,
                             UText *s2, int64_t limit2,
                             uint32_t options, UErrorCode *pErrorCode);


/************************************************************************************
 *
 *  #define inline versions of selected performance-critical text access functions
 *          Caution:  do not use auto increment++ or decrement-- expressions
 *                    as parameters to these macros.
 *
 *          For most use, where there is no extreme performance constraint, the
 *          normal, non-inline functions are a better choice.
The resulting code 828 * will be smaller, and, if the need ever arises, easier to debug. 829 * 830 * These are implemented as #defines rather than real functions 831 * because there is no fully portable way to do inline functions in plain C. 832 * 833 ************************************************************************************/ 834 835 /** 836 * inline version of utext_current32(), for performance-critical situations. 837 * 838 * Get the code point at the current iteration position of the UText. 839 * Returns U_SENTINEL (-1) if the position is at the end of the 840 * text. 841 * 842 * @internal ICU 4.4 technology preview 843 */ 844 #define UTEXT_CURRENT32(ut) \ 845 ((ut)->chunkOffset < (ut)->chunkLength && ((ut)->chunkContents)[(ut)->chunkOffset]<0xd800 ? \ 846 ((ut)->chunkContents)[((ut)->chunkOffset)] : utext_current32(ut)) 847 848 /** 849 * inline version of utext_next32(), for performance-critical situations. 850 * 851 * Get the code point at the current iteration position of the UText, and 852 * advance the position to the first index following the character. 853 * This is a post-increment operation. 854 * Returns U_SENTINEL (-1) if the position is at the end of the 855 * text. 856 * 857 * @stable ICU 3.4 858 */ 859 #define UTEXT_NEXT32(ut) \ 860 ((ut)->chunkOffset < (ut)->chunkLength && ((ut)->chunkContents)[(ut)->chunkOffset]<0xd800 ? \ 861 ((ut)->chunkContents)[((ut)->chunkOffset)++] : utext_next32(ut)) 862 863 /** 864 * inline version of utext_previous32(), for performance-critical situations. 865 * 866 * Move the iterator position to the character (code point) whose 867 * index precedes the current position, and return that character. 868 * This is a pre-decrement operation. 869 * Returns U_SENTINEL (-1) if the position is at the start of the text. 870 * 871 * @stable ICU 3.4 872 */ 873 #define UTEXT_PREVIOUS32(ut) \ 874 ((ut)->chunkOffset > 0 && \ 875 (ut)->chunkContents[(ut)->chunkOffset-1] < 0xd800 ? 
\ 876 (ut)->chunkContents[--((ut)->chunkOffset)] : utext_previous32(ut)) 877 878 /** 879 * inline version of utext_getNativeIndex(), for performance-critical situations. 880 * 881 * Get the current iterator position, which can range from 0 to 882 * the length of the text. 883 * The position is a native index into the input text, in whatever format it 884 * may have (possibly UTF-8 for example), and may not always be the same as 885 * the corresponding UChar (UTF-16) index. 886 * The returned position will always be aligned to a code point boundary. 887 * 888 * @stable ICU 3.6 889 */ 890 #define UTEXT_GETNATIVEINDEX(ut) \ 891 ((ut)->chunkOffset <= (ut)->nativeIndexingLimit? \ 892 (ut)->chunkNativeStart+(ut)->chunkOffset : \ 893 (ut)->pFuncs->mapOffsetToNative(ut)) 894 895 /** 896 * inline version of utext_setNativeIndex(), for performance-critical situations. 897 * 898 * Set the current iteration position to the nearest code point 899 * boundary at or preceding the specified index. 900 * The index is in the native units of the original input text. 901 * If the index is out of range, it will be pinned to be within 902 * the range of the input text. 903 * 904 * @stable ICU 3.8 905 */ 906 #define UTEXT_SETNATIVEINDEX(ut, ix) \ 907 { int64_t __offset = (ix) - (ut)->chunkNativeStart; \ 908 if (__offset>=0 && __offset<=(int64_t)(ut)->nativeIndexingLimit) { \ 909 (ut)->chunkOffset=(int32_t)__offset; \ 910 } else { \ 911 utext_setNativeIndex((ut), (ix)); } } 912 913 914 915 /************************************************************************************ 916 * 917 * Functions related to writing or modifying the text. 918 * These will work only with modifiable UTexts. Attempting to 919 * modify a read-only UText will return an error status. 920 * 921 ************************************************************************************/ 922 923 924 /** 925 * Return TRUE if the text can be written (modified) with utext_replace() or 926 * utext_copy(). 
 * For the text to be writable, the text provider must
 * be of a type that supports writing and the UText must not be frozen.
 *
 * Attempting to modify text when utext_isWritable() is FALSE will fail -
 * the text will not be modified, and an error will be returned from the function
 * that attempted the modification.
 *
 * @param ut the UText to be tested.
 * @return TRUE if the text is modifiable.
 *
 * @see utext_freeze()
 * @see utext_replace()
 * @see utext_copy()
 * @stable ICU 3.4
 *
 */
U_STABLE UBool U_EXPORT2
utext_isWritable(const UText *ut);


/**
 * Test whether there is meta data associated with the text.
 * @see Replaceable::hasMetaData()
 *
 * @param ut The UText to be tested
 * @return TRUE if the underlying text includes meta data.
 * @stable ICU 3.4
 */
U_STABLE UBool U_EXPORT2
utext_hasMetaData(const UText *ut);


/**
 * Replace a range of the original text with a replacement text.
 *
 * Leaves the current iteration position at the position following the
 * newly inserted replacement text.
 *
 * This function is only available on UText types that support writing,
 * that is, ones where utext_isWritable() returns TRUE.
 *
 * When using this function, there should be only a single UText opened onto the
 * underlying native text string. Behavior after a replace operation
 * on a UText is undefined for any other additional UTexts that refer to the
 * modified string.
 *
 * @param ut the UText representing the text to be operated on.
 * @param nativeStart the native index of the start of the region to be replaced
 * @param nativeLimit the native index of the character following the region to be replaced.
 * @param replacementText pointer to the replacement text
 * @param replacementLength length of the replacement text, or -1 if the text is NUL terminated.
 * @param status receives any error status. Possible errors include
 *               U_NO_WRITE_PERMISSION
 *
 * @return The signed number of (native) storage units by which
 *         the length of the text expanded or contracted.
 *
 * @stable ICU 3.4
 */
U_STABLE int32_t U_EXPORT2
utext_replace(UText *ut,
              int64_t nativeStart, int64_t nativeLimit,
              const UChar *replacementText, int32_t replacementLength,
              UErrorCode *status);



/**
 *
 * Copy or move a substring from one position to another within the text,
 * while retaining any metadata associated with the text.
 * This function is used to duplicate or reorder substrings.
 * The destination index must not overlap the source range.
 *
 * The text to be copied or moved is inserted at destIndex;
 * it does not replace or overwrite any existing text.
 *
 * The iteration position is left following the newly inserted text
 * at the destination position.
 *
 * This function is only available on UText types that support writing,
 * that is, ones where utext_isWritable() returns TRUE.
 *
 * When using this function, there should be only a single UText opened onto the
 * underlying native text string. Behavior after a copy operation
 * on a UText is undefined for any other additional UTexts that refer to the
 * modified string.
 *
 * @param ut The UText representing the text to be operated on.
 * @param nativeStart The native index of the start of the region to be copied or moved
 * @param nativeLimit The native index of the character position following the region
 *                    to be copied.
 * @param destIndex The native destination index to which the source substring is
 *                  copied or moved.
 * @param move If TRUE, then the substring is moved, not copied/duplicated.
 * @param status receives any error status.
 *               Possible errors include U_NO_WRITE_PERMISSION
 *
 * @stable ICU 3.4
 */
U_STABLE void U_EXPORT2
utext_copy(UText *ut,
           int64_t nativeStart, int64_t nativeLimit,
           int64_t destIndex,
           UBool move,
           UErrorCode *status);


/**
 * <p>
 * Freeze a UText. This prevents any modification to the underlying text itself
 * by means of functions operating on this UText.
 * </p>
 * <p>
 * Once frozen, a UText can not be unfrozen. The intent is to ensure
 * that the text underlying a frozen UText wrapper cannot be modified via that UText.
 * </p>
 * <p>
 * Caution: freezing a UText will disable changes made via the specific
 * frozen UText wrapper only; it will not have any effect on the ability to
 * directly modify the text by bypassing the UText. Any such backdoor modifications
 * are always an error while UText access is occurring because the underlying
 * text can get out of sync with UText's buffering.
 * </p>
 *
 * @param ut The UText to be frozen.
 * @see utext_isWritable()
 * @stable ICU 3.6
 */
U_STABLE void U_EXPORT2
utext_freeze(UText *ut);


/**
 * UText provider properties (bit field indexes).
 *
 * @see UText
 * @stable ICU 3.4
 */
enum {
    /**
     * It is potentially time consuming for the provider to determine the length of the text.
     * @stable ICU 3.4
     */
    UTEXT_PROVIDER_LENGTH_IS_EXPENSIVE = 1,
    /**
     * Text chunks remain valid and usable until the text object is modified or
     * deleted, not just until the next time the access() function is called
     * (which is the default).
     * @stable ICU 3.4
     */
    UTEXT_PROVIDER_STABLE_CHUNKS = 2,
    /**
     * The provider supports modifying the text via the replace() and copy()
     * functions.
     * @see Replaceable
     * @stable ICU 3.4
     */
    UTEXT_PROVIDER_WRITABLE = 3,
    /**
     * There is meta data associated with the text.
     * @see Replaceable::hasMetaData()
     * @stable ICU 3.4
     */
    UTEXT_PROVIDER_HAS_META_DATA = 4,
    /**
     * Text provider owns the text storage.
     * Generally occurs as the result of a deep clone of the UText.
     * When closing the UText, the associated text must
     * also be closed/deleted/freed/ whatever is appropriate.
     * @stable ICU 3.6
     */
    UTEXT_PROVIDER_OWNS_TEXT = 5
};

/**
 * Function type declaration for UText.clone().
 *
 * clone a UText. Much like opening a UText where the source text is itself
 * another UText.
 *
 * A deep clone will copy both the UText data structures and the underlying text.
 * The original and cloned UText will operate completely independently; modifications
 * made to the text in one will not affect the other. Text providers are not
 * required to support deep clones. The user of clone() must check the status return
 * and be prepared to handle failures.
 *
 * A shallow clone replicates only the UText data structures; it does not make
 * a copy of the underlying text. Shallow clones can be used as an efficient way to
 * have multiple iterators active in a single text string that is not being
 * modified.
 *
 * A shallow clone operation must not fail except for truly exceptional conditions such
 * as memory allocation failures.
 *
 * A UText and its clone may be safely concurrently accessed by separate threads.
 * This is true for both shallow and deep clones.
 * It is the responsibility of the Text Provider to ensure that this thread safety
 * constraint is met.
 *
 *
 * @param dest A UText struct to be filled in with the result of the clone operation,
 *             or NULL if the clone function should heap-allocate a new UText struct.
 * @param src The UText to be cloned.
 * @param deep TRUE to request a deep clone, FALSE for a shallow clone.
 * @param status Errors are returned here. For deep clones, U_UNSUPPORTED_ERROR
 *               should be returned if the text provider is unable to clone the
 *               original text.
 * @return The newly created clone, or NULL if the clone operation failed.
 *
 * @stable ICU 3.4
 */
typedef UText * U_CALLCONV
UTextClone(UText *dest, const UText *src, UBool deep, UErrorCode *status);


/**
 * Function type declaration for UText.nativeLength().
 *
 * @param ut the UText to get the length of.
 * @return the length, in the native units of the original text string.
 * @see UText
 * @stable ICU 3.4
 */
typedef int64_t U_CALLCONV
UTextNativeLength(UText *ut);

/**
 * Function type declaration for UText.access(). Get the description of the text chunk
 * containing the text at a requested native index. The UText's iteration
 * position will be left at the requested index. If the index is out
 * of bounds, the iteration position will be left at the start or end
 * of the string, as appropriate.
 *
 * Chunks must begin and end on code point boundaries. A single code point
 * comprised of multiple storage units must never span a chunk boundary.
 *
 *
 * @param ut the UText being accessed.
 * @param nativeIndex Requested index of the text to be accessed.
 * @param forward If TRUE, then the returned chunk must contain text
 *                starting from the index, so that start<=index<limit.
 *                If FALSE, then the returned chunk must contain text
 *                before the index, so that start<index<=limit.
 * @return True if the requested index could be accessed. The chunk
 *         will contain the requested text.
 *         False value if a chunk cannot be accessed
 *         (the requested index is out of bounds).
 *
 * @see UText
 * @stable ICU 3.4
 */
typedef UBool U_CALLCONV
UTextAccess(UText *ut, int64_t nativeIndex, UBool forward);

/**
 * Function type declaration for UText.extract().
 *
 * Extract text from a UText into a UChar buffer. The range of text to be extracted
 * is specified in the native indices of the UText provider. These may not necessarily
 * be UTF-16 indices.
 * <p>
 * The size (number of 16 bit UChars) in the data to be extracted is returned. The
 * full amount is returned, even when the specified buffer size is smaller.
 * <p>
 * The extracted string will (if you are a user) / must (if you are a text provider)
 * be NUL-terminated if there is sufficient space in the destination buffer.
 *
 * @param ut the UText from which to extract data.
 * @param nativeStart the native index of the first character to extract.
 * @param nativeLimit the native string index of the position following the last
 *                    character to extract.
 * @param dest the UChar (UTF-16) buffer into which the extracted text is placed
 * @param destCapacity The size, in UChars, of the destination buffer. May be zero
 *                     for precomputing the required size.
 * @param status receives any error status.
 *               If U_BUFFER_OVERFLOW_ERROR: Returns number of UChars for
 *               preflighting.
 * @return Number of UChars in the data. Does not include a trailing NUL.
 *
 * @stable ICU 3.4
 */
typedef int32_t U_CALLCONV
UTextExtract(UText *ut,
             int64_t nativeStart, int64_t nativeLimit,
             UChar *dest, int32_t destCapacity,
             UErrorCode *status);

/**
 * Function type declaration for UText.replace().
 *
 * Replace a range of the original text with a replacement text.
 *
 * Leaves the current iteration position at the position following the
 * newly inserted replacement text.
 *
 * This function need only be implemented on UText types that support writing.
 *
 * When using this function, there should be only a single UText opened onto the
 * underlying native text string. The function is responsible for updating the
 * text chunk within the UText to reflect the updated iteration position,
 * taking into account any changes to the underlying string's structure caused
 * by the replace operation.
 *
 * @param ut the UText representing the text to be operated on.
 * @param nativeStart the index of the start of the region to be replaced
 * @param nativeLimit the index of the character following the region to be replaced.
 * @param replacementText pointer to the replacement text
 * @param replacmentLength length of the replacement text in UChars, or -1 if the text is NUL terminated.
 * @param status receives any error status. Possible errors include
 *               U_NO_WRITE_PERMISSION
 *
 * @return The signed number of (native) storage units by which
 *         the length of the text expanded or contracted.
 *
 * @stable ICU 3.4
 */
typedef int32_t U_CALLCONV
UTextReplace(UText *ut,
             int64_t nativeStart, int64_t nativeLimit,
             const UChar *replacementText, int32_t replacmentLength,
             UErrorCode *status);

/**
 * Function type declaration for UText.copy().
 *
 * Copy or move a substring from one position to another within the text,
 * while retaining any metadata associated with the text.
 * This function is used to duplicate or reorder substrings.
 * The destination index must not overlap the source range.
 *
 * The text to be copied or moved is inserted at destIndex;
 * it does not replace or overwrite any existing text.
 *
 * This function need only be implemented for UText types that support writing.
 *
 * When using this function, there should be only a single UText opened onto the
 * underlying native text string. The function is responsible for updating the
 * text chunk within the UText to reflect the updated iteration position,
 * taking into account any changes to the underlying string's structure caused
 * by the copy operation.
 *
 * @param ut The UText representing the text to be operated on.
 * @param nativeStart The index of the start of the region to be copied or moved
 * @param nativeLimit The index of the character following the region to be copied or moved.
 * @param nativeDest The destination index to which the source substring is copied or moved.
 * @param move If TRUE, then the substring is moved, not copied/duplicated.
 * @param status receives any error status. Possible errors include U_NO_WRITE_PERMISSION
 *
 * @stable ICU 3.4
 */
typedef void U_CALLCONV
UTextCopy(UText *ut,
          int64_t nativeStart, int64_t nativeLimit,
          int64_t nativeDest,
          UBool move,
          UErrorCode *status);

/**
 * Function type declaration for UText.mapOffsetToNative().
 * Map from the current UChar offset within the current text chunk to
 * the corresponding native index in the original source text.
 *
 * This is required only for text providers that do not use native UTF-16 indexes.
 *
 * @param ut the UText.
 * @return Absolute (native) index corresponding to chunkOffset in the current chunk.
 *         The returned native index should always be to a code point boundary.
 *
 * @stable ICU 3.4
 */
typedef int64_t U_CALLCONV
UTextMapOffsetToNative(const UText *ut);

/**
 * Function type declaration for UText.mapNativeIndexToUTF16().
 * Map from a native index to a UChar offset within a text chunk.
 * Behavior is undefined if the native index does not fall within the
 * current chunk.
 *
 * This function is required only for text providers that do not use native UTF-16 indexes.
 *
 * @param ut The UText containing the text chunk.
 * @param nativeIndex Absolute (native) text index, chunk->start<=index<=chunk->limit.
 * @return Chunk-relative UTF-16 offset corresponding to the specified native
 *         index.
 *
 * @stable ICU 3.4
 */
typedef int32_t U_CALLCONV
UTextMapNativeIndexToUTF16(const UText *ut, int64_t nativeIndex);


/**
 * Function type declaration for UText.close().
 *
 * A Text Provider close function is only required for provider types that make
 * allocations in their open function (or other functions) that must be
 * cleaned when the UText is closed.
 *
 * The allocation of the UText struct itself and any "extra" storage
 * associated with the UText is handled by the common UText implementation
 * and does not require provider specific cleanup in a close function.
 *
 * Most UText provider implementations do not need to implement this function.
 *
 * @param ut A UText object to be closed.
 *
 * @stable ICU 3.4
 */
typedef void U_CALLCONV
UTextClose(UText *ut);


/**
 * (public) Function dispatch table for UText.
 *          Conceptually very much like a C++ Virtual Function Table.
 *          This struct defines the organization of the table.
 *          Each text provider implementation must provide an
 *          actual table that is initialized with the appropriate functions
 *          for the type of text being handled.
 * @stable ICU 3.6
 */
struct UTextFuncs {
    /**
     * (public) Function table size, sizeof(UTextFuncs)
     *          Intended for use should the table grow to accommodate added
     *          functions in the future, to allow tests for older format
     *          function tables that do not contain the extensions.
     *
     *          Fields are placed for optimal alignment on
     *          32/64/128-bit-pointer machines, by normally grouping together
     *          4 32-bit fields,
     *          4 pointers,
     *          2 64-bit fields
     *          in sequence.
     * @stable ICU 3.6
     */
    int32_t tableSize;

    /**
     * (private) Alignment padding.
     *           Do not use, reserved for use by the UText framework only.
     * @internal
     */
    int32_t reserved1, /** @internal */ reserved2, /** @internal */ reserved3;


    /**
     * (public) Function pointer for UTextClone
     *
     * @see UTextClone
     * @stable ICU 3.6
     */
    UTextClone *clone;

    /**
     * (public) Function pointer for UTextNativeLength
     *          May be expensive to compute!
     *
     * @see UTextNativeLength
     * @stable ICU 3.6
     */
    UTextNativeLength *nativeLength;

    /**
     * (public) Function pointer for UTextAccess.
     *
     * @see UTextAccess
     * @stable ICU 3.6
     */
    UTextAccess *access;

    /**
     * (public) Function pointer for UTextExtract.
     *
     * @see UTextExtract
     * @stable ICU 3.6
     */
    UTextExtract *extract;

    /**
     * (public) Function pointer for UTextReplace.
     *
     * @see UTextReplace
     * @stable ICU 3.6
     */
    UTextReplace *replace;

    /**
     * (public) Function pointer for UTextCopy.
     *
     * @see UTextCopy
     * @stable ICU 3.6
     */
    UTextCopy *copy;

    /**
     * (public) Function pointer for UTextMapOffsetToNative.
     *
     * @see UTextMapOffsetToNative
     * @stable ICU 3.6
     */
    UTextMapOffsetToNative *mapOffsetToNative;

    /**
     * (public) Function pointer for UTextMapNativeIndexToUTF16.
     *
     * @see UTextMapNativeIndexToUTF16
     * @stable ICU 3.6
     */
    UTextMapNativeIndexToUTF16 *mapNativeIndexToUTF16;

    /**
     * (public) Function pointer for UTextClose.
     *
     * @see UTextClose
     * @stable ICU 3.6
     */
    UTextClose *close;

    /**
     * (private) Spare function pointer
     * @internal
     */
    UTextClose *spare1;

    /**
     * (private) Spare function pointer
     * @internal
     */
    UTextClose *spare2;

    /**
     * (private) Spare function pointer
     * @internal
     */
    UTextClose *spare3;

};
/**
 * Function dispatch table for UText
 * @see UTextFuncs
 */
typedef struct UTextFuncs UTextFuncs;

/**
 * UText struct. Provides the interface between the generic UText access code
 * and the UText provider code that works on specific kinds of
 * text (UTF-8, noncontiguous UTF-16, whatever.)
 *
 * Applications that are using predefined types of text providers
 * to pass text data to ICU services will have no need to view the
 * internals of the UText structs that they open.
 *
 * @stable ICU 3.6
 */
struct UText {
    /**
     * (private) Magic. Used to help detect when UText functions are handed
     *           invalid or uninitialized UText structs.
     *           utext_openXYZ() functions take an initialized,
     *           but not necessarily open, UText struct as an
     *           optional fill-in parameter. This magic field
     *           is used to check for that initialization.
     *           Text provider close functions must NOT clear
     *           the magic field because that would prevent
     *           reuse of the UText struct.
     * @internal
     */
    uint32_t magic;


    /**
     * (private) Flags for managing the allocation and freeing of
     *           memory associated with this UText.
     * @internal
     */
    int32_t flags;


    /**
     * Text provider properties. This set of flags is maintained by the
     * text provider implementation.
     * @stable ICU 3.4
     */
    int32_t providerProperties;

    /**
     * (public) sizeOfStruct=sizeof(UText)
     * Allows possible backward compatible extension.
     *
     * @stable ICU 3.4
     */
    int32_t sizeOfStruct;

    /* ------ 16 byte alignment boundary ----------- */


    /**
     * (protected) Native index of the first character position following
     *             the current chunk.
     * @stable ICU 3.6
     */
    int64_t chunkNativeLimit;

    /**
     * (protected) Size in bytes of the extra space (pExtra).
     * @stable ICU 3.4
     */
    int32_t extraSize;

    /**
     * (protected) The highest chunk offset where native indexing and
     *             chunk (UTF-16) indexing correspond. For UTF-16 sources, value
     *             will be equal to chunkLength.
     *
     * @stable ICU 3.6
     */
    int32_t nativeIndexingLimit;

    /* ---- 16 byte alignment boundary------ */

    /**
     * (protected) Native index of the first character in the text chunk.
     * @stable ICU 3.6
     */
    int64_t chunkNativeStart;

    /**
     * (protected) Current iteration position within the text chunk (UTF-16 buffer).
     *             This is the index to the character that will be returned by utext_next32().
     * @stable ICU 3.6
     */
    int32_t chunkOffset;

    /**
     * (protected) Length of the text chunk (UTF-16 buffer), in UChars.
     * @stable ICU 3.6
     */
    int32_t chunkLength;

    /* ---- 16 byte alignment boundary-- */


    /**
     * (protected) pointer to a chunk of text in UTF-16 format.
     *             May refer either to original storage of the source of the text, or
     *             if conversion was required, to a buffer owned by the UText.
     * @stable ICU 3.6
     */
    const UChar *chunkContents;

    /**
     * (public) Pointer to Dispatch table for accessing functions for this UText.
     * @stable ICU 3.6
     */
    const UTextFuncs *pFuncs;

    /**
     * (protected) Pointer to additional space requested by the
     *             text provider during the utext_open operation.
     * @stable ICU 3.4
     */
    void *pExtra;

    /**
     * (protected) Pointer to string or text-containing object or similar.
     *             This is the source of the text that this UText is wrapping, in a format
     *             that is known to the text provider functions.
     * @stable ICU 3.4
     */
    const void *context;

    /* --- 16 byte alignment boundary--- */

    /**
     * (protected) Pointer fields available for use by the text provider.
     *             Not used by UText common code.
     * @stable ICU 3.6
     */
    const void *p;
    /**
     * (protected) Pointer fields available for use by the text provider.
     *             Not used by UText common code.
     * @stable ICU 3.6
     */
    const void *q;
    /**
     * (protected) Pointer fields available for use by the text provider.
     *             Not used by UText common code.
     * @stable ICU 3.6
     */
    const void *r;

    /**
     * Private field reserved for future use by the UText framework
     * itself. This is not to be touched by the text providers.
     * @internal ICU 3.4
     */
    void *privP;


    /* --- 16 byte alignment boundary--- */


    /**
     * (protected) Integer field reserved for use by the text provider.
     *             Not used by the UText framework, or by the client (user) of the UText.
     * @stable ICU 3.4
     */
    int64_t a;

    /**
     * (protected) Integer field reserved for use by the text provider.
     *             Not used by the UText framework, or by the client (user) of the UText.
     * @stable ICU 3.4
     */
    int32_t b;

    /**
     * (protected) Integer field reserved for use by the text provider.
     *             Not used by the UText framework, or by the client (user) of the UText.
     * @stable ICU 3.4
     */
    int32_t c;

    /* ---- 16 byte alignment boundary---- */


    /**
     * Private field reserved for future use by the UText framework
     * itself. This is not to be touched by the text providers.
     * @internal ICU 3.4
     */
    int64_t privA;
    /**
     * Private field reserved for future use by the UText framework
     * itself. This is not to be touched by the text providers.
     * @internal ICU 3.4
     */
    int32_t privB;
    /**
     * Private field reserved for future use by the UText framework
     * itself. This is not to be touched by the text providers.
     * @internal ICU 3.4
     */
    int32_t privC;
};


/**
 * Common function for use by Text Provider implementations to allocate and/or initialize
 * a new UText struct. To be called in the implementation of utext_open() functions.
 * If the supplied UText parameter is null, a new UText struct will be allocated on the heap.
 * If the supplied UText is already open, the provider's close function will be called
 * so that the struct can be reused by the open that is in progress.
 *
 * @param ut pointer to a UText struct to be re-used, or null if a new UText
 *           should be allocated.
 * @param extraSpace The amount of additional space to be allocated as part
 *                   of this UText, for use by types of providers that require
 *                   additional storage.
 * @param status Errors are returned here.
 * @return pointer to the UText, allocated if necessary, with extra space set up if requested.
 * @stable ICU 3.4
 */
U_STABLE UText * U_EXPORT2
utext_setup(UText *ut, int32_t extraSpace, UErrorCode *status);

/**
 * @internal
 * Value used to help identify correctly initialized UText structs.
 * Note: must be publicly visible so that UTEXT_INITIALIZER can access it.
 */
enum {
    UTEXT_MAGIC = 0x345ad82c
};

/**
 * initializer to be used with local (stack) instances of a UText
 * struct. UText structs must be initialized before passing
 * them to one of the utext_open functions.
 *
 * The initializer is positional: the values are listed in the same order as
 * the fields of struct UText, so the two must be kept in step.
 *
 * @stable ICU 3.6
 */
#define UTEXT_INITIALIZER {                                        \
                  UTEXT_MAGIC,          /* magic                */ \
                  0,                    /* flags                */ \
                  0,                    /* providerProps        */ \
                  sizeof(UText),        /* sizeOfStruct         */ \
                  0,                    /* chunkNativeLimit     */ \
                  0,                    /* extraSize            */ \
                  0,                    /* nativeIndexingLimit  */ \
                  0,                    /* chunkNativeStart     */ \
                  0,                    /* chunkOffset          */ \
                  0,                    /* chunkLength          */ \
                  NULL,                 /* chunkContents        */ \
                  NULL,                 /* pFuncs               */ \
                  NULL,                 /* pExtra               */ \
                  NULL,                 /* context              */ \
                  NULL, NULL, NULL,     /* p, q, r              */ \
                  NULL,                 /* privP                */ \
                  0, 0, 0,              /* a, b, c              */ \
                  0, 0, 0               /* privA,B,C,           */ \
                  }


U_CDECL_END



#endif