1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2004-2012, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: utext.h 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2004oct06 16 * created by: Markus W. Scherer 17 */ 18 19 #ifndef __UTEXT_H__ 20 #define __UTEXT_H__ 21 22 /** 23 * \file 24 * \brief C API: Abstract Unicode Text API 25 * 26 * The Text Access API provides a means to allow text that is stored in alternative 27 * formats to work with ICU services. ICU normally operates on text that is 28 * stored in UTF-16 format, in (UChar *) arrays for the C APIs or as type 29 * UnicodeString for C++ APIs. 30 * 31 * ICU Text Access allows other formats, such as UTF-8 or non-contiguous 32 * UTF-16 strings, to be placed in a UText wrapper and then passed to ICU services. 33 * 34 * There are three general classes of usage for UText: 35 * 36 * Application Level Use. This is the simplest usage - applications would 37 * use one of the utext_open() functions on their input text, and pass 38 * the resulting UText to the desired ICU service. 39 * 40 * Second is usage in ICU Services, such as break iteration, that will need to 41 * operate on input presented to them as a UText. These implementations 42 * will need to use the iteration and related UText functions to gain 43 * access to the actual text. 44 * 45 * The third class of UText users are "text providers." These are the 46 * UText implementations for the various text storage formats. An application 47 * or system with a unique text storage format can implement a set of 48 * UText provider functions for that format, which will then allow 49 * ICU services to operate on that format. 50 * 51 * 52 * <em>Iterating over text</em> 53 * 54 * Here is sample code for a forward iteration over the contents of a UText 55 * 56 * \code 57 * UChar32 c; 58 * UText *ut = whatever(); 59 * 60 * for (c=utext_next32From(ut, 0); c>=0; c=utext_next32(ut)) { 61 * // do whatever with the codepoint c here. 62 * } 63 * \endcode 64 * 65 * And here is similar code to iterate in the reverse direction, from the end 66 * of the text towards the beginning. 67 * 68 * \code 69 * UChar32 c; 70 * UText *ut = whatever(); 71 * int textLength = utext_nativeLength(ut); 72 * for (c=utext_previous32From(ut, textLength); c>=0; c=utext_previous32(ut)) { 73 * // do whatever with the codepoint c here. 74 * } 75 * \endcode 76 * 77 * <em>Characters and Indexing</em> 78 * 79 * Indexing into text by UText functions is nearly always in terms of the native 80 * indexing of the underlying text storage. The storage format could be UTF-8 81 * or UTF-32, for example. When coding to the UText access API, no assumptions 82 * can be made regarding the size of characters, or how far an index 83 * may move when iterating between characters. 84 * 85 * All indices supplied to UText functions are pinned to the length of the 86 * text. An out-of-bounds index is not considered to be an error, but is 87 * adjusted to be in the range 0 <= index <= length of input text. 88 * 89 * 90 * When an index position is returned from a UText function, it will be 91 * a native index to the underlying text. In the case of multi-unit characters, 92 * it will always refer to the first position of the character, 93 * never to the interior. This is essentially the same thing as saying that 94 * a returned index will always point to a boundary between characters. 95 * 96 * When a native index is supplied to a UText function, all indices that 97 * refer to any part of a multi-unit character representation are considered 98 * to be equivalent. In the case of multi-unit characters, an incoming index 99 * will be logically normalized to refer to the start of the character. 100 * 101 * It is possible to test whether a native index is on a code point boundary 102 * by doing a utext_setNativeIndex() followed by a utext_getNativeIndex(). 103 * If the index is returned unchanged, it was on a code point boundary. If 104 * an adjusted index is returned, the original index referred to the 105 * interior of a character. 106 * 107 * <em>Conventions for calling UText functions</em> 108 * 109 * Most UText access functions have as their first parameter a (UText *) pointer, 110 * which specifies the UText to be used. Unless otherwise noted, the 111 * pointer must refer to a valid, open UText. Attempting to 112 * use a closed UText or passing a NULL pointer is a programming error and 113 * will produce undefined results or NULL pointer exceptions. 114 * 115 * The UText_Open family of functions can either open an existing (closed) 116 * UText, or heap allocate a new UText. Here is sample code for creating 117 * a stack-allocated UText. 118 * 119 * \code 120 * char *s = whatever(); // A utf-8 string 121 * U_ErrorCode status = U_ZERO_ERROR; 122 * UText ut = UTEXT_INITIALIZER; 123 * utext_openUTF8(ut, s, -1, &status); 124 * if (U_FAILURE(status)) { 125 * // error handling 126 * } else { 127 * // work with the UText 128 * } 129 * \endcode 130 * 131 * Any existing UText passed to an open function _must_ have been initialized, 132 * either by the UTEXT_INITIALIZER, or by having been originally heap-allocated 133 * by an open function. Passing NULL will cause the open function to 134 * heap-allocate and fully initialize a new UText. 135 * 136 */ 137 138 139 140 #include "unicode/utypes.h" 141 #include "unicode/uchar.h" 142 #if U_SHOW_CPLUSPLUS_API 143 #include "unicode/localpointer.h" 144 #include "unicode/rep.h" 145 #include "unicode/unistr.h" 146 #include "unicode/chariter.h" 147 #endif 148 149 150 U_CDECL_BEGIN 151 152 struct UText; 153 typedef struct UText UText; /**< C typedef for struct UText. @stable ICU 3.6 */ 154 155 156 /*************************************************************************************** 157 * 158 * C Functions for creating UText wrappers around various kinds of text strings. 159 * 160 ****************************************************************************************/ 161 162 163 /** 164 * Close function for UText instances. 165 * Cleans up, releases any resources being held by an open UText. 166 * <p> 167 * If the UText was originally allocated by one of the utext_open functions, 168 * the storage associated with the utext will also be freed. 169 * If the UText storage originated with the application, as it would with 170 * a local or static instance, the storage will not be deleted. 171 * 172 * An open UText can be reset to refer to new string by using one of the utext_open() 173 * functions without first closing the UText. 174 * 175 * @param ut The UText to be closed. 176 * @return NULL if the UText struct was deleted by the close. If the UText struct 177 * was originally provided by the caller to the open function, it is 178 * returned by this function, and may be safely used again in 179 * a subsequent utext_open. 180 * 181 * @stable ICU 3.4 182 */ 183 U_CAPI UText * U_EXPORT2 184 utext_close(UText *ut); 185 186 /** 187 * Open a read-only UText implementation for UTF-8 strings. 188 * 189 * \htmlonly 190 * Any invalid UTF-8 in the input will be handled in this way: 191 * a sequence of bytes that has the form of a truncated, but otherwise valid, 192 * UTF-8 sequence will be replaced by a single unicode replacement character, \uFFFD. 193 * Any other illegal bytes will each be replaced by a \uFFFD. 194 * \endhtmlonly 195 * 196 * @param ut Pointer to a UText struct. If NULL, a new UText will be created. 197 * If non-NULL, must refer to an initialized UText struct, which will then 198 * be reset to reference the specified UTF-8 string. 199 * @param s A UTF-8 string. Must not be NULL. 200 * @param length The length of the UTF-8 string in bytes, or -1 if the string is 201 * zero terminated. 202 * @param status Errors are returned here. 203 * @return A pointer to the UText. If a pre-allocated UText was provided, it 204 * will always be used and returned. 205 * @stable ICU 3.4 206 */ 207 U_CAPI UText * U_EXPORT2 208 utext_openUTF8(UText *ut, const char *s, int64_t length, UErrorCode *status); 209 210 211 /** 212 * Open a read-only UText for UChar * string. 213 * 214 * @param ut Pointer to a UText struct. If NULL, a new UText will be created. 215 * If non-NULL, must refer to an initialized UText struct, which will then 216 * be reset to reference the specified UChar string. 217 * @param s A UChar (UTF-16) string 218 * @param length The number of UChars in the input string, or -1 if the string is 219 * zero terminated. 220 * @param status Errors are returned here. 221 * @return A pointer to the UText. If a pre-allocated UText was provided, it 222 * will always be used and returned. 223 * @stable ICU 3.4 224 */ 225 U_CAPI UText * U_EXPORT2 226 utext_openUChars(UText *ut, const UChar *s, int64_t length, UErrorCode *status); 227 228 229 /** 230 * Clone a UText. This is much like opening a UText where the source text is itself 231 * another UText. 232 * 233 * A deep clone will copy both the UText data structures and the underlying text. 234 * The original and cloned UText will operate completely independently; modifications 235 * made to the text in one will not affect the other. Text providers are not 236 * required to support deep clones. The user of clone() must check the status return 237 * and be prepared to handle failures. 238 * 239 * The standard UText implementations for UTF8, UChar *, UnicodeString and 240 * Replaceable all support deep cloning. 241 * 242 * The UText returned from a deep clone will be writable, assuming that the text 243 * provider is able to support writing, even if the source UText had been made 244 * non-writable by means of UText_freeze(). 245 * 246 * A shallow clone replicates only the UText data structures; it does not make 247 * a copy of the underlying text. Shallow clones can be used as an efficient way to 248 * have multiple iterators active in a single text string that is not being 249 * modified. 250 * 251 * A shallow clone operation will not fail, barring truly exceptional conditions such 252 * as memory allocation failures. 253 * 254 * Shallow UText clones should be avoided if the UText functions that modify the 255 * text are expected to be used, either on the original or the cloned UText. 256 * Any such modifications can cause unpredictable behavior. Read Only 257 * shallow clones provide some protection against errors of this type by 258 * disabling text modification via the cloned UText. 259 * 260 * A shallow clone made with the readOnly parameter == false will preserve the 261 * utext_isWritable() state of the source object. Note, however, that 262 * write operations must be avoided while more than one UText exists that refer 263 * to the same underlying text. 264 * 265 * A UText and its clone may be safely concurrently accessed by separate threads. 266 * This is true for read access only with shallow clones, and for both read and 267 * write access with deep clones. 268 * It is the responsibility of the Text Provider to ensure that this thread safety 269 * constraint is met. 270 * 271 * @param dest A UText struct to be filled in with the result of the clone operation, 272 * or NULL if the clone function should heap-allocate a new UText struct. 273 * If non-NULL, must refer to an already existing UText, which will then 274 * be reset to become the clone. 275 * @param src The UText to be cloned. 276 * @param deep true to request a deep clone, false for a shallow clone. 277 * @param readOnly true to request that the cloned UText have read only access to the 278 * underlying text. 279 280 * @param status Errors are returned here. For deep clones, U_UNSUPPORTED_ERROR 281 * will be returned if the text provider is unable to clone the 282 * original text. 283 * @return The newly created clone, or NULL if the clone operation failed. 284 * @stable ICU 3.4 285 */ 286 U_CAPI UText * U_EXPORT2 287 utext_clone(UText *dest, const UText *src, UBool deep, UBool readOnly, UErrorCode *status); 288 289 290 /** 291 * Compare two UText objects for equality. 292 * UTexts are equal if they are iterating over the same text, and 293 * have the same iteration position within the text. 294 * If either or both of the parameters are NULL, the comparison is false. 295 * 296 * @param a The first of the two UTexts to compare. 297 * @param b The other UText to be compared. 298 * @return true if the two UTexts are equal. 299 * @stable ICU 3.6 300 */ 301 U_CAPI UBool U_EXPORT2 302 utext_equals(const UText *a, const UText *b); 303 304 305 /***************************************************************************** 306 * 307 * Functions to work with the text represented by a UText wrapper 308 * 309 *****************************************************************************/ 310 311 /** 312 * Get the length of the text. Depending on the characteristics 313 * of the underlying text representation, this may be expensive. 314 * @see utext_isLengthExpensive() 315 * 316 * 317 * @param ut the text to be accessed. 318 * @return the length of the text, expressed in native units. 319 * 320 * @stable ICU 3.4 321 */ 322 U_CAPI int64_t U_EXPORT2 323 utext_nativeLength(UText *ut); 324 325 /** 326 * Returns the code point at the requested index, 327 * or U_SENTINEL (-1) if it is out of bounds. 328 * 329 * If the specified index points to the interior of a multi-unit 330 * character - one of the trail bytes of a UTF-8 sequence, for example - 331 * the complete code point will be returned. 332 * 333 * The iteration position will be set to the start of the returned code point. 334 * 335 * This function is roughly equivalent to the sequence 336 * utext_setNativeIndex(index); 337 * utext_current32(); 338 * (There is a subtle difference if the index is out of bounds by being less than zero - 339 * utext_setNativeIndex(negative value) sets the index to zero, after which utext_current() 340 * will return the char at zero. utext_char32At(negative index), on the other hand, will 341 * return the U_SENTINEL value of -1.) 342 * 343 * @param ut the text to be accessed 344 * @param nativeIndex the native index of the character to be accessed. If the index points 345 * to other than the first unit of a multi-unit character, it will be adjusted 346 * to the start of the character. 347 * @return the code point at the specified index. 348 * @stable ICU 3.4 349 */ 350 U_CAPI UChar32 U_EXPORT2 351 utext_char32At(UText *ut, int64_t nativeIndex); 352 353 354 /** 355 * 356 * Get the code point at the current iteration position, 357 * or U_SENTINEL (-1) if the iteration has reached the end of 358 * the input text. 359 * 360 * @param ut the text to be accessed. 361 * @return the Unicode code point at the current iterator position. 362 * @stable ICU 3.4 363 */ 364 U_CAPI UChar32 U_EXPORT2 365 utext_current32(UText *ut); 366 367 368 /** 369 * Get the code point at the current iteration position of the UText, and 370 * advance the position to the first index following the character. 371 * 372 * If the position is at the end of the text (the index following 373 * the last character, which is also the length of the text), 374 * return U_SENTINEL (-1) and do not advance the index. 375 * 376 * This is a post-increment operation. 377 * 378 * An inline macro version of this function, UTEXT_NEXT32(), 379 * is available for performance critical use. 380 * 381 * @param ut the text to be accessed. 382 * @return the Unicode code point at the iteration position. 383 * @see UTEXT_NEXT32 384 * @stable ICU 3.4 385 */ 386 U_CAPI UChar32 U_EXPORT2 387 utext_next32(UText *ut); 388 389 390 /** 391 * Move the iterator position to the character (code point) whose 392 * index precedes the current position, and return that character. 393 * This is a pre-decrement operation. 394 * 395 * If the initial position is at the start of the text (index of 0) 396 * return U_SENTINEL (-1), and leave the position unchanged. 397 * 398 * An inline macro version of this function, UTEXT_PREVIOUS32(), 399 * is available for performance critical use. 400 * 401 * @param ut the text to be accessed. 402 * @return the previous UChar32 code point, or U_SENTINEL (-1) 403 * if the iteration has reached the start of the text. 404 * @see UTEXT_PREVIOUS32 405 * @stable ICU 3.4 406 */ 407 U_CAPI UChar32 U_EXPORT2 408 utext_previous32(UText *ut); 409 410 411 /** 412 * Set the iteration index and return the code point at that index. 413 * Leave the iteration index at the start of the following code point. 414 * 415 * This function is the most efficient and convenient way to 416 * begin a forward iteration. The results are identical to the those 417 * from the sequence 418 * \code 419 * utext_setIndex(); 420 * utext_next32(); 421 * \endcode 422 * 423 * @param ut the text to be accessed. 424 * @param nativeIndex Iteration index, in the native units of the text provider. 425 * @return Code point which starts at or before index, 426 * or U_SENTINEL (-1) if it is out of bounds. 427 * @stable ICU 3.4 428 */ 429 U_CAPI UChar32 U_EXPORT2 430 utext_next32From(UText *ut, int64_t nativeIndex); 431 432 433 434 /** 435 * Set the iteration index, and return the code point preceding the 436 * one specified by the initial index. Leave the iteration position 437 * at the start of the returned code point. 438 * 439 * This function is the most efficient and convenient way to 440 * begin a backwards iteration. 441 * 442 * @param ut the text to be accessed. 443 * @param nativeIndex Iteration index in the native units of the text provider. 444 * @return Code point preceding the one at the initial index, 445 * or U_SENTINEL (-1) if it is out of bounds. 446 * 447 * @stable ICU 3.4 448 */ 449 U_CAPI UChar32 U_EXPORT2 450 utext_previous32From(UText *ut, int64_t nativeIndex); 451 452 /** 453 * Get the current iterator position, which can range from 0 to 454 * the length of the text. 455 * The position is a native index into the input text, in whatever format it 456 * may have (possibly UTF-8 for example), and may not always be the same as 457 * the corresponding UChar (UTF-16) index. 458 * The returned position will always be aligned to a code point boundary. 459 * 460 * @param ut the text to be accessed. 461 * @return the current index position, in the native units of the text provider. 462 * @stable ICU 3.4 463 */ 464 U_CAPI int64_t U_EXPORT2 465 utext_getNativeIndex(const UText *ut); 466 467 /** 468 * Set the current iteration position to the nearest code point 469 * boundary at or preceding the specified index. 470 * The index is in the native units of the original input text. 471 * If the index is out of range, it will be pinned to be within 472 * the range of the input text. 473 * <p> 474 * It will usually be more efficient to begin an iteration 475 * using the functions utext_next32From() or utext_previous32From() 476 * rather than setIndex(). 477 * <p> 478 * Moving the index position to an adjacent character is best done 479 * with utext_next32(), utext_previous32() or utext_moveIndex32(). 480 * Attempting to do direct arithmetic on the index position is 481 * complicated by the fact that the size (in native units) of a 482 * character depends on the underlying representation of the character 483 * (UTF-8, UTF-16, UTF-32, arbitrary codepage), and is not 484 * easily knowable. 485 * 486 * @param ut the text to be accessed. 487 * @param nativeIndex the native unit index of the new iteration position. 488 * @stable ICU 3.4 489 */ 490 U_CAPI void U_EXPORT2 491 utext_setNativeIndex(UText *ut, int64_t nativeIndex); 492 493 /** 494 * Move the iterator position by delta code points. The number of code points 495 * is a signed number; a negative delta will move the iterator backwards, 496 * towards the start of the text. 497 * <p> 498 * The index is moved by <code>delta</code> code points 499 * forward or backward, but no further backward than to 0 and 500 * no further forward than to utext_nativeLength(). 501 * The resulting index value will be in between 0 and length, inclusive. 502 * 503 * @param ut the text to be accessed. 504 * @param delta the signed number of code points to move the iteration position. 505 * @return true if the position could be moved the requested number of positions while 506 * staying within the range [0 - text length]. 507 * @stable ICU 3.4 508 */ 509 U_CAPI UBool U_EXPORT2 510 utext_moveIndex32(UText *ut, int32_t delta); 511 512 /** 513 * Get the native index of the character preceding the current position. 514 * If the iteration position is already at the start of the text, zero 515 * is returned. 516 * The value returned is the same as that obtained from the following sequence, 517 * but without the side effect of changing the iteration position. 518 * 519 * \code 520 * UText *ut = whatever; 521 * ... 522 * utext_previous(ut) 523 * utext_getNativeIndex(ut); 524 * \endcode 525 * 526 * This function is most useful during forwards iteration, where it will get the 527 * native index of the character most recently returned from utext_next(). 528 * 529 * @param ut the text to be accessed 530 * @return the native index of the character preceding the current index position, 531 * or zero if the current position is at the start of the text. 532 * @stable ICU 3.6 533 */ 534 U_CAPI int64_t U_EXPORT2 535 utext_getPreviousNativeIndex(UText *ut); 536 537 538 /** 539 * 540 * Extract text from a UText into a UChar buffer. The range of text to be extracted 541 * is specified in the native indices of the UText provider. These may not necessarily 542 * be UTF-16 indices. 543 * <p> 544 * The size (number of 16 bit UChars) of the data to be extracted is returned. The 545 * full number of UChars is returned, even when the extracted text is truncated 546 * because the specified buffer size is too small. 547 * <p> 548 * The extracted string will (if you are a user) / must (if you are a text provider) 549 * be NUL-terminated if there is sufficient space in the destination buffer. This 550 * terminating NUL is not included in the returned length. 551 * <p> 552 * The iteration index is left at the position following the last extracted character. 553 * 554 * @param ut the UText from which to extract data. 555 * @param nativeStart the native index of the first character to extract.\ 556 * If the specified index is out of range, 557 * it will be pinned to be within 0 <= index <= textLength 558 * @param nativeLimit the native string index of the position following the last 559 * character to extract. If the specified index is out of range, 560 * it will be pinned to be within 0 <= index <= textLength. 561 * nativeLimit must be >= nativeStart. 562 * @param dest the UChar (UTF-16) buffer into which the extracted text is placed 563 * @param destCapacity The size, in UChars, of the destination buffer. May be zero 564 * for precomputing the required size. 565 * @param status receives any error status. 566 * U_BUFFER_OVERFLOW_ERROR: the extracted text was truncated because the 567 * buffer was too small. Returns number of UChars for preflighting. 568 * @return Number of UChars in the data to be extracted. Does not include a trailing NUL. 569 * 570 * @stable ICU 3.4 571 */ 572 U_CAPI int32_t U_EXPORT2 573 utext_extract(UText *ut, 574 int64_t nativeStart, int64_t nativeLimit, 575 UChar *dest, int32_t destCapacity, 576 UErrorCode *status); 577 578 579 U_CDECL_END 580 581 582 #endif 583