1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 // edits.h 5 // created: 2016dec30 Markus W. Scherer 6 7 #ifndef __EDITS_H__ 8 #define __EDITS_H__ 9 10 #include "unicode/utypes.h" 11 #include "unicode/uobject.h" 12 13 /** 14 * \file 15 * \brief C++ API: C++ class Edits for low-level string transformations on styled text. 16 */ 17 18 U_NAMESPACE_BEGIN 19 20 class UnicodeString; 21 22 /** 23 * Records lengths of string edits but not replacement text. Supports replacements, insertions, deletions 24 * in linear progression. Does not support moving/reordering of text. 25 * 26 * There are two types of edits: <em>change edits</em> and <em>no-change edits</em>. Add edits to 27 * instances of this class using {@link #addReplace(int32_t, int32_t)} (for change edits) and 28 * {@link #addUnchanged(int32_t)} (for no-change edits). Change edits are retained with full granularity, 29 * whereas adjacent no-change edits are always merged together. In no-change edits, there is a one-to-one 30 * mapping between code points in the source and destination strings. 31 * 32 * After all edits have been added, instances of this class should be considered immutable, and an 33 * {@link Edits::Iterator} can be used for queries. 34 * 35 * There are four flavors of Edits::Iterator: 36 * 37 * <ul> 38 * <li>{@link #getFineIterator()} retains full granularity of change edits. 39 * <li>{@link #getFineChangesIterator()} retains full granularity of change edits, and when calling 40 * next() on the iterator, skips over no-change edits (unchanged regions). 41 * <li>{@link #getCoarseIterator()} treats adjacent change edits as a single edit. (Adjacent no-change 42 * edits are automatically merged during the construction phase.) 43 * <li>{@link #getCoarseChangesIterator()} treats adjacent change edits as a single edit, and when 44 * calling next() on the iterator, skips over no-change edits (unchanged regions). 45 * </ul> 46 * 47 * For example, consider the string "abcßDeF", which case-folds to "abcssdef". This string has the 48 * following fine edits: 49 * <ul> 50 * <li>abc ⇨ abc (no-change) 51 * <li>ß ⇨ ss (change) 52 * <li>D ⇨ d (change) 53 * <li>e ⇨ e (no-change) 54 * <li>F ⇨ f (change) 55 * </ul> 56 * and the following coarse edits (note how adjacent change edits get merged together): 57 * <ul> 58 * <li>abc ⇨ abc (no-change) 59 * <li>ßD ⇨ ssd (change) 60 * <li>e ⇨ e (no-change) 61 * <li>F ⇨ f (change) 62 * </ul> 63 * 64 * The "fine changes" and "coarse changes" iterators will step through only the change edits when their 65 * `Edits::Iterator::next()` methods are called. They are identical to the non-change iterators when 66 * their `Edits::Iterator::findSourceIndex()` or `Edits::Iterator::findDestinationIndex()` 67 * methods are used to walk through the string. 68 * 69 * For examples of how to use this class, see the test `TestCaseMapEditsIteratorDocs` in 70 * UCharacterCaseTest.java. 71 * 72 * An Edits object tracks a separate UErrorCode, but ICU string transformation functions 73 * (e.g., case mapping functions) merge any such errors into their API's UErrorCode. 74 * 75 * @stable ICU 59 76 */ 77 class U_COMMON_API Edits U_FINAL : public UMemory { 78 public: 79 /** 80 * Constructs an empty object. 81 * @stable ICU 59 82 */ Edits()83 Edits() : 84 array(stackArray), capacity(STACK_CAPACITY), length(0), delta(0), numChanges(0), 85 errorCode_(U_ZERO_ERROR) {} 86 /** 87 * Copy constructor. 88 * @param other source edits 89 * @stable ICU 60 90 */ Edits(const Edits & other)91 Edits(const Edits &other) : 92 array(stackArray), capacity(STACK_CAPACITY), length(other.length), 93 delta(other.delta), numChanges(other.numChanges), 94 errorCode_(other.errorCode_) { 95 copyArray(other); 96 } 97 /** 98 * Move constructor, might leave src empty. 99 * This object will have the same contents that the source object had. 100 * @param src source edits 101 * @stable ICU 60 102 */ Edits(Edits && src)103 Edits(Edits &&src) U_NOEXCEPT : 104 array(stackArray), capacity(STACK_CAPACITY), length(src.length), 105 delta(src.delta), numChanges(src.numChanges), 106 errorCode_(src.errorCode_) { 107 moveArray(src); 108 } 109 110 /** 111 * Destructor. 112 * @stable ICU 59 113 */ 114 ~Edits(); 115 116 /** 117 * Assignment operator. 118 * @param other source edits 119 * @return *this 120 * @stable ICU 60 121 */ 122 Edits &operator=(const Edits &other); 123 124 /** 125 * Move assignment operator, might leave src empty. 126 * This object will have the same contents that the source object had. 127 * The behavior is undefined if *this and src are the same object. 128 * @param src source edits 129 * @return *this 130 * @stable ICU 60 131 */ 132 Edits &operator=(Edits &&src) U_NOEXCEPT; 133 134 /** 135 * Resets the data but may not release memory. 136 * @stable ICU 59 137 */ 138 void reset() U_NOEXCEPT; 139 140 /** 141 * Adds a no-change edit: a record for an unchanged segment of text. 142 * Normally called from inside ICU string transformation functions, not user code. 143 * @stable ICU 59 144 */ 145 void addUnchanged(int32_t unchangedLength); 146 /** 147 * Adds a change edit: a record for a text replacement/insertion/deletion. 148 * Normally called from inside ICU string transformation functions, not user code. 149 * @stable ICU 59 150 */ 151 void addReplace(int32_t oldLength, int32_t newLength); 152 /** 153 * Sets the UErrorCode if an error occurred while recording edits. 154 * Preserves older error codes in the outErrorCode. 155 * Normally called from inside ICU string transformation functions, not user code. 156 * @param outErrorCode Set to an error code if it does not contain one already 157 * and an error occurred while recording edits. 158 * Otherwise unchanged. 159 * @return TRUE if U_FAILURE(outErrorCode) 160 * @stable ICU 59 161 */ 162 UBool copyErrorTo(UErrorCode &outErrorCode); 163 164 /** 165 * How much longer is the new text compared with the old text? 166 * @return new length minus old length 167 * @stable ICU 59 168 */ lengthDelta()169 int32_t lengthDelta() const { return delta; } 170 /** 171 * @return TRUE if there are any change edits 172 * @stable ICU 59 173 */ hasChanges()174 UBool hasChanges() const { return numChanges != 0; } 175 176 /** 177 * @return the number of change edits 178 * @stable ICU 60 179 */ numberOfChanges()180 int32_t numberOfChanges() const { return numChanges; } 181 182 /** 183 * Access to the list of edits. 184 * 185 * At any moment in time, an instance of this class points to a single edit: a "window" into a span 186 * of the source string and the corresponding span of the destination string. The source string span 187 * starts at {@link #sourceIndex()} and runs for {@link #oldLength()} chars; the destination string 188 * span starts at {@link #destinationIndex()} and runs for {@link #newLength()} chars. 189 * 190 * The iterator can be moved between edits using the `next()`, `findSourceIndex(int32_t, UErrorCode &)`, 191 * and `findDestinationIndex(int32_t, UErrorCode &)` methods. 192 * Calling any of these methods mutates the iterator to make it point to the corresponding edit. 193 * 194 * For more information, see the documentation for {@link Edits}. 195 * 196 * @see getCoarseIterator 197 * @see getFineIterator 198 * @stable ICU 59 199 */ 200 struct U_COMMON_API Iterator U_FINAL : public UMemory { 201 /** 202 * Default constructor, empty iterator. 203 * @stable ICU 60 204 */ IteratorU_FINAL205 Iterator() : 206 array(nullptr), index(0), length(0), 207 remaining(0), onlyChanges_(FALSE), coarse(FALSE), 208 dir(0), changed(FALSE), oldLength_(0), newLength_(0), 209 srcIndex(0), replIndex(0), destIndex(0) {} 210 /** 211 * Copy constructor. 212 * @stable ICU 59 213 */ 214 Iterator(const Iterator &other) = default; 215 /** 216 * Assignment operator. 217 * @stable ICU 59 218 */ 219 Iterator &operator=(const Iterator &other) = default; 220 221 /** 222 * Advances the iterator to the next edit. 223 * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, 224 * or else the function returns immediately. Check for U_FAILURE() 225 * on output or use with function chaining. (See User Guide for details.) 226 * @return TRUE if there is another edit 227 * @stable ICU 59 228 */ nextU_FINAL229 UBool next(UErrorCode &errorCode) { return next(onlyChanges_, errorCode); } 230 231 /** 232 * Moves the iterator to the edit that contains the source index. 233 * The source index may be found in a no-change edit 234 * even if normal iteration would skip no-change edits. 235 * Normal iteration can continue from a found edit. 236 * 237 * The iterator state before this search logically does not matter. 238 * (It may affect the performance of the search.) 239 * 240 * The iterator state after this search is undefined 241 * if the source index is out of bounds for the source string. 242 * 243 * @param i source index 244 * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, 245 * or else the function returns immediately. Check for U_FAILURE() 246 * on output or use with function chaining. (See User Guide for details.) 247 * @return TRUE if the edit for the source index was found 248 * @stable ICU 59 249 */ findSourceIndexU_FINAL250 UBool findSourceIndex(int32_t i, UErrorCode &errorCode) { 251 return findIndex(i, TRUE, errorCode) == 0; 252 } 253 254 /** 255 * Moves the iterator to the edit that contains the destination index. 256 * The destination index may be found in a no-change edit 257 * even if normal iteration would skip no-change edits. 258 * Normal iteration can continue from a found edit. 259 * 260 * The iterator state before this search logically does not matter. 261 * (It may affect the performance of the search.) 262 * 263 * The iterator state after this search is undefined 264 * if the source index is out of bounds for the source string. 265 * 266 * @param i destination index 267 * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, 268 * or else the function returns immediately. Check for U_FAILURE() 269 * on output or use with function chaining. (See User Guide for details.) 270 * @return TRUE if the edit for the destination index was found 271 * @stable ICU 60 272 */ findDestinationIndexU_FINAL273 UBool findDestinationIndex(int32_t i, UErrorCode &errorCode) { 274 return findIndex(i, FALSE, errorCode) == 0; 275 } 276 277 /** 278 * Computes the destination index corresponding to the given source index. 279 * If the source index is inside a change edit (not at its start), 280 * then the destination index at the end of that edit is returned, 281 * since there is no information about index mapping inside a change edit. 282 * 283 * (This means that indexes to the start and middle of an edit, 284 * for example around a grapheme cluster, are mapped to indexes 285 * encompassing the entire edit. 286 * The alternative, mapping an interior index to the start, 287 * would map such an interval to an empty one.) 288 * 289 * This operation will usually but not always modify this object. 290 * The iterator state after this search is undefined. 291 * 292 * @param i source index 293 * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, 294 * or else the function returns immediately. Check for U_FAILURE() 295 * on output or use with function chaining. (See User Guide for details.) 296 * @return destination index; undefined if i is not 0..string length 297 * @stable ICU 60 298 */ 299 int32_t destinationIndexFromSourceIndex(int32_t i, UErrorCode &errorCode); 300 301 /** 302 * Computes the source index corresponding to the given destination index. 303 * If the destination index is inside a change edit (not at its start), 304 * then the source index at the end of that edit is returned, 305 * since there is no information about index mapping inside a change edit. 306 * 307 * (This means that indexes to the start and middle of an edit, 308 * for example around a grapheme cluster, are mapped to indexes 309 * encompassing the entire edit. 310 * The alternative, mapping an interior index to the start, 311 * would map such an interval to an empty one.) 312 * 313 * This operation will usually but not always modify this object. 314 * The iterator state after this search is undefined. 315 * 316 * @param i destination index 317 * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, 318 * or else the function returns immediately. Check for U_FAILURE() 319 * on output or use with function chaining. (See User Guide for details.) 320 * @return source index; undefined if i is not 0..string length 321 * @stable ICU 60 322 */ 323 int32_t sourceIndexFromDestinationIndex(int32_t i, UErrorCode &errorCode); 324 325 /** 326 * Returns whether the edit currently represented by the iterator is a change edit. 327 * 328 * @return TRUE if this edit replaces oldLength() units with newLength() different ones. 329 * FALSE if oldLength units remain unchanged. 330 * @stable ICU 59 331 */ hasChangeU_FINAL332 UBool hasChange() const { return changed; } 333 334 /** 335 * The length of the current span in the source string, which starts at {@link #sourceIndex}. 336 * 337 * @return the number of units in the original string which are replaced or remain unchanged. 338 * @stable ICU 59 339 */ oldLengthU_FINAL340 int32_t oldLength() const { return oldLength_; } 341 342 /** 343 * The length of the current span in the destination string, which starts at 344 * {@link #destinationIndex}, or in the replacement string, which starts at 345 * {@link #replacementIndex}. 346 * 347 * @return the number of units in the modified string, if hasChange() is TRUE. 348 * Same as oldLength if hasChange() is FALSE. 349 * @stable ICU 59 350 */ newLengthU_FINAL351 int32_t newLength() const { return newLength_; } 352 353 /** 354 * The start index of the current span in the source string; the span has length 355 * {@link #oldLength}. 356 * 357 * @return the current index into the source string 358 * @stable ICU 59 359 */ sourceIndexU_FINAL360 int32_t sourceIndex() const { return srcIndex; } 361 362 /** 363 * The start index of the current span in the replacement string; the span has length 364 * {@link #newLength}. Well-defined only if the current edit is a change edit. 365 * 366 * The *replacement string* is the concatenation of all substrings of the destination 367 * string corresponding to change edits. 368 * 369 * This method is intended to be used together with operations that write only replacement 370 * characters (e.g. operations specifying the \ref U_OMIT_UNCHANGED_TEXT option). 371 * The source string can then be modified in-place. 372 * 373 * @return the current index into the replacement-characters-only string, 374 * not counting unchanged spans 375 * @stable ICU 59 376 */ replacementIndexU_FINAL377 int32_t replacementIndex() const { 378 // TODO: Throw an exception if we aren't in a change edit? 379 return replIndex; 380 } 381 382 /** 383 * The start index of the current span in the destination string; the span has length 384 * {@link #newLength}. 385 * 386 * @return the current index into the full destination string 387 * @stable ICU 59 388 */ destinationIndexU_FINAL389 int32_t destinationIndex() const { return destIndex; } 390 391 #ifndef U_HIDE_INTERNAL_API 392 /** 393 * A string representation of the current edit represented by the iterator for debugging. You 394 * should not depend on the contents of the return string. 395 * @internal 396 */ 397 UnicodeString& toString(UnicodeString& appendTo) const; 398 #endif // U_HIDE_INTERNAL_API 399 400 private: 401 friend class Edits; 402 403 Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs); 404 405 int32_t readLength(int32_t head); 406 void updateNextIndexes(); 407 void updatePreviousIndexes(); 408 UBool noNext(); 409 UBool next(UBool onlyChanges, UErrorCode &errorCode); 410 UBool previous(UErrorCode &errorCode); 411 /** @return -1: error or i<0; 0: found; 1: i>=string length */ 412 int32_t findIndex(int32_t i, UBool findSource, UErrorCode &errorCode); 413 414 const uint16_t *array; 415 int32_t index, length; 416 // 0 if we are not within compressed equal-length changes. 417 // Otherwise the number of remaining changes, including the current one. 418 int32_t remaining; 419 UBool onlyChanges_, coarse; 420 421 int8_t dir; // iteration direction: back(<0), initial(0), forward(>0) 422 UBool changed; 423 int32_t oldLength_, newLength_; 424 int32_t srcIndex, replIndex, destIndex; 425 }; 426 427 /** 428 * Returns an Iterator for coarse-grained change edits 429 * (adjacent change edits are treated as one). 430 * Can be used to perform simple string updates. 431 * Skips no-change edits. 432 * @return an Iterator that merges adjacent changes. 433 * @stable ICU 59 434 */ getCoarseChangesIterator()435 Iterator getCoarseChangesIterator() const { 436 return Iterator(array, length, TRUE, TRUE); 437 } 438 439 /** 440 * Returns an Iterator for coarse-grained change and no-change edits 441 * (adjacent change edits are treated as one). 442 * Can be used to perform simple string updates. 443 * Adjacent change edits are treated as one edit. 444 * @return an Iterator that merges adjacent changes. 445 * @stable ICU 59 446 */ getCoarseIterator()447 Iterator getCoarseIterator() const { 448 return Iterator(array, length, FALSE, TRUE); 449 } 450 451 /** 452 * Returns an Iterator for fine-grained change edits 453 * (full granularity of change edits is retained). 454 * Can be used for modifying styled text. 455 * Skips no-change edits. 456 * @return an Iterator that separates adjacent changes. 457 * @stable ICU 59 458 */ getFineChangesIterator()459 Iterator getFineChangesIterator() const { 460 return Iterator(array, length, TRUE, FALSE); 461 } 462 463 /** 464 * Returns an Iterator for fine-grained change and no-change edits 465 * (full granularity of change edits is retained). 466 * Can be used for modifying styled text. 467 * @return an Iterator that separates adjacent changes. 468 * @stable ICU 59 469 */ getFineIterator()470 Iterator getFineIterator() const { 471 return Iterator(array, length, FALSE, FALSE); 472 } 473 474 /** 475 * Merges the two input Edits and appends the result to this object. 476 * 477 * Consider two string transformations (for example, normalization and case mapping) 478 * where each records Edits in addition to writing an output string.<br> 479 * Edits ab reflect how substrings of input string a 480 * map to substrings of intermediate string b.<br> 481 * Edits bc reflect how substrings of intermediate string b 482 * map to substrings of output string c.<br> 483 * This function merges ab and bc such that the additional edits 484 * recorded in this object reflect how substrings of input string a 485 * map to substrings of output string c. 486 * 487 * If unrelated Edits are passed in where the output string of the first 488 * has a different length than the input string of the second, 489 * then a U_ILLEGAL_ARGUMENT_ERROR is reported. 490 * 491 * @param ab reflects how substrings of input string a 492 * map to substrings of intermediate string b. 493 * @param bc reflects how substrings of intermediate string b 494 * map to substrings of output string c. 495 * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test, 496 * or else the function returns immediately. Check for U_FAILURE() 497 * on output or use with function chaining. (See User Guide for details.) 498 * @return *this, with the merged edits appended 499 * @stable ICU 60 500 */ 501 Edits &mergeAndAppend(const Edits &ab, const Edits &bc, UErrorCode &errorCode); 502 503 private: 504 void releaseArray() U_NOEXCEPT; 505 Edits ©Array(const Edits &other); 506 Edits &moveArray(Edits &src) U_NOEXCEPT; 507 setLastUnit(int32_t last)508 void setLastUnit(int32_t last) { array[length - 1] = (uint16_t)last; } lastUnit()509 int32_t lastUnit() const { return length > 0 ? array[length - 1] : 0xffff; } 510 511 void append(int32_t r); 512 UBool growArray(); 513 514 static const int32_t STACK_CAPACITY = 100; 515 uint16_t *array; 516 int32_t capacity; 517 int32_t length; 518 int32_t delta; 519 int32_t numChanges; 520 UErrorCode errorCode_; 521 uint16_t stackArray[STACK_CAPACITY]; 522 }; 523 524 U_NAMESPACE_END 525 526 #endif // __EDITS_H__ 527