1 // © 2017 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 package com.ibm.icu.text; 4 5 import java.nio.BufferOverflowException; 6 import java.util.Arrays; 7 8 /** 9 * Records lengths of string edits but not replacement text. Supports replacements, insertions, deletions 10 * in linear progression. Does not support moving/reordering of text. 11 * <p> 12 * There are two types of edits: <em>change edits</em> and <em>no-change edits</em>. Add edits to 13 * instances of this class using {@link #addReplace(int, int)} (for change edits) and 14 * {@link #addUnchanged(int)} (for no-change edits). Change edits are retained with full granularity, 15 * whereas adjacent no-change edits are always merged together. In no-change edits, there is a one-to-one 16 * mapping between code points in the source and destination strings. 17 * <p> 18 * After all edits have been added, instances of this class should be considered immutable, and an 19 * {@link Edits.Iterator} can be used for queries. 20 * <p> 21 * There are four flavors of Edits.Iterator: 22 * <ul> 23 * <li>{@link #getFineIterator()} retains full granularity of change edits. 24 * <li>{@link #getFineChangesIterator()} retains full granularity of change edits, and when calling 25 * next() on the iterator, skips over no-change edits (unchanged regions). 26 * <li>{@link #getCoarseIterator()} treats adjacent change edits as a single edit. (Adjacent no-change 27 * edits are automatically merged during the construction phase.) 28 * <li>{@link #getCoarseChangesIterator()} treats adjacent change edits as a single edit, and when 29 * calling next() on the iterator, skips over no-change edits (unchanged regions). 30 * </ul> 31 * <p> 32 * For example, consider the string "abcßDeF", which case-folds to "abcssdef". This string has the 33 * following fine edits: 34 * <ul> 35 * <li>abc ⇨ abc (no-change) 36 * <li>ß ⇨ ss (change) 37 * <li>D ⇨ d (change) 38 * <li>e ⇨ e (no-change) 39 * <li>F ⇨ f (change) 40 * </ul> 41 * and the following coarse edits (note how adjacent change edits get merged together): 42 * <ul> 43 * <li>abc ⇨ abc (no-change) 44 * <li>ßD ⇨ ssd (change) 45 * <li>e ⇨ e (no-change) 46 * <li>F ⇨ f (change) 47 * </ul> 48 * <p> 49 * The "fine changes" and "coarse changes" iterators will step through only the change edits when their 50 * {@link Edits.Iterator#next()} methods are called. They are identical to the non-change iterators when 51 * their {@link Edits.Iterator#findSourceIndex(int)} or {@link Edits.Iterator#findDestinationIndex(int)} 52 * methods are used to walk through the string. 53 * <p> 54 * For examples of how to use this class, see the test <code>TestCaseMapEditsIteratorDocs</code> in 55 * UCharacterCaseTest.java. 56 * 57 * @stable ICU 59 58 */ 59 public final class Edits { 60 // 0000uuuuuuuuuuuu records u+1 unchanged text units. 61 private static final int MAX_UNCHANGED_LENGTH = 0x1000; 62 private static final int MAX_UNCHANGED = MAX_UNCHANGED_LENGTH - 1; 63 64 // 0mmmnnnccccccccc with m=1..6 records ccc+1 replacements of m:n text units. 65 private static final int MAX_SHORT_CHANGE_OLD_LENGTH = 6; 66 private static final int MAX_SHORT_CHANGE_NEW_LENGTH = 7; 67 private static final int SHORT_CHANGE_NUM_MASK = 0x1ff; 68 private static final int MAX_SHORT_CHANGE = 0x6fff; 69 70 // 0111mmmmmmnnnnnn records a replacement of m text units with n. 71 // m or n = 61: actual length follows in the next edits array unit. 72 // m or n = 62..63: actual length follows in the next two edits array units. 73 // Bit 30 of the actual length is in the head unit. 74 // Trailing units have bit 15 set. 75 private static final int LENGTH_IN_1TRAIL = 61; 76 private static final int LENGTH_IN_2TRAIL = 62; 77 78 private static final int STACK_CAPACITY = 100; 79 private char[] array; 80 private int length; 81 private int delta; 82 private int numChanges; 83 84 /** 85 * Constructs an empty object. 86 * @stable ICU 59 87 */ Edits()88 public Edits() { 89 array = new char[STACK_CAPACITY]; 90 } 91 92 /** 93 * Resets the data but may not release memory. 94 * @stable ICU 59 95 */ reset()96 public void reset() { 97 length = delta = numChanges = 0; 98 } 99 setLastUnit(int last)100 private void setLastUnit(int last) { 101 array[length - 1] = (char)last; 102 } lastUnit()103 private int lastUnit() { 104 return length > 0 ? array[length - 1] : 0xffff; 105 } 106 107 /** 108 * Adds a no-change edit: a record for an unchanged segment of text. 109 * Normally called from inside ICU string transformation functions, not user code. 110 * @stable ICU 59 111 */ addUnchanged(int unchangedLength)112 public void addUnchanged(int unchangedLength) { 113 if(unchangedLength < 0) { 114 throw new IllegalArgumentException( 115 "addUnchanged(" + unchangedLength + "): length must not be negative"); 116 } 117 // Merge into previous unchanged-text record, if any. 118 int last = lastUnit(); 119 if(last < MAX_UNCHANGED) { 120 int remaining = MAX_UNCHANGED - last; 121 if (remaining >= unchangedLength) { 122 setLastUnit(last + unchangedLength); 123 return; 124 } 125 setLastUnit(MAX_UNCHANGED); 126 unchangedLength -= remaining; 127 } 128 // Split large lengths into multiple units. 129 while(unchangedLength >= MAX_UNCHANGED_LENGTH) { 130 append(MAX_UNCHANGED); 131 unchangedLength -= MAX_UNCHANGED_LENGTH; 132 } 133 // Write a small (remaining) length. 134 if(unchangedLength > 0) { 135 append(unchangedLength - 1); 136 } 137 } 138 139 /** 140 * Adds a change edit: a record for a text replacement/insertion/deletion. 141 * Normally called from inside ICU string transformation functions, not user code. 142 * @stable ICU 59 143 */ addReplace(int oldLength, int newLength)144 public void addReplace(int oldLength, int newLength) { 145 if(oldLength < 0 || newLength < 0) { 146 throw new IllegalArgumentException( 147 "addReplace(" + oldLength + ", " + newLength + 148 "): both lengths must be non-negative"); 149 } 150 if (oldLength == 0 && newLength == 0) { 151 return; 152 } 153 ++numChanges; 154 int newDelta = newLength - oldLength; 155 if (newDelta != 0) { 156 if ((newDelta > 0 && delta >= 0 && newDelta > (Integer.MAX_VALUE - delta)) || 157 (newDelta < 0 && delta < 0 && newDelta < (Integer.MIN_VALUE - delta))) { 158 // Integer overflow or underflow. 159 throw new IndexOutOfBoundsException(); 160 } 161 delta += newDelta; 162 } 163 164 if(0 < oldLength && oldLength <= MAX_SHORT_CHANGE_OLD_LENGTH && 165 newLength <= MAX_SHORT_CHANGE_NEW_LENGTH) { 166 // Merge into previous same-lengths short-replacement record, if any. 167 int u = (oldLength << 12) | (newLength << 9); 168 int last = lastUnit(); 169 if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE && 170 (last & ~SHORT_CHANGE_NUM_MASK) == u && 171 (last & SHORT_CHANGE_NUM_MASK) < SHORT_CHANGE_NUM_MASK) { 172 setLastUnit(last + 1); 173 return; 174 } 175 append(u); 176 return; 177 } 178 179 int head = 0x7000; 180 if (oldLength < LENGTH_IN_1TRAIL && newLength < LENGTH_IN_1TRAIL) { 181 head |= oldLength << 6; 182 head |= newLength; 183 append(head); 184 } else if ((array.length - length) >= 5 || growArray()) { 185 int limit = length + 1; 186 if(oldLength < LENGTH_IN_1TRAIL) { 187 head |= oldLength << 6; 188 } else if(oldLength <= 0x7fff) { 189 head |= LENGTH_IN_1TRAIL << 6; 190 array[limit++] = (char)(0x8000 | oldLength); 191 } else { 192 head |= (LENGTH_IN_2TRAIL + (oldLength >> 30)) << 6; 193 array[limit++] = (char)(0x8000 | (oldLength >> 15)); 194 array[limit++] = (char)(0x8000 | oldLength); 195 } 196 if(newLength < LENGTH_IN_1TRAIL) { 197 head |= newLength; 198 } else if(newLength <= 0x7fff) { 199 head |= LENGTH_IN_1TRAIL; 200 array[limit++] = (char)(0x8000 | newLength); 201 } else { 202 head |= LENGTH_IN_2TRAIL + (newLength >> 30); 203 array[limit++] = (char)(0x8000 | (newLength >> 15)); 204 array[limit++] = (char)(0x8000 | newLength); 205 } 206 array[length] = (char)head; 207 length = limit; 208 } 209 } 210 append(int r)211 private void append(int r) { 212 if(length < array.length || growArray()) { 213 array[length++] = (char)r; 214 } 215 } 216 growArray()217 private boolean growArray() { 218 int newCapacity; 219 if (array.length == STACK_CAPACITY) { 220 newCapacity = 2000; 221 } else if (array.length == Integer.MAX_VALUE) { 222 throw new BufferOverflowException(); 223 } else if (array.length >= (Integer.MAX_VALUE / 2)) { 224 newCapacity = Integer.MAX_VALUE; 225 } else { 226 newCapacity = 2 * array.length; 227 } 228 // Grow by at least 5 units so that a maximal change record will fit. 229 if ((newCapacity - array.length) < 5) { 230 throw new BufferOverflowException(); 231 } 232 array = Arrays.copyOf(array, newCapacity); 233 return true; 234 } 235 236 /** 237 * How much longer is the new text compared with the old text? 238 * @return new length minus old length 239 * @stable ICU 59 240 */ lengthDelta()241 public int lengthDelta() { return delta; } 242 /** 243 * @return true if there are any change edits 244 * @stable ICU 59 245 */ hasChanges()246 public boolean hasChanges() { return numChanges != 0; } 247 248 /** 249 * @return the number of change edits 250 * @stable ICU 60 251 */ numberOfChanges()252 public int numberOfChanges() { return numChanges; } 253 254 /** 255 * Access to the list of edits. 256 * <p> 257 * At any moment in time, an instance of this class points to a single edit: a "window" into a span 258 * of the source string and the corresponding span of the destination string. The source string span 259 * starts at {@link #sourceIndex()} and runs for {@link #oldLength()} chars; the destination string 260 * span starts at {@link #destinationIndex()} and runs for {@link #newLength()} chars. 261 * <p> 262 * The iterator can be moved between edits using the {@link #next()}, {@link #findSourceIndex(int)}, 263 * and {@link #findDestinationIndex(int)} methods. Calling any of these methods mutates the iterator 264 * to make it point to the corresponding edit. 265 * <p> 266 * For more information, see the documentation for {@link Edits}. 267 * <p> 268 * Note: Although this class is called "Iterator", it does not implement {@link java.util.Iterator}. 269 * 270 * @see #getCoarseIterator 271 * @see #getFineIterator 272 * @stable ICU 59 273 */ 274 public static final class Iterator { 275 private final char[] array; 276 private int index; 277 private final int length; 278 /** 279 * 0 if we are not within compressed equal-length changes. 280 * Otherwise the number of remaining changes, including the current one. 281 */ 282 private int remaining; 283 private final boolean onlyChanges_, coarse; 284 285 private int dir; // iteration direction: back(<0), initial(0), forward(>0) 286 private boolean changed; 287 private int oldLength_, newLength_; 288 private int srcIndex, replIndex, destIndex; 289 Iterator(char[] a, int len, boolean oc, boolean crs)290 private Iterator(char[] a, int len, boolean oc, boolean crs) { 291 array = a; 292 length = len; 293 onlyChanges_ = oc; 294 coarse = crs; 295 } 296 readLength(int head)297 private int readLength(int head) { 298 if (head < LENGTH_IN_1TRAIL) { 299 return head; 300 } else if (head < LENGTH_IN_2TRAIL) { 301 assert(index < length); 302 assert(array[index] >= 0x8000); 303 return array[index++] & 0x7fff; 304 } else { 305 assert((index + 2) <= length); 306 assert(array[index] >= 0x8000); 307 assert(array[index + 1] >= 0x8000); 308 int len = ((head & 1) << 30) | 309 ((array[index] & 0x7fff) << 15) | 310 (array[index + 1] & 0x7fff); 311 index += 2; 312 return len; 313 } 314 } 315 updateNextIndexes()316 private void updateNextIndexes() { 317 srcIndex += oldLength_; 318 if (changed) { 319 replIndex += newLength_; 320 } 321 destIndex += newLength_; 322 } 323 updatePreviousIndexes()324 private void updatePreviousIndexes() { 325 srcIndex -= oldLength_; 326 if (changed) { 327 replIndex -= newLength_; 328 } 329 destIndex -= newLength_; 330 } 331 noNext()332 private boolean noNext() { 333 // No change before or beyond the string. 334 dir = 0; 335 changed = false; 336 oldLength_ = newLength_ = 0; 337 return false; 338 } 339 340 /** 341 * Advances the iterator to the next edit. 342 * @return true if there is another edit 343 * @stable ICU 59 344 */ next()345 public boolean next() { 346 return next(onlyChanges_); 347 } 348 next(boolean onlyChanges)349 private boolean next(boolean onlyChanges) { 350 // Forward iteration: Update the string indexes to the limit of the current span, 351 // and post-increment-read array units to assemble a new span. 352 // Leaves the array index one after the last unit of that span. 353 if (dir > 0) { 354 updateNextIndexes(); 355 } else { 356 if (dir < 0) { 357 // Turn around from previous() to next(). 358 // Post-increment-read the same span again. 359 if (remaining > 0) { 360 // Fine-grained iterator: 361 // Stay on the current one of a sequence of compressed changes. 362 ++index; // next() rests on the index after the sequence unit. 363 dir = 1; 364 return true; 365 } 366 } 367 dir = 1; 368 } 369 if (remaining >= 1) { 370 // Fine-grained iterator: Continue a sequence of compressed changes. 371 if (remaining > 1) { 372 --remaining; 373 return true; 374 } 375 remaining = 0; 376 } 377 if (index >= length) { 378 return noNext(); 379 } 380 int u = array[index++]; 381 if (u <= MAX_UNCHANGED) { 382 // Combine adjacent unchanged ranges. 383 changed = false; 384 oldLength_ = u + 1; 385 while (index < length && (u = array[index]) <= MAX_UNCHANGED) { 386 ++index; 387 oldLength_ += u + 1; 388 } 389 newLength_ = oldLength_; 390 if (onlyChanges) { 391 updateNextIndexes(); 392 if (index >= length) { 393 return noNext(); 394 } 395 // already fetched u > MAX_UNCHANGED at index 396 ++index; 397 } else { 398 return true; 399 } 400 } 401 changed = true; 402 if (u <= MAX_SHORT_CHANGE) { 403 int oldLen = u >> 12; 404 int newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH; 405 int num = (u & SHORT_CHANGE_NUM_MASK) + 1; 406 if (coarse) { 407 oldLength_ = num * oldLen; 408 newLength_ = num * newLen; 409 } else { 410 // Split a sequence of changes that was compressed into one unit. 411 oldLength_ = oldLen; 412 newLength_ = newLen; 413 if (num > 1) { 414 remaining = num; // This is the first of two or more changes. 415 } 416 return true; 417 } 418 } else { 419 assert(u <= 0x7fff); 420 oldLength_ = readLength((u >> 6) & 0x3f); 421 newLength_ = readLength(u & 0x3f); 422 if (!coarse) { 423 return true; 424 } 425 } 426 // Combine adjacent changes. 427 while (index < length && (u = array[index]) > MAX_UNCHANGED) { 428 ++index; 429 if (u <= MAX_SHORT_CHANGE) { 430 int num = (u & SHORT_CHANGE_NUM_MASK) + 1; 431 oldLength_ += (u >> 12) * num; 432 newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num; 433 } else { 434 assert(u <= 0x7fff); 435 oldLength_ += readLength((u >> 6) & 0x3f); 436 newLength_ += readLength(u & 0x3f); 437 } 438 } 439 return true; 440 } 441 previous()442 private boolean previous() { 443 // Backward iteration: Pre-decrement-read array units to assemble a new span, 444 // then update the string indexes to the start of that span. 445 // Leaves the array index on the head unit of that span. 446 if (dir >= 0) { 447 if (dir > 0) { 448 // Turn around from next() to previous(). 449 // Set the string indexes to the span limit and 450 // pre-decrement-read the same span again. 451 if (remaining > 0) { 452 // Fine-grained iterator: 453 // Stay on the current one of a sequence of compressed changes. 454 --index; // previous() rests on the sequence unit. 455 dir = -1; 456 return true; 457 } 458 updateNextIndexes(); 459 } 460 dir = -1; 461 } 462 if (remaining > 0) { 463 // Fine-grained iterator: Continue a sequence of compressed changes. 464 int u = array[index]; 465 assert(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE); 466 if (remaining <= (u & SHORT_CHANGE_NUM_MASK)) { 467 ++remaining; 468 updatePreviousIndexes(); 469 return true; 470 } 471 remaining = 0; 472 } 473 if (index <= 0) { 474 return noNext(); 475 } 476 int u = array[--index]; 477 if (u <= MAX_UNCHANGED) { 478 // Combine adjacent unchanged ranges. 479 changed = false; 480 oldLength_ = u + 1; 481 while (index > 0 && (u = array[index - 1]) <= MAX_UNCHANGED) { 482 --index; 483 oldLength_ += u + 1; 484 } 485 newLength_ = oldLength_; 486 // No need to handle onlyChanges as long as previous() is called only from findIndex(). 487 updatePreviousIndexes(); 488 return true; 489 } 490 changed = true; 491 if (u <= MAX_SHORT_CHANGE) { 492 int oldLen = u >> 12; 493 int newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH; 494 int num = (u & SHORT_CHANGE_NUM_MASK) + 1; 495 if (coarse) { 496 oldLength_ = num * oldLen; 497 newLength_ = num * newLen; 498 } else { 499 // Split a sequence of changes that was compressed into one unit. 500 oldLength_ = oldLen; 501 newLength_ = newLen; 502 if (num > 1) { 503 remaining = 1; // This is the last of two or more changes. 504 } 505 updatePreviousIndexes(); 506 return true; 507 } 508 } else { 509 if (u <= 0x7fff) { 510 // The change is encoded in u alone. 511 oldLength_ = readLength((u >> 6) & 0x3f); 512 newLength_ = readLength(u & 0x3f); 513 } else { 514 // Back up to the head of the change, read the lengths, 515 // and reset the index to the head again. 516 assert(index > 0); 517 while ((u = array[--index]) > 0x7fff) {} 518 assert(u > MAX_SHORT_CHANGE); 519 int headIndex = index++; 520 oldLength_ = readLength((u >> 6) & 0x3f); 521 newLength_ = readLength(u & 0x3f); 522 index = headIndex; 523 } 524 if (!coarse) { 525 updatePreviousIndexes(); 526 return true; 527 } 528 } 529 // Combine adjacent changes. 530 while (index > 0 && (u = array[index - 1]) > MAX_UNCHANGED) { 531 --index; 532 if (u <= MAX_SHORT_CHANGE) { 533 int num = (u & SHORT_CHANGE_NUM_MASK) + 1; 534 oldLength_ += (u >> 12) * num; 535 newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num; 536 } else if (u <= 0x7fff) { 537 // Read the lengths, and reset the index to the head again. 538 int headIndex = index++; 539 oldLength_ += readLength((u >> 6) & 0x3f); 540 newLength_ += readLength(u & 0x3f); 541 index = headIndex; 542 } 543 } 544 updatePreviousIndexes(); 545 return true; 546 } 547 548 /** 549 * Moves the iterator to the edit that contains the source index. 550 * The source index may be found in a no-change edit 551 * even if normal iteration would skip no-change edits. 552 * Normal iteration can continue from a found edit. 553 * 554 * <p>The iterator state before this search logically does not matter. 555 * (It may affect the performance of the search.) 556 * 557 * <p>The iterator state after this search is undefined 558 * if the source index is out of bounds for the source string. 559 * 560 * @param i source index 561 * @return true if the edit for the source index was found 562 * @stable ICU 59 563 */ findSourceIndex(int i)564 public boolean findSourceIndex(int i) { 565 return findIndex(i, true) == 0; 566 } 567 568 /** 569 * Moves the iterator to the edit that contains the destination index. 570 * The destination index may be found in a no-change edit 571 * even if normal iteration would skip no-change edits. 572 * Normal iteration can continue from a found edit. 573 * 574 * <p>The iterator state before this search logically does not matter. 575 * (It may affect the performance of the search.) 576 * 577 * <p>The iterator state after this search is undefined 578 * if the source index is out of bounds for the source string. 579 * 580 * @param i destination index 581 * @return true if the edit for the destination index was found 582 * @stable ICU 60 583 */ findDestinationIndex(int i)584 public boolean findDestinationIndex(int i) { 585 return findIndex(i, false) == 0; 586 } 587 588 /** @return -1: error or i<0; 0: found; 1: i>=string length */ findIndex(int i, boolean findSource)589 private int findIndex(int i, boolean findSource) { 590 if (i < 0) { return -1; } 591 int spanStart, spanLength; 592 if (findSource) { // find source index 593 spanStart = srcIndex; 594 spanLength = oldLength_; 595 } else { // find destination index 596 spanStart = destIndex; 597 spanLength = newLength_; 598 } 599 if (i < spanStart) { 600 if (i >= (spanStart / 2)) { 601 // Search backwards. 602 for (;;) { 603 boolean hasPrevious = previous(); 604 assert(hasPrevious); // because i>=0 and the first span starts at 0 605 spanStart = findSource ? srcIndex : destIndex; 606 if (i >= spanStart) { 607 // The index is in the current span. 608 return 0; 609 } 610 if (remaining > 0) { 611 // Is the index in one of the remaining compressed edits? 612 // spanStart is the start of the current span, first of the remaining ones. 613 spanLength = findSource ? oldLength_ : newLength_; 614 int u = array[index]; 615 assert(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE); 616 int num = (u & SHORT_CHANGE_NUM_MASK) + 1 - remaining; 617 int len = num * spanLength; 618 if (i >= (spanStart - len)) { 619 int n = ((spanStart - i - 1) / spanLength) + 1; 620 // 1 <= n <= num 621 srcIndex -= n * oldLength_; 622 replIndex -= n * newLength_; 623 destIndex -= n * newLength_; 624 remaining += n; 625 return 0; 626 } 627 // Skip all of these edits at once. 628 srcIndex -= num * oldLength_; 629 replIndex -= num * newLength_; 630 destIndex -= num * newLength_; 631 remaining = 0; 632 } 633 } 634 } 635 // Reset the iterator to the start. 636 dir = 0; 637 index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0; 638 } else if (i < (spanStart + spanLength)) { 639 // The index is in the current span. 640 return 0; 641 } 642 while (next(false)) { 643 if (findSource) { 644 spanStart = srcIndex; 645 spanLength = oldLength_; 646 } else { 647 spanStart = destIndex; 648 spanLength = newLength_; 649 } 650 if (i < (spanStart + spanLength)) { 651 // The index is in the current span. 652 return 0; 653 } 654 if (remaining > 1) { 655 // Is the index in one of the remaining compressed edits? 656 // spanStart is the start of the current span, first of the remaining ones. 657 int len = remaining * spanLength; 658 if (i < (spanStart + len)) { 659 int n = (i - spanStart) / spanLength; // 1 <= n <= remaining - 1 660 srcIndex += n * oldLength_; 661 replIndex += n * newLength_; 662 destIndex += n * newLength_; 663 remaining -= n; 664 return 0; 665 } 666 // Make next() skip all of these edits at once. 667 oldLength_ *= remaining; 668 newLength_ *= remaining; 669 remaining = 0; 670 } 671 } 672 return 1; 673 } 674 675 /** 676 * Computes the destination index corresponding to the given source index. 677 * If the source index is inside a change edit (not at its start), 678 * then the destination index at the end of that edit is returned, 679 * since there is no information about index mapping inside a change edit. 680 * 681 * <p>(This means that indexes to the start and middle of an edit, 682 * for example around a grapheme cluster, are mapped to indexes 683 * encompassing the entire edit. 684 * The alternative, mapping an interior index to the start, 685 * would map such an interval to an empty one.) 686 * 687 * <p>This operation will usually but not always modify this object. 688 * The iterator state after this search is undefined. 689 * 690 * @param i source index 691 * @return destination index; undefined if i is not 0..string length 692 * @stable ICU 60 693 */ 694 public int destinationIndexFromSourceIndex(int i) { 695 int where = findIndex(i, true); 696 if (where < 0) { 697 // Error or before the string. 698 return 0; 699 } 700 if (where > 0 || i == srcIndex) { 701 // At or after string length, or at start of the found span. 702 return destIndex; 703 } 704 if (changed) { 705 // In a change span, map to its end. 706 return destIndex + newLength_; 707 } else { 708 // In an unchanged span, offset 1:1 within it. 709 return destIndex + (i - srcIndex); 710 } 711 } 712 713 /** 714 * Computes the source index corresponding to the given destination index. 715 * If the destination index is inside a change edit (not at its start), 716 * then the source index at the end of that edit is returned, 717 * since there is no information about index mapping inside a change edit. 718 * 719 * <p>(This means that indexes to the start and middle of an edit, 720 * for example around a grapheme cluster, are mapped to indexes 721 * encompassing the entire edit. 722 * The alternative, mapping an interior index to the start, 723 * would map such an interval to an empty one.) 724 * 725 * <p>This operation will usually but not always modify this object. 726 * The iterator state after this search is undefined. 727 * 728 * @param i destination index 729 * @return source index; undefined if i is not 0..string length 730 * @stable ICU 60 731 */ 732 public int sourceIndexFromDestinationIndex(int i) { 733 int where = findIndex(i, false); 734 if (where < 0) { 735 // Error or before the string. 736 return 0; 737 } 738 if (where > 0 || i == destIndex) { 739 // At or after string length, or at start of the found span. 740 return srcIndex; 741 } 742 if (changed) { 743 // In a change span, map to its end. 744 return srcIndex + oldLength_; 745 } else { 746 // In an unchanged span, offset within it. 747 return srcIndex + (i - destIndex); 748 } 749 } 750 751 /** 752 * Returns whether the edit currently represented by the iterator is a change edit. 753 * 754 * @return true if this edit replaces oldLength() units with newLength() different ones. 755 * false if oldLength units remain unchanged. 756 * @stable ICU 59 757 */ 758 public boolean hasChange() { return changed; } 759 760 /** 761 * The length of the current span in the source string, which starts at {@link #sourceIndex}. 762 * 763 * @return the number of units in the source string which are replaced or remain unchanged. 764 * @stable ICU 59 765 */ 766 public int oldLength() { return oldLength_; } 767 768 /** 769 * The length of the current span in the destination string, which starts at 770 * {@link #destinationIndex}, or in the replacement string, which starts at 771 * {@link #replacementIndex}. 772 * 773 * @return the number of units in the destination string, if hasChange() is true. Same as 774 * oldLength if hasChange() is false. 775 * @stable ICU 59 776 */ 777 public int newLength() { return newLength_; } 778 779 /** 780 * The start index of the current span in the source string; the span has length 781 * {@link #oldLength}. 782 * 783 * @return the current index into the source string 784 * @stable ICU 59 785 */ 786 public int sourceIndex() { return srcIndex; } 787 788 /** 789 * The start index of the current span in the replacement string; the span has length 790 * {@link #newLength}. Well-defined only if the current edit is a change edit. 791 * <p> 792 * The <em>replacement string</em> is the concatenation of all substrings of the destination 793 * string corresponding to change edits. 794 * <p> 795 * This method is intended to be used together with operations that write only replacement 796 * characters (e.g., {@link CaseMap#omitUnchangedText()}). The source string can then be modified 797 * in-place. 798 * 799 * @return the current index into the replacement-characters-only string, not counting unchanged 800 * spans 801 * @stable ICU 59 802 */ 803 public int replacementIndex() { 804 // TODO: Throw an exception if we aren't in a change edit? 805 return replIndex; 806 } 807 808 /** 809 * The start index of the current span in the destination string; the span has length 810 * {@link #newLength}. 811 * 812 * @return the current index into the full destination string 813 * @stable ICU 59 814 */ 815 public int destinationIndex() { return destIndex; } 816 817 /** 818 * A string representation of the current edit represented by the iterator for debugging. You 819 * should not depend on the contents of the return string; it may change over time. 820 * @return a string representation of the object. 821 * @stable ICU 59 822 */ 823 @Override 824 public String toString() { 825 StringBuilder sb = new StringBuilder(); 826 sb.append(super.toString()); 827 sb.append("{ src["); 828 sb.append(srcIndex); 829 sb.append(".."); 830 sb.append(srcIndex + oldLength_); 831 if (changed) { 832 sb.append("] ⇝ dest["); 833 } else { 834 sb.append("] ≡ dest["); 835 } 836 sb.append(destIndex); 837 sb.append(".."); 838 sb.append(destIndex + newLength_); 839 if (changed) { 840 sb.append("], repl["); 841 sb.append(replIndex); 842 sb.append(".."); 843 sb.append(replIndex + newLength_); 844 sb.append("] }"); 845 } else { 846 sb.append("] (no-change) }"); 847 } 848 return sb.toString(); 849 } 850 }; 851 852 /** 853 * Returns an Iterator for coarse-grained change edits 854 * (adjacent change edits are treated as one). 855 * Can be used to perform simple string updates. 856 * Skips no-change edits. 857 * @return an Iterator that merges adjacent changes. 858 * @stable ICU 59 859 */ 860 public Iterator getCoarseChangesIterator() { 861 return new Iterator(array, length, true, true); 862 } 863 864 /** 865 * Returns an Iterator for coarse-grained change and no-change edits 866 * (adjacent change edits are treated as one). 867 * Can be used to perform simple string updates. 868 * Adjacent change edits are treated as one edit. 869 * @return an Iterator that merges adjacent changes. 870 * @stable ICU 59 871 */ 872 public Iterator getCoarseIterator() { 873 return new Iterator(array, length, false, true); 874 } 875 876 /** 877 * Returns an Iterator for fine-grained change edits 878 * (full granularity of change edits is retained). 879 * Can be used for modifying styled text. 880 * Skips no-change edits. 881 * @return an Iterator that separates adjacent changes. 882 * @stable ICU 59 883 */ 884 public Iterator getFineChangesIterator() { 885 return new Iterator(array, length, true, false); 886 } 887 888 /** 889 * Returns an Iterator for fine-grained change and no-change edits 890 * (full granularity of change edits is retained). 891 * Can be used for modifying styled text. 892 * @return an Iterator that separates adjacent changes. 893 * @stable ICU 59 894 */ 895 public Iterator getFineIterator() { 896 return new Iterator(array, length, false, false); 897 } 898 899 /** 900 * Merges the two input Edits and appends the result to this object. 901 * 902 * <p>Consider two string transformations (for example, normalization and case mapping) 903 * where each records Edits in addition to writing an output string.<br> 904 * Edits ab reflect how substrings of input string a 905 * map to substrings of intermediate string b.<br> 906 * Edits bc reflect how substrings of intermediate string b 907 * map to substrings of output string c.<br> 908 * This function merges ab and bc such that the additional edits 909 * recorded in this object reflect how substrings of input string a 910 * map to substrings of output string c. 911 * 912 * <p>If unrelated Edits are passed in where the output string of the first 913 * has a different length than the input string of the second, 914 * then an IllegalArgumentException is thrown. 915 * 916 * @param ab reflects how substrings of input string a 917 * map to substrings of intermediate string b. 918 * @param bc reflects how substrings of intermediate string b 919 * map to substrings of output string c. 920 * @return this, with the merged edits appended 921 * @stable ICU 60 922 */ 923 public Edits mergeAndAppend(Edits ab, Edits bc) { 924 // Picture string a --(Edits ab)--> string b --(Edits bc)--> string c. 925 // Parallel iteration over both Edits. 926 Iterator abIter = ab.getFineIterator(); 927 Iterator bcIter = bc.getFineIterator(); 928 boolean abHasNext = true, bcHasNext = true; 929 // Copy iterator state into local variables, so that we can modify and subdivide spans. 930 // ab old & new length, bc old & new length 931 int aLength = 0, ab_bLength = 0, bc_bLength = 0, cLength = 0; 932 // When we have different-intermediate-length changes, we accumulate a larger change. 933 int pending_aLength = 0, pending_cLength = 0; 934 for (;;) { 935 // At this point, for each of the two iterators: 936 // Either we are done with the locally cached current edit, 937 // and its intermediate-string length has been reset, 938 // or we will continue to work with a truncated remainder of this edit. 939 // 940 // If the current edit is done, and the iterator has not yet reached the end, 941 // then we fetch the next edit. This is true for at least one of the iterators. 942 // 943 // Normally it does not matter whether we fetch from ab and then bc or vice versa. 944 // However, the result is observably different when 945 // ab deletions meet bc insertions at the same intermediate-string index. 946 // Some users expect the bc insertions to come first, so we fetch from bc first. 947 if (bc_bLength == 0) { 948 if (bcHasNext && (bcHasNext = bcIter.next())) { 949 bc_bLength = bcIter.oldLength(); 950 cLength = bcIter.newLength(); 951 if (bc_bLength == 0) { 952 // insertion 953 if (ab_bLength == 0 || !abIter.hasChange()) { 954 addReplace(pending_aLength, pending_cLength + cLength); 955 pending_aLength = pending_cLength = 0; 956 } else { 957 pending_cLength += cLength; 958 } 959 continue; 960 } 961 } 962 // else see if the other iterator is done, too. 963 } 964 if (ab_bLength == 0) { 965 if (abHasNext && (abHasNext = abIter.next())) { 966 aLength = abIter.oldLength(); 967 ab_bLength = abIter.newLength(); 968 if (ab_bLength == 0) { 969 // deletion 970 if (bc_bLength == bcIter.oldLength() || !bcIter.hasChange()) { 971 addReplace(pending_aLength + aLength, pending_cLength); 972 pending_aLength = pending_cLength = 0; 973 } else { 974 pending_aLength += aLength; 975 } 976 continue; 977 } 978 } else if (bc_bLength == 0) { 979 // Both iterators are done at the same time: 980 // The intermediate-string lengths match. 981 break; 982 } else { 983 throw new IllegalArgumentException( 984 "The ab output string is shorter than the bc input string."); 985 } 986 } 987 if (bc_bLength == 0) { 988 throw new IllegalArgumentException( 989 "The bc input string is shorter than the ab output string."); 990 } 991 // Done fetching: ab_bLength > 0 && bc_bLength > 0 992 993 // The current state has two parts: 994 // - Past: We accumulate a longer ac edit in the "pending" variables. 995 // - Current: We have copies of the current ab/bc edits in local variables. 996 // At least one side is newly fetched. 997 // One side might be a truncated remainder of an edit we fetched earlier. 998 999 if (!abIter.hasChange() && !bcIter.hasChange()) { 1000 // An unchanged span all the way from string a to string c. 1001 if (pending_aLength != 0 || pending_cLength != 0) { 1002 addReplace(pending_aLength, pending_cLength); 1003 pending_aLength = pending_cLength = 0; 1004 } 1005 int unchangedLength = aLength <= cLength ? aLength : cLength; 1006 addUnchanged(unchangedLength); 1007 ab_bLength = aLength -= unchangedLength; 1008 bc_bLength = cLength -= unchangedLength; 1009 // At least one of the unchanged spans is now empty. 1010 continue; 1011 } 1012 if (!abIter.hasChange() && bcIter.hasChange()) { 1013 // Unchanged a->b but changed b->c. 1014 if (ab_bLength >= bc_bLength) { 1015 // Split the longer unchanged span into change + remainder. 1016 addReplace(pending_aLength + bc_bLength, pending_cLength + cLength); 1017 pending_aLength = pending_cLength = 0; 1018 aLength = ab_bLength -= bc_bLength; 1019 bc_bLength = 0; 1020 continue; 1021 } 1022 // Handle the shorter unchanged span below like a change. 1023 } else if (abIter.hasChange() && !bcIter.hasChange()) { 1024 // Changed a->b and then unchanged b->c. 1025 if (ab_bLength <= bc_bLength) { 1026 // Split the longer unchanged span into change + remainder. 1027 addReplace(pending_aLength + aLength, pending_cLength + ab_bLength); 1028 pending_aLength = pending_cLength = 0; 1029 cLength = bc_bLength -= ab_bLength; 1030 ab_bLength = 0; 1031 continue; 1032 } 1033 // Handle the shorter unchanged span below like a change. 1034 } else { // both abIter.hasChange() && bcIter.hasChange() 1035 if (ab_bLength == bc_bLength) { 1036 // Changes on both sides up to the same position. Emit & reset. 1037 addReplace(pending_aLength + aLength, pending_cLength + cLength); 1038 pending_aLength = pending_cLength = 0; 1039 ab_bLength = bc_bLength = 0; 1040 continue; 1041 } 1042 } 1043 // Accumulate the a->c change, reset the shorter side, 1044 // keep a remainder of the longer one. 1045 pending_aLength += aLength; 1046 pending_cLength += cLength; 1047 if (ab_bLength < bc_bLength) { 1048 bc_bLength -= ab_bLength; 1049 cLength = ab_bLength = 0; 1050 } else { // ab_bLength > bc_bLength 1051 ab_bLength -= bc_bLength; 1052 aLength = bc_bLength = 0; 1053 } 1054 } 1055 if (pending_aLength != 0 || pending_cLength != 0) { 1056 addReplace(pending_aLength, pending_cLength); 1057 } 1058 return this; 1059 } 1060 } 1061