1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2017 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html 4 package android.icu.text; 5 6 import java.nio.BufferOverflowException; 7 import java.util.Arrays; 8 9 /** 10 * Records lengths of string edits but not replacement text. Supports replacements, insertions, deletions 11 * in linear progression. Does not support moving/reordering of text. 12 * <p> 13 * There are two types of edits: <em>change edits</em> and <em>no-change edits</em>. Add edits to 14 * instances of this class using {@link #addReplace(int, int)} (for change edits) and 15 * {@link #addUnchanged(int)} (for no-change edits). Change edits are retained with full granularity, 16 * whereas adjacent no-change edits are always merged together. In no-change edits, there is a one-to-one 17 * mapping between code points in the source and destination strings. 18 * <p> 19 * After all edits have been added, instances of this class should be considered immutable, and an 20 * {@link Edits.Iterator} can be used for queries. 21 * <p> 22 * There are four flavors of Edits.Iterator: 23 * <ul> 24 * <li>{@link #getFineIterator()} retains full granularity of change edits. 25 * <li>{@link #getFineChangesIterator()} retains full granularity of change edits, and when calling 26 * next() on the iterator, skips over no-change edits (unchanged regions). 27 * <li>{@link #getCoarseIterator()} treats adjacent change edits as a single edit. (Adjacent no-change 28 * edits are automatically merged during the construction phase.) 29 * <li>{@link #getCoarseChangesIterator()} treats adjacent change edits as a single edit, and when 30 * calling next() on the iterator, skips over no-change edits (unchanged regions). 31 * </ul> 32 * <p> 33 * For example, consider the string "abcßDeF", which case-folds to "abcssdef". This string has the 34 * following fine edits: 35 * <ul> 36 * <li>abc ⇨ abc (no-change) 37 * <li>ß ⇨ ss (change) 38 * <li>D ⇨ d (change) 39 * <li>e ⇨ e (no-change) 40 * <li>F ⇨ f (change) 41 * </ul> 42 * and the following coarse edits (note how adjacent change edits get merged together): 43 * <ul> 44 * <li>abc ⇨ abc (no-change) 45 * <li>ßD ⇨ ssd (change) 46 * <li>e ⇨ e (no-change) 47 * <li>F ⇨ f (change) 48 * </ul> 49 * <p> 50 * The "fine changes" and "coarse changes" iterators will step through only the change edits when their 51 * {@link Edits.Iterator#next()} methods are called. They are identical to the non-change iterators when 52 * their {@link Edits.Iterator#findSourceIndex(int)} or {@link Edits.Iterator#findDestinationIndex(int)} 53 * methods are used to walk through the string. 54 * <p> 55 * For examples of how to use this class, see the test <code>TestCaseMapEditsIteratorDocs</code> in 56 * UCharacterCaseTest.java. 57 */ 58 public final class Edits { 59 // 0000uuuuuuuuuuuu records u+1 unchanged text units. 60 private static final int MAX_UNCHANGED_LENGTH = 0x1000; 61 private static final int MAX_UNCHANGED = MAX_UNCHANGED_LENGTH - 1; 62 63 // 0mmmnnnccccccccc with m=1..6 records ccc+1 replacements of m:n text units. 64 private static final int MAX_SHORT_CHANGE_OLD_LENGTH = 6; 65 private static final int MAX_SHORT_CHANGE_NEW_LENGTH = 7; 66 private static final int SHORT_CHANGE_NUM_MASK = 0x1ff; 67 private static final int MAX_SHORT_CHANGE = 0x6fff; 68 69 // 0111mmmmmmnnnnnn records a replacement of m text units with n. 70 // m or n = 61: actual length follows in the next edits array unit. 71 // m or n = 62..63: actual length follows in the next two edits array units. 72 // Bit 30 of the actual length is in the head unit. 73 // Trailing units have bit 15 set. 74 private static final int LENGTH_IN_1TRAIL = 61; 75 private static final int LENGTH_IN_2TRAIL = 62; 76 77 private static final int STACK_CAPACITY = 100; 78 private char[] array; 79 private int length; 80 private int delta; 81 private int numChanges; 82 83 /** 84 * Constructs an empty object. 85 */ Edits()86 public Edits() { 87 array = new char[STACK_CAPACITY]; 88 } 89 90 /** 91 * Resets the data but may not release memory. 92 */ reset()93 public void reset() { 94 length = delta = numChanges = 0; 95 } 96 setLastUnit(int last)97 private void setLastUnit(int last) { 98 array[length - 1] = (char)last; 99 } lastUnit()100 private int lastUnit() { 101 return length > 0 ? array[length - 1] : 0xffff; 102 } 103 104 /** 105 * Adds a no-change edit: a record for an unchanged segment of text. 106 * Normally called from inside ICU string transformation functions, not user code. 107 */ addUnchanged(int unchangedLength)108 public void addUnchanged(int unchangedLength) { 109 if(unchangedLength < 0) { 110 throw new IllegalArgumentException( 111 "addUnchanged(" + unchangedLength + "): length must not be negative"); 112 } 113 // Merge into previous unchanged-text record, if any. 114 int last = lastUnit(); 115 if(last < MAX_UNCHANGED) { 116 int remaining = MAX_UNCHANGED - last; 117 if (remaining >= unchangedLength) { 118 setLastUnit(last + unchangedLength); 119 return; 120 } 121 setLastUnit(MAX_UNCHANGED); 122 unchangedLength -= remaining; 123 } 124 // Split large lengths into multiple units. 125 while(unchangedLength >= MAX_UNCHANGED_LENGTH) { 126 append(MAX_UNCHANGED); 127 unchangedLength -= MAX_UNCHANGED_LENGTH; 128 } 129 // Write a small (remaining) length. 130 if(unchangedLength > 0) { 131 append(unchangedLength - 1); 132 } 133 } 134 135 /** 136 * Adds a change edit: a record for a text replacement/insertion/deletion. 137 * Normally called from inside ICU string transformation functions, not user code. 138 */ addReplace(int oldLength, int newLength)139 public void addReplace(int oldLength, int newLength) { 140 if(oldLength < 0 || newLength < 0) { 141 throw new IllegalArgumentException( 142 "addReplace(" + oldLength + ", " + newLength + 143 "): both lengths must be non-negative"); 144 } 145 if (oldLength == 0 && newLength == 0) { 146 return; 147 } 148 ++numChanges; 149 int newDelta = newLength - oldLength; 150 if (newDelta != 0) { 151 if ((newDelta > 0 && delta >= 0 && newDelta > (Integer.MAX_VALUE - delta)) || 152 (newDelta < 0 && delta < 0 && newDelta < (Integer.MIN_VALUE - delta))) { 153 // Integer overflow or underflow. 154 throw new IndexOutOfBoundsException(); 155 } 156 delta += newDelta; 157 } 158 159 if(0 < oldLength && oldLength <= MAX_SHORT_CHANGE_OLD_LENGTH && 160 newLength <= MAX_SHORT_CHANGE_NEW_LENGTH) { 161 // Merge into previous same-lengths short-replacement record, if any. 162 int u = (oldLength << 12) | (newLength << 9); 163 int last = lastUnit(); 164 if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE && 165 (last & ~SHORT_CHANGE_NUM_MASK) == u && 166 (last & SHORT_CHANGE_NUM_MASK) < SHORT_CHANGE_NUM_MASK) { 167 setLastUnit(last + 1); 168 return; 169 } 170 append(u); 171 return; 172 } 173 174 int head = 0x7000; 175 if (oldLength < LENGTH_IN_1TRAIL && newLength < LENGTH_IN_1TRAIL) { 176 head |= oldLength << 6; 177 head |= newLength; 178 append(head); 179 } else if ((array.length - length) >= 5 || growArray()) { 180 int limit = length + 1; 181 if(oldLength < LENGTH_IN_1TRAIL) { 182 head |= oldLength << 6; 183 } else if(oldLength <= 0x7fff) { 184 head |= LENGTH_IN_1TRAIL << 6; 185 array[limit++] = (char)(0x8000 | oldLength); 186 } else { 187 head |= (LENGTH_IN_2TRAIL + (oldLength >> 30)) << 6; 188 array[limit++] = (char)(0x8000 | (oldLength >> 15)); 189 array[limit++] = (char)(0x8000 | oldLength); 190 } 191 if(newLength < LENGTH_IN_1TRAIL) { 192 head |= newLength; 193 } else if(newLength <= 0x7fff) { 194 head |= LENGTH_IN_1TRAIL; 195 array[limit++] = (char)(0x8000 | newLength); 196 } else { 197 head |= LENGTH_IN_2TRAIL + (newLength >> 30); 198 array[limit++] = (char)(0x8000 | (newLength >> 15)); 199 array[limit++] = (char)(0x8000 | newLength); 200 } 201 array[length] = (char)head; 202 length = limit; 203 } 204 } 205 append(int r)206 private void append(int r) { 207 if(length < array.length || growArray()) { 208 array[length++] = (char)r; 209 } 210 } 211 growArray()212 private boolean growArray() { 213 int newCapacity; 214 if (array.length == STACK_CAPACITY) { 215 newCapacity = 2000; 216 } else if (array.length == Integer.MAX_VALUE) { 217 throw new BufferOverflowException(); 218 } else if (array.length >= (Integer.MAX_VALUE / 2)) { 219 newCapacity = Integer.MAX_VALUE; 220 } else { 221 newCapacity = 2 * array.length; 222 } 223 // Grow by at least 5 units so that a maximal change record will fit. 224 if ((newCapacity - array.length) < 5) { 225 throw new BufferOverflowException(); 226 } 227 array = Arrays.copyOf(array, newCapacity); 228 return true; 229 } 230 231 /** 232 * How much longer is the new text compared with the old text? 233 * @return new length minus old length 234 */ lengthDelta()235 public int lengthDelta() { return delta; } 236 /** 237 * @return true if there are any change edits 238 */ hasChanges()239 public boolean hasChanges() { return numChanges != 0; } 240 241 /** 242 * @return the number of change edits 243 */ numberOfChanges()244 public int numberOfChanges() { return numChanges; } 245 246 /** 247 * Access to the list of edits. 248 * <p> 249 * At any moment in time, an instance of this class points to a single edit: a "window" into a span 250 * of the source string and the corresponding span of the destination string. The source string span 251 * starts at {@link #sourceIndex()} and runs for {@link #oldLength()} chars; the destination string 252 * span starts at {@link #destinationIndex()} and runs for {@link #newLength()} chars. 253 * <p> 254 * The iterator can be moved between edits using the {@link #next()}, {@link #findSourceIndex(int)}, 255 * and {@link #findDestinationIndex(int)} methods. Calling any of these methods mutates the iterator 256 * to make it point to the corresponding edit. 257 * <p> 258 * For more information, see the documentation for {@link Edits}. 259 * <p> 260 * Note: Although this class is called "Iterator", it does not implement {@link java.util.Iterator}. 261 * 262 * @see #getCoarseIterator 263 * @see #getFineIterator 264 */ 265 public static final class Iterator { 266 private final char[] array; 267 private int index; 268 private final int length; 269 /** 270 * 0 if we are not within compressed equal-length changes. 271 * Otherwise the number of remaining changes, including the current one. 272 */ 273 private int remaining; 274 private final boolean onlyChanges_, coarse; 275 276 private int dir; // iteration direction: back(<0), initial(0), forward(>0) 277 private boolean changed; 278 private int oldLength_, newLength_; 279 private int srcIndex, replIndex, destIndex; 280 Iterator(char[] a, int len, boolean oc, boolean crs)281 private Iterator(char[] a, int len, boolean oc, boolean crs) { 282 array = a; 283 length = len; 284 onlyChanges_ = oc; 285 coarse = crs; 286 } 287 readLength(int head)288 private int readLength(int head) { 289 if (head < LENGTH_IN_1TRAIL) { 290 return head; 291 } else if (head < LENGTH_IN_2TRAIL) { 292 assert(index < length); 293 assert(array[index] >= 0x8000); 294 return array[index++] & 0x7fff; 295 } else { 296 assert((index + 2) <= length); 297 assert(array[index] >= 0x8000); 298 assert(array[index + 1] >= 0x8000); 299 int len = ((head & 1) << 30) | 300 ((array[index] & 0x7fff) << 15) | 301 (array[index + 1] & 0x7fff); 302 index += 2; 303 return len; 304 } 305 } 306 updateNextIndexes()307 private void updateNextIndexes() { 308 srcIndex += oldLength_; 309 if (changed) { 310 replIndex += newLength_; 311 } 312 destIndex += newLength_; 313 } 314 updatePreviousIndexes()315 private void updatePreviousIndexes() { 316 srcIndex -= oldLength_; 317 if (changed) { 318 replIndex -= newLength_; 319 } 320 destIndex -= newLength_; 321 } 322 noNext()323 private boolean noNext() { 324 // No change before or beyond the string. 325 dir = 0; 326 changed = false; 327 oldLength_ = newLength_ = 0; 328 return false; 329 } 330 331 /** 332 * Advances the iterator to the next edit. 333 * @return true if there is another edit 334 */ next()335 public boolean next() { 336 return next(onlyChanges_); 337 } 338 next(boolean onlyChanges)339 private boolean next(boolean onlyChanges) { 340 // Forward iteration: Update the string indexes to the limit of the current span, 341 // and post-increment-read array units to assemble a new span. 342 // Leaves the array index one after the last unit of that span. 343 if (dir > 0) { 344 updateNextIndexes(); 345 } else { 346 if (dir < 0) { 347 // Turn around from previous() to next(). 348 // Post-increment-read the same span again. 349 if (remaining > 0) { 350 // Fine-grained iterator: 351 // Stay on the current one of a sequence of compressed changes. 352 ++index; // next() rests on the index after the sequence unit. 353 dir = 1; 354 return true; 355 } 356 } 357 dir = 1; 358 } 359 if (remaining >= 1) { 360 // Fine-grained iterator: Continue a sequence of compressed changes. 361 if (remaining > 1) { 362 --remaining; 363 return true; 364 } 365 remaining = 0; 366 } 367 if (index >= length) { 368 return noNext(); 369 } 370 int u = array[index++]; 371 if (u <= MAX_UNCHANGED) { 372 // Combine adjacent unchanged ranges. 373 changed = false; 374 oldLength_ = u + 1; 375 while (index < length && (u = array[index]) <= MAX_UNCHANGED) { 376 ++index; 377 oldLength_ += u + 1; 378 } 379 newLength_ = oldLength_; 380 if (onlyChanges) { 381 updateNextIndexes(); 382 if (index >= length) { 383 return noNext(); 384 } 385 // already fetched u > MAX_UNCHANGED at index 386 ++index; 387 } else { 388 return true; 389 } 390 } 391 changed = true; 392 if (u <= MAX_SHORT_CHANGE) { 393 int oldLen = u >> 12; 394 int newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH; 395 int num = (u & SHORT_CHANGE_NUM_MASK) + 1; 396 if (coarse) { 397 oldLength_ = num * oldLen; 398 newLength_ = num * newLen; 399 } else { 400 // Split a sequence of changes that was compressed into one unit. 401 oldLength_ = oldLen; 402 newLength_ = newLen; 403 if (num > 1) { 404 remaining = num; // This is the first of two or more changes. 405 } 406 return true; 407 } 408 } else { 409 assert(u <= 0x7fff); 410 oldLength_ = readLength((u >> 6) & 0x3f); 411 newLength_ = readLength(u & 0x3f); 412 if (!coarse) { 413 return true; 414 } 415 } 416 // Combine adjacent changes. 417 while (index < length && (u = array[index]) > MAX_UNCHANGED) { 418 ++index; 419 if (u <= MAX_SHORT_CHANGE) { 420 int num = (u & SHORT_CHANGE_NUM_MASK) + 1; 421 oldLength_ += (u >> 12) * num; 422 newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num; 423 } else { 424 assert(u <= 0x7fff); 425 oldLength_ += readLength((u >> 6) & 0x3f); 426 newLength_ += readLength(u & 0x3f); 427 } 428 } 429 return true; 430 } 431 previous()432 private boolean previous() { 433 // Backward iteration: Pre-decrement-read array units to assemble a new span, 434 // then update the string indexes to the start of that span. 435 // Leaves the array index on the head unit of that span. 436 if (dir >= 0) { 437 if (dir > 0) { 438 // Turn around from next() to previous(). 439 // Set the string indexes to the span limit and 440 // pre-decrement-read the same span again. 441 if (remaining > 0) { 442 // Fine-grained iterator: 443 // Stay on the current one of a sequence of compressed changes. 444 --index; // previous() rests on the sequence unit. 445 dir = -1; 446 return true; 447 } 448 updateNextIndexes(); 449 } 450 dir = -1; 451 } 452 if (remaining > 0) { 453 // Fine-grained iterator: Continue a sequence of compressed changes. 454 int u = array[index]; 455 assert(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE); 456 if (remaining <= (u & SHORT_CHANGE_NUM_MASK)) { 457 ++remaining; 458 updatePreviousIndexes(); 459 return true; 460 } 461 remaining = 0; 462 } 463 if (index <= 0) { 464 return noNext(); 465 } 466 int u = array[--index]; 467 if (u <= MAX_UNCHANGED) { 468 // Combine adjacent unchanged ranges. 469 changed = false; 470 oldLength_ = u + 1; 471 while (index > 0 && (u = array[index - 1]) <= MAX_UNCHANGED) { 472 --index; 473 oldLength_ += u + 1; 474 } 475 newLength_ = oldLength_; 476 // No need to handle onlyChanges as long as previous() is called only from findIndex(). 477 updatePreviousIndexes(); 478 return true; 479 } 480 changed = true; 481 if (u <= MAX_SHORT_CHANGE) { 482 int oldLen = u >> 12; 483 int newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH; 484 int num = (u & SHORT_CHANGE_NUM_MASK) + 1; 485 if (coarse) { 486 oldLength_ = num * oldLen; 487 newLength_ = num * newLen; 488 } else { 489 // Split a sequence of changes that was compressed into one unit. 490 oldLength_ = oldLen; 491 newLength_ = newLen; 492 if (num > 1) { 493 remaining = 1; // This is the last of two or more changes. 494 } 495 updatePreviousIndexes(); 496 return true; 497 } 498 } else { 499 if (u <= 0x7fff) { 500 // The change is encoded in u alone. 501 oldLength_ = readLength((u >> 6) & 0x3f); 502 newLength_ = readLength(u & 0x3f); 503 } else { 504 // Back up to the head of the change, read the lengths, 505 // and reset the index to the head again. 506 assert(index > 0); 507 while ((u = array[--index]) > 0x7fff) {} 508 assert(u > MAX_SHORT_CHANGE); 509 int headIndex = index++; 510 oldLength_ = readLength((u >> 6) & 0x3f); 511 newLength_ = readLength(u & 0x3f); 512 index = headIndex; 513 } 514 if (!coarse) { 515 updatePreviousIndexes(); 516 return true; 517 } 518 } 519 // Combine adjacent changes. 520 while (index > 0 && (u = array[index - 1]) > MAX_UNCHANGED) { 521 --index; 522 if (u <= MAX_SHORT_CHANGE) { 523 int num = (u & SHORT_CHANGE_NUM_MASK) + 1; 524 oldLength_ += (u >> 12) * num; 525 newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num; 526 } else if (u <= 0x7fff) { 527 // Read the lengths, and reset the index to the head again. 528 int headIndex = index++; 529 oldLength_ += readLength((u >> 6) & 0x3f); 530 newLength_ += readLength(u & 0x3f); 531 index = headIndex; 532 } 533 } 534 updatePreviousIndexes(); 535 return true; 536 } 537 538 /** 539 * Moves the iterator to the edit that contains the source index. 540 * The source index may be found in a no-change edit 541 * even if normal iteration would skip no-change edits. 542 * Normal iteration can continue from a found edit. 543 * 544 * <p>The iterator state before this search logically does not matter. 545 * (It may affect the performance of the search.) 546 * 547 * <p>The iterator state after this search is undefined 548 * if the source index is out of bounds for the source string. 549 * 550 * @param i source index 551 * @return true if the edit for the source index was found 552 */ findSourceIndex(int i)553 public boolean findSourceIndex(int i) { 554 return findIndex(i, true) == 0; 555 } 556 557 /** 558 * Moves the iterator to the edit that contains the destination index. 559 * The destination index may be found in a no-change edit 560 * even if normal iteration would skip no-change edits. 561 * Normal iteration can continue from a found edit. 562 * 563 * <p>The iterator state before this search logically does not matter. 564 * (It may affect the performance of the search.) 565 * 566 * <p>The iterator state after this search is undefined 567 * if the source index is out of bounds for the source string. 568 * 569 * @param i destination index 570 * @return true if the edit for the destination index was found 571 */ findDestinationIndex(int i)572 public boolean findDestinationIndex(int i) { 573 return findIndex(i, false) == 0; 574 } 575 576 /** @return -1: error or i<0; 0: found; 1: i>=string length */ findIndex(int i, boolean findSource)577 private int findIndex(int i, boolean findSource) { 578 if (i < 0) { return -1; } 579 int spanStart, spanLength; 580 if (findSource) { // find source index 581 spanStart = srcIndex; 582 spanLength = oldLength_; 583 } else { // find destination index 584 spanStart = destIndex; 585 spanLength = newLength_; 586 } 587 if (i < spanStart) { 588 if (i >= (spanStart / 2)) { 589 // Search backwards. 590 for (;;) { 591 boolean hasPrevious = previous(); 592 assert(hasPrevious); // because i>=0 and the first span starts at 0 593 spanStart = findSource ? srcIndex : destIndex; 594 if (i >= spanStart) { 595 // The index is in the current span. 596 return 0; 597 } 598 if (remaining > 0) { 599 // Is the index in one of the remaining compressed edits? 600 // spanStart is the start of the current span, first of the remaining ones. 601 spanLength = findSource ? oldLength_ : newLength_; 602 int u = array[index]; 603 assert(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE); 604 int num = (u & SHORT_CHANGE_NUM_MASK) + 1 - remaining; 605 int len = num * spanLength; 606 if (i >= (spanStart - len)) { 607 int n = ((spanStart - i - 1) / spanLength) + 1; 608 // 1 <= n <= num 609 srcIndex -= n * oldLength_; 610 replIndex -= n * newLength_; 611 destIndex -= n * newLength_; 612 remaining += n; 613 return 0; 614 } 615 // Skip all of these edits at once. 616 srcIndex -= num * oldLength_; 617 replIndex -= num * newLength_; 618 destIndex -= num * newLength_; 619 remaining = 0; 620 } 621 } 622 } 623 // Reset the iterator to the start. 624 dir = 0; 625 index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0; 626 } else if (i < (spanStart + spanLength)) { 627 // The index is in the current span. 628 return 0; 629 } 630 while (next(false)) { 631 if (findSource) { 632 spanStart = srcIndex; 633 spanLength = oldLength_; 634 } else { 635 spanStart = destIndex; 636 spanLength = newLength_; 637 } 638 if (i < (spanStart + spanLength)) { 639 // The index is in the current span. 640 return 0; 641 } 642 if (remaining > 1) { 643 // Is the index in one of the remaining compressed edits? 644 // spanStart is the start of the current span, first of the remaining ones. 645 int len = remaining * spanLength; 646 if (i < (spanStart + len)) { 647 int n = (i - spanStart) / spanLength; // 1 <= n <= remaining - 1 648 srcIndex += n * oldLength_; 649 replIndex += n * newLength_; 650 destIndex += n * newLength_; 651 remaining -= n; 652 return 0; 653 } 654 // Make next() skip all of these edits at once. 655 oldLength_ *= remaining; 656 newLength_ *= remaining; 657 remaining = 0; 658 } 659 } 660 return 1; 661 } 662 663 /** 664 * Computes the destination index corresponding to the given source index. 665 * If the source index is inside a change edit (not at its start), 666 * then the destination index at the end of that edit is returned, 667 * since there is no information about index mapping inside a change edit. 668 * 669 * <p>(This means that indexes to the start and middle of an edit, 670 * for example around a grapheme cluster, are mapped to indexes 671 * encompassing the entire edit. 672 * The alternative, mapping an interior index to the start, 673 * would map such an interval to an empty one.) 674 * 675 * <p>This operation will usually but not always modify this object. 676 * The iterator state after this search is undefined. 677 * 678 * @param i source index 679 * @return destination index; undefined if i is not 0..string length 680 */ 681 public int destinationIndexFromSourceIndex(int i) { 682 int where = findIndex(i, true); 683 if (where < 0) { 684 // Error or before the string. 685 return 0; 686 } 687 if (where > 0 || i == srcIndex) { 688 // At or after string length, or at start of the found span. 689 return destIndex; 690 } 691 if (changed) { 692 // In a change span, map to its end. 693 return destIndex + newLength_; 694 } else { 695 // In an unchanged span, offset 1:1 within it. 696 return destIndex + (i - srcIndex); 697 } 698 } 699 700 /** 701 * Computes the source index corresponding to the given destination index. 702 * If the destination index is inside a change edit (not at its start), 703 * then the source index at the end of that edit is returned, 704 * since there is no information about index mapping inside a change edit. 705 * 706 * <p>(This means that indexes to the start and middle of an edit, 707 * for example around a grapheme cluster, are mapped to indexes 708 * encompassing the entire edit. 709 * The alternative, mapping an interior index to the start, 710 * would map such an interval to an empty one.) 711 * 712 * <p>This operation will usually but not always modify this object. 713 * The iterator state after this search is undefined. 714 * 715 * @param i destination index 716 * @return source index; undefined if i is not 0..string length 717 */ 718 public int sourceIndexFromDestinationIndex(int i) { 719 int where = findIndex(i, false); 720 if (where < 0) { 721 // Error or before the string. 722 return 0; 723 } 724 if (where > 0 || i == destIndex) { 725 // At or after string length, or at start of the found span. 726 return srcIndex; 727 } 728 if (changed) { 729 // In a change span, map to its end. 730 return srcIndex + oldLength_; 731 } else { 732 // In an unchanged span, offset within it. 733 return srcIndex + (i - destIndex); 734 } 735 } 736 737 /** 738 * Returns whether the edit currently represented by the iterator is a change edit. 739 * 740 * @return true if this edit replaces oldLength() units with newLength() different ones. 741 * false if oldLength units remain unchanged. 742 */ 743 public boolean hasChange() { return changed; } 744 745 /** 746 * The length of the current span in the source string, which starts at {@link #sourceIndex}. 747 * 748 * @return the number of units in the source string which are replaced or remain unchanged. 749 */ 750 public int oldLength() { return oldLength_; } 751 752 /** 753 * The length of the current span in the destination string, which starts at 754 * {@link #destinationIndex}, or in the replacement string, which starts at 755 * {@link #replacementIndex}. 756 * 757 * @return the number of units in the destination string, if hasChange() is true. Same as 758 * oldLength if hasChange() is false. 759 */ 760 public int newLength() { return newLength_; } 761 762 /** 763 * The start index of the current span in the source string; the span has length 764 * {@link #oldLength}. 765 * 766 * @return the current index into the source string 767 */ 768 public int sourceIndex() { return srcIndex; } 769 770 /** 771 * The start index of the current span in the replacement string; the span has length 772 * {@link #newLength}. Well-defined only if the current edit is a change edit. 773 * <p> 774 * The <em>replacement string</em> is the concatenation of all substrings of the destination 775 * string corresponding to change edits. 776 * <p> 777 * This method is intended to be used together with operations that write only replacement 778 * characters (e.g., {@link CaseMap#omitUnchangedText()}). The source string can then be modified 779 * in-place. 780 * 781 * @return the current index into the replacement-characters-only string, not counting unchanged 782 * spans 783 */ 784 public int replacementIndex() { 785 // TODO: Throw an exception if we aren't in a change edit? 786 return replIndex; 787 } 788 789 /** 790 * The start index of the current span in the destination string; the span has length 791 * {@link #newLength}. 792 * 793 * @return the current index into the full destination string 794 */ 795 public int destinationIndex() { return destIndex; } 796 797 /** 798 * A string representation of the current edit represented by the iterator for debugging. You 799 * should not depend on the contents of the return string; it may change over time. 800 * @return a string representation of the object. 801 */ 802 @Override 803 public String toString() { 804 StringBuilder sb = new StringBuilder(); 805 sb.append(super.toString()); 806 sb.append("{ src["); 807 sb.append(srcIndex); 808 sb.append(".."); 809 sb.append(srcIndex + oldLength_); 810 if (changed) { 811 sb.append("] ⇝ dest["); 812 } else { 813 sb.append("] ≡ dest["); 814 } 815 sb.append(destIndex); 816 sb.append(".."); 817 sb.append(destIndex + newLength_); 818 if (changed) { 819 sb.append("], repl["); 820 sb.append(replIndex); 821 sb.append(".."); 822 sb.append(replIndex + newLength_); 823 sb.append("] }"); 824 } else { 825 sb.append("] (no-change) }"); 826 } 827 return sb.toString(); 828 } 829 }; 830 831 /** 832 * Returns an Iterator for coarse-grained change edits 833 * (adjacent change edits are treated as one). 834 * Can be used to perform simple string updates. 835 * Skips no-change edits. 836 * @return an Iterator that merges adjacent changes. 837 */ 838 public Iterator getCoarseChangesIterator() { 839 return new Iterator(array, length, true, true); 840 } 841 842 /** 843 * Returns an Iterator for coarse-grained change and no-change edits 844 * (adjacent change edits are treated as one). 845 * Can be used to perform simple string updates. 846 * Adjacent change edits are treated as one edit. 847 * @return an Iterator that merges adjacent changes. 848 */ 849 public Iterator getCoarseIterator() { 850 return new Iterator(array, length, false, true); 851 } 852 853 /** 854 * Returns an Iterator for fine-grained change edits 855 * (full granularity of change edits is retained). 856 * Can be used for modifying styled text. 857 * Skips no-change edits. 858 * @return an Iterator that separates adjacent changes. 859 */ 860 public Iterator getFineChangesIterator() { 861 return new Iterator(array, length, true, false); 862 } 863 864 /** 865 * Returns an Iterator for fine-grained change and no-change edits 866 * (full granularity of change edits is retained). 867 * Can be used for modifying styled text. 868 * @return an Iterator that separates adjacent changes. 869 */ 870 public Iterator getFineIterator() { 871 return new Iterator(array, length, false, false); 872 } 873 874 /** 875 * Merges the two input Edits and appends the result to this object. 876 * 877 * <p>Consider two string transformations (for example, normalization and case mapping) 878 * where each records Edits in addition to writing an output string.<br> 879 * Edits ab reflect how substrings of input string a 880 * map to substrings of intermediate string b.<br> 881 * Edits bc reflect how substrings of intermediate string b 882 * map to substrings of output string c.<br> 883 * This function merges ab and bc such that the additional edits 884 * recorded in this object reflect how substrings of input string a 885 * map to substrings of output string c. 886 * 887 * <p>If unrelated Edits are passed in where the output string of the first 888 * has a different length than the input string of the second, 889 * then an IllegalArgumentException is thrown. 890 * 891 * @param ab reflects how substrings of input string a 892 * map to substrings of intermediate string b. 893 * @param bc reflects how substrings of intermediate string b 894 * map to substrings of output string c. 895 * @return this, with the merged edits appended 896 */ 897 public Edits mergeAndAppend(Edits ab, Edits bc) { 898 // Picture string a --(Edits ab)--> string b --(Edits bc)--> string c. 899 // Parallel iteration over both Edits. 900 Iterator abIter = ab.getFineIterator(); 901 Iterator bcIter = bc.getFineIterator(); 902 boolean abHasNext = true, bcHasNext = true; 903 // Copy iterator state into local variables, so that we can modify and subdivide spans. 904 // ab old & new length, bc old & new length 905 int aLength = 0, ab_bLength = 0, bc_bLength = 0, cLength = 0; 906 // When we have different-intermediate-length changes, we accumulate a larger change. 907 int pending_aLength = 0, pending_cLength = 0; 908 for (;;) { 909 // At this point, for each of the two iterators: 910 // Either we are done with the locally cached current edit, 911 // and its intermediate-string length has been reset, 912 // or we will continue to work with a truncated remainder of this edit. 913 // 914 // If the current edit is done, and the iterator has not yet reached the end, 915 // then we fetch the next edit. This is true for at least one of the iterators. 916 // 917 // Normally it does not matter whether we fetch from ab and then bc or vice versa. 918 // However, the result is observably different when 919 // ab deletions meet bc insertions at the same intermediate-string index. 920 // Some users expect the bc insertions to come first, so we fetch from bc first. 921 if (bc_bLength == 0) { 922 if (bcHasNext && (bcHasNext = bcIter.next())) { 923 bc_bLength = bcIter.oldLength(); 924 cLength = bcIter.newLength(); 925 if (bc_bLength == 0) { 926 // insertion 927 if (ab_bLength == 0 || !abIter.hasChange()) { 928 addReplace(pending_aLength, pending_cLength + cLength); 929 pending_aLength = pending_cLength = 0; 930 } else { 931 pending_cLength += cLength; 932 } 933 continue; 934 } 935 } 936 // else see if the other iterator is done, too. 937 } 938 if (ab_bLength == 0) { 939 if (abHasNext && (abHasNext = abIter.next())) { 940 aLength = abIter.oldLength(); 941 ab_bLength = abIter.newLength(); 942 if (ab_bLength == 0) { 943 // deletion 944 if (bc_bLength == bcIter.oldLength() || !bcIter.hasChange()) { 945 addReplace(pending_aLength + aLength, pending_cLength); 946 pending_aLength = pending_cLength = 0; 947 } else { 948 pending_aLength += aLength; 949 } 950 continue; 951 } 952 } else if (bc_bLength == 0) { 953 // Both iterators are done at the same time: 954 // The intermediate-string lengths match. 955 break; 956 } else { 957 throw new IllegalArgumentException( 958 "The ab output string is shorter than the bc input string."); 959 } 960 } 961 if (bc_bLength == 0) { 962 throw new IllegalArgumentException( 963 "The bc input string is shorter than the ab output string."); 964 } 965 // Done fetching: ab_bLength > 0 && bc_bLength > 0 966 967 // The current state has two parts: 968 // - Past: We accumulate a longer ac edit in the "pending" variables. 969 // - Current: We have copies of the current ab/bc edits in local variables. 970 // At least one side is newly fetched. 971 // One side might be a truncated remainder of an edit we fetched earlier. 972 973 if (!abIter.hasChange() && !bcIter.hasChange()) { 974 // An unchanged span all the way from string a to string c. 975 if (pending_aLength != 0 || pending_cLength != 0) { 976 addReplace(pending_aLength, pending_cLength); 977 pending_aLength = pending_cLength = 0; 978 } 979 int unchangedLength = aLength <= cLength ? aLength : cLength; 980 addUnchanged(unchangedLength); 981 ab_bLength = aLength -= unchangedLength; 982 bc_bLength = cLength -= unchangedLength; 983 // At least one of the unchanged spans is now empty. 984 continue; 985 } 986 if (!abIter.hasChange() && bcIter.hasChange()) { 987 // Unchanged a->b but changed b->c. 988 if (ab_bLength >= bc_bLength) { 989 // Split the longer unchanged span into change + remainder. 990 addReplace(pending_aLength + bc_bLength, pending_cLength + cLength); 991 pending_aLength = pending_cLength = 0; 992 aLength = ab_bLength -= bc_bLength; 993 bc_bLength = 0; 994 continue; 995 } 996 // Handle the shorter unchanged span below like a change. 997 } else if (abIter.hasChange() && !bcIter.hasChange()) { 998 // Changed a->b and then unchanged b->c. 999 if (ab_bLength <= bc_bLength) { 1000 // Split the longer unchanged span into change + remainder. 1001 addReplace(pending_aLength + aLength, pending_cLength + ab_bLength); 1002 pending_aLength = pending_cLength = 0; 1003 cLength = bc_bLength -= ab_bLength; 1004 ab_bLength = 0; 1005 continue; 1006 } 1007 // Handle the shorter unchanged span below like a change. 1008 } else { // both abIter.hasChange() && bcIter.hasChange() 1009 if (ab_bLength == bc_bLength) { 1010 // Changes on both sides up to the same position. Emit & reset. 1011 addReplace(pending_aLength + aLength, pending_cLength + cLength); 1012 pending_aLength = pending_cLength = 0; 1013 ab_bLength = bc_bLength = 0; 1014 continue; 1015 } 1016 } 1017 // Accumulate the a->c change, reset the shorter side, 1018 // keep a remainder of the longer one. 1019 pending_aLength += aLength; 1020 pending_cLength += cLength; 1021 if (ab_bLength < bc_bLength) { 1022 bc_bLength -= ab_bLength; 1023 cLength = ab_bLength = 0; 1024 } else { // ab_bLength > bc_bLength 1025 ab_bLength -= bc_bLength; 1026 aLength = bc_bLength = 0; 1027 } 1028 } 1029 if (pending_aLength != 0 || pending_cLength != 0) { 1030 addReplace(pending_aLength, pending_cLength); 1031 } 1032 return this; 1033 } 1034 } 1035