1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3
4 // edits.cpp
5 // created: 2017feb08 Markus W. Scherer
6
7 #include "unicode/edits.h"
8 #include "unicode/unistr.h"
9 #include "unicode/utypes.h"
10 #include "cmemory.h"
11 #include "uassert.h"
12 #include "util.h"
13
14 U_NAMESPACE_BEGIN
15
16 namespace {
17
18 // 0000uuuuuuuuuuuu records u+1 unchanged text units.
19 const int32_t MAX_UNCHANGED_LENGTH = 0x1000;
20 const int32_t MAX_UNCHANGED = MAX_UNCHANGED_LENGTH - 1;
21
22 // 0mmmnnnccccccccc with m=1..6 records ccc+1 replacements of m:n text units.
23 const int32_t MAX_SHORT_CHANGE_OLD_LENGTH = 6;
24 const int32_t MAX_SHORT_CHANGE_NEW_LENGTH = 7;
25 const int32_t SHORT_CHANGE_NUM_MASK = 0x1ff;
26 const int32_t MAX_SHORT_CHANGE = 0x6fff;
27
28 // 0111mmmmmmnnnnnn records a replacement of m text units with n.
29 // m or n = 61: actual length follows in the next edits array unit.
30 // m or n = 62..63: actual length follows in the next two edits array units.
31 // Bit 30 of the actual length is in the head unit.
32 // Trailing units have bit 15 set.
33 const int32_t LENGTH_IN_1TRAIL = 61;
34 const int32_t LENGTH_IN_2TRAIL = 62;
35
36 } // namespace
37
releaseArray()38 void Edits::releaseArray() U_NOEXCEPT {
39 if (array != stackArray) {
40 uprv_free(array);
41 }
42 }
43
copyArray(const Edits & other)44 Edits &Edits::copyArray(const Edits &other) {
45 if (U_FAILURE(errorCode_)) {
46 length = delta = numChanges = 0;
47 return *this;
48 }
49 if (length > capacity) {
50 uint16_t *newArray = (uint16_t *)uprv_malloc((size_t)length * 2);
51 if (newArray == nullptr) {
52 length = delta = numChanges = 0;
53 errorCode_ = U_MEMORY_ALLOCATION_ERROR;
54 return *this;
55 }
56 releaseArray();
57 array = newArray;
58 capacity = length;
59 }
60 if (length > 0) {
61 uprv_memcpy(array, other.array, (size_t)length * 2);
62 }
63 return *this;
64 }
65
moveArray(Edits & src)66 Edits &Edits::moveArray(Edits &src) U_NOEXCEPT {
67 if (U_FAILURE(errorCode_)) {
68 length = delta = numChanges = 0;
69 return *this;
70 }
71 releaseArray();
72 if (length > STACK_CAPACITY) {
73 array = src.array;
74 capacity = src.capacity;
75 src.array = src.stackArray;
76 src.capacity = STACK_CAPACITY;
77 src.reset();
78 return *this;
79 }
80 array = stackArray;
81 capacity = STACK_CAPACITY;
82 if (length > 0) {
83 uprv_memcpy(array, src.array, (size_t)length * 2);
84 }
85 return *this;
86 }
87
operator =(const Edits & other)88 Edits &Edits::operator=(const Edits &other) {
89 if (this == &other) { return *this; } // self-assignment: no-op
90 length = other.length;
91 delta = other.delta;
92 numChanges = other.numChanges;
93 errorCode_ = other.errorCode_;
94 return copyArray(other);
95 }
96
operator =(Edits && src)97 Edits &Edits::operator=(Edits &&src) U_NOEXCEPT {
98 length = src.length;
99 delta = src.delta;
100 numChanges = src.numChanges;
101 errorCode_ = src.errorCode_;
102 return moveArray(src);
103 }
104
~Edits()105 Edits::~Edits() {
106 releaseArray();
107 }
108
reset()109 void Edits::reset() U_NOEXCEPT {
110 length = delta = numChanges = 0;
111 errorCode_ = U_ZERO_ERROR;
112 }
113
addUnchanged(int32_t unchangedLength)114 void Edits::addUnchanged(int32_t unchangedLength) {
115 if(U_FAILURE(errorCode_) || unchangedLength == 0) { return; }
116 if(unchangedLength < 0) {
117 errorCode_ = U_ILLEGAL_ARGUMENT_ERROR;
118 return;
119 }
120 // Merge into previous unchanged-text record, if any.
121 int32_t last = lastUnit();
122 if(last < MAX_UNCHANGED) {
123 int32_t remaining = MAX_UNCHANGED - last;
124 if (remaining >= unchangedLength) {
125 setLastUnit(last + unchangedLength);
126 return;
127 }
128 setLastUnit(MAX_UNCHANGED);
129 unchangedLength -= remaining;
130 }
131 // Split large lengths into multiple units.
132 while(unchangedLength >= MAX_UNCHANGED_LENGTH) {
133 append(MAX_UNCHANGED);
134 unchangedLength -= MAX_UNCHANGED_LENGTH;
135 }
136 // Write a small (remaining) length.
137 if(unchangedLength > 0) {
138 append(unchangedLength - 1);
139 }
140 }
141
addReplace(int32_t oldLength,int32_t newLength)142 void Edits::addReplace(int32_t oldLength, int32_t newLength) {
143 if(U_FAILURE(errorCode_)) { return; }
144 if(oldLength < 0 || newLength < 0) {
145 errorCode_ = U_ILLEGAL_ARGUMENT_ERROR;
146 return;
147 }
148 if (oldLength == 0 && newLength == 0) {
149 return;
150 }
151 ++numChanges;
152 int32_t newDelta = newLength - oldLength;
153 if (newDelta != 0) {
154 if ((newDelta > 0 && delta >= 0 && newDelta > (INT32_MAX - delta)) ||
155 (newDelta < 0 && delta < 0 && newDelta < (INT32_MIN - delta))) {
156 // Integer overflow or underflow.
157 errorCode_ = U_INDEX_OUTOFBOUNDS_ERROR;
158 return;
159 }
160 delta += newDelta;
161 }
162
163 if(0 < oldLength && oldLength <= MAX_SHORT_CHANGE_OLD_LENGTH &&
164 newLength <= MAX_SHORT_CHANGE_NEW_LENGTH) {
165 // Merge into previous same-lengths short-replacement record, if any.
166 int32_t u = (oldLength << 12) | (newLength << 9);
167 int32_t last = lastUnit();
168 if(MAX_UNCHANGED < last && last < MAX_SHORT_CHANGE &&
169 (last & ~SHORT_CHANGE_NUM_MASK) == u &&
170 (last & SHORT_CHANGE_NUM_MASK) < SHORT_CHANGE_NUM_MASK) {
171 setLastUnit(last + 1);
172 return;
173 }
174 append(u);
175 return;
176 }
177
178 int32_t head = 0x7000;
179 if (oldLength < LENGTH_IN_1TRAIL && newLength < LENGTH_IN_1TRAIL) {
180 head |= oldLength << 6;
181 head |= newLength;
182 append(head);
183 } else if ((capacity - length) >= 5 || growArray()) {
184 int32_t limit = length + 1;
185 if(oldLength < LENGTH_IN_1TRAIL) {
186 head |= oldLength << 6;
187 } else if(oldLength <= 0x7fff) {
188 head |= LENGTH_IN_1TRAIL << 6;
189 array[limit++] = (uint16_t)(0x8000 | oldLength);
190 } else {
191 head |= (LENGTH_IN_2TRAIL + (oldLength >> 30)) << 6;
192 array[limit++] = (uint16_t)(0x8000 | (oldLength >> 15));
193 array[limit++] = (uint16_t)(0x8000 | oldLength);
194 }
195 if(newLength < LENGTH_IN_1TRAIL) {
196 head |= newLength;
197 } else if(newLength <= 0x7fff) {
198 head |= LENGTH_IN_1TRAIL;
199 array[limit++] = (uint16_t)(0x8000 | newLength);
200 } else {
201 head |= LENGTH_IN_2TRAIL + (newLength >> 30);
202 array[limit++] = (uint16_t)(0x8000 | (newLength >> 15));
203 array[limit++] = (uint16_t)(0x8000 | newLength);
204 }
205 array[length] = (uint16_t)head;
206 length = limit;
207 }
208 }
209
append(int32_t r)210 void Edits::append(int32_t r) {
211 if(length < capacity || growArray()) {
212 array[length++] = (uint16_t)r;
213 }
214 }
215
growArray()216 UBool Edits::growArray() {
217 int32_t newCapacity;
218 if (array == stackArray) {
219 newCapacity = 2000;
220 } else if (capacity == INT32_MAX) {
221 // Not U_BUFFER_OVERFLOW_ERROR because that could be confused on a string transform API
222 // with a result-string-buffer overflow.
223 errorCode_ = U_INDEX_OUTOFBOUNDS_ERROR;
224 return false;
225 } else if (capacity >= (INT32_MAX / 2)) {
226 newCapacity = INT32_MAX;
227 } else {
228 newCapacity = 2 * capacity;
229 }
230 // Grow by at least 5 units so that a maximal change record will fit.
231 if ((newCapacity - capacity) < 5) {
232 errorCode_ = U_INDEX_OUTOFBOUNDS_ERROR;
233 return false;
234 }
235 uint16_t *newArray = (uint16_t *)uprv_malloc((size_t)newCapacity * 2);
236 if (newArray == NULL) {
237 errorCode_ = U_MEMORY_ALLOCATION_ERROR;
238 return false;
239 }
240 uprv_memcpy(newArray, array, (size_t)length * 2);
241 releaseArray();
242 array = newArray;
243 capacity = newCapacity;
244 return true;
245 }
246
copyErrorTo(UErrorCode & outErrorCode) const247 UBool Edits::copyErrorTo(UErrorCode &outErrorCode) const {
248 if (U_FAILURE(outErrorCode)) { return true; }
249 if (U_SUCCESS(errorCode_)) { return false; }
250 outErrorCode = errorCode_;
251 return true;
252 }
253
mergeAndAppend(const Edits & ab,const Edits & bc,UErrorCode & errorCode)254 Edits &Edits::mergeAndAppend(const Edits &ab, const Edits &bc, UErrorCode &errorCode) {
255 if (copyErrorTo(errorCode)) { return *this; }
256 // Picture string a --(Edits ab)--> string b --(Edits bc)--> string c.
257 // Parallel iteration over both Edits.
258 Iterator abIter = ab.getFineIterator();
259 Iterator bcIter = bc.getFineIterator();
260 UBool abHasNext = true, bcHasNext = true;
261 // Copy iterator state into local variables, so that we can modify and subdivide spans.
262 // ab old & new length, bc old & new length
263 int32_t aLength = 0, ab_bLength = 0, bc_bLength = 0, cLength = 0;
264 // When we have different-intermediate-length changes, we accumulate a larger change.
265 int32_t pending_aLength = 0, pending_cLength = 0;
266 for (;;) {
267 // At this point, for each of the two iterators:
268 // Either we are done with the locally cached current edit,
269 // and its intermediate-string length has been reset,
270 // or we will continue to work with a truncated remainder of this edit.
271 //
272 // If the current edit is done, and the iterator has not yet reached the end,
273 // then we fetch the next edit. This is true for at least one of the iterators.
274 //
275 // Normally it does not matter whether we fetch from ab and then bc or vice versa.
276 // However, the result is observably different when
277 // ab deletions meet bc insertions at the same intermediate-string index.
278 // Some users expect the bc insertions to come first, so we fetch from bc first.
279 if (bc_bLength == 0) {
280 if (bcHasNext && (bcHasNext = bcIter.next(errorCode)) != 0) {
281 bc_bLength = bcIter.oldLength();
282 cLength = bcIter.newLength();
283 if (bc_bLength == 0) {
284 // insertion
285 if (ab_bLength == 0 || !abIter.hasChange()) {
286 addReplace(pending_aLength, pending_cLength + cLength);
287 pending_aLength = pending_cLength = 0;
288 } else {
289 pending_cLength += cLength;
290 }
291 continue;
292 }
293 }
294 // else see if the other iterator is done, too.
295 }
296 if (ab_bLength == 0) {
297 if (abHasNext && (abHasNext = abIter.next(errorCode)) != 0) {
298 aLength = abIter.oldLength();
299 ab_bLength = abIter.newLength();
300 if (ab_bLength == 0) {
301 // deletion
302 if (bc_bLength == bcIter.oldLength() || !bcIter.hasChange()) {
303 addReplace(pending_aLength + aLength, pending_cLength);
304 pending_aLength = pending_cLength = 0;
305 } else {
306 pending_aLength += aLength;
307 }
308 continue;
309 }
310 } else if (bc_bLength == 0) {
311 // Both iterators are done at the same time:
312 // The intermediate-string lengths match.
313 break;
314 } else {
315 // The ab output string is shorter than the bc input string.
316 if (!copyErrorTo(errorCode)) {
317 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
318 }
319 return *this;
320 }
321 }
322 if (bc_bLength == 0) {
323 // The bc input string is shorter than the ab output string.
324 if (!copyErrorTo(errorCode)) {
325 errorCode = U_ILLEGAL_ARGUMENT_ERROR;
326 }
327 return *this;
328 }
329 // Done fetching: ab_bLength > 0 && bc_bLength > 0
330
331 // The current state has two parts:
332 // - Past: We accumulate a longer ac edit in the "pending" variables.
333 // - Current: We have copies of the current ab/bc edits in local variables.
334 // At least one side is newly fetched.
335 // One side might be a truncated remainder of an edit we fetched earlier.
336
337 if (!abIter.hasChange() && !bcIter.hasChange()) {
338 // An unchanged span all the way from string a to string c.
339 if (pending_aLength != 0 || pending_cLength != 0) {
340 addReplace(pending_aLength, pending_cLength);
341 pending_aLength = pending_cLength = 0;
342 }
343 int32_t unchangedLength = aLength <= cLength ? aLength : cLength;
344 addUnchanged(unchangedLength);
345 ab_bLength = aLength -= unchangedLength;
346 bc_bLength = cLength -= unchangedLength;
347 // At least one of the unchanged spans is now empty.
348 continue;
349 }
350 if (!abIter.hasChange() && bcIter.hasChange()) {
351 // Unchanged a->b but changed b->c.
352 if (ab_bLength >= bc_bLength) {
353 // Split the longer unchanged span into change + remainder.
354 addReplace(pending_aLength + bc_bLength, pending_cLength + cLength);
355 pending_aLength = pending_cLength = 0;
356 aLength = ab_bLength -= bc_bLength;
357 bc_bLength = 0;
358 continue;
359 }
360 // Handle the shorter unchanged span below like a change.
361 } else if (abIter.hasChange() && !bcIter.hasChange()) {
362 // Changed a->b and then unchanged b->c.
363 if (ab_bLength <= bc_bLength) {
364 // Split the longer unchanged span into change + remainder.
365 addReplace(pending_aLength + aLength, pending_cLength + ab_bLength);
366 pending_aLength = pending_cLength = 0;
367 cLength = bc_bLength -= ab_bLength;
368 ab_bLength = 0;
369 continue;
370 }
371 // Handle the shorter unchanged span below like a change.
372 } else { // both abIter.hasChange() && bcIter.hasChange()
373 if (ab_bLength == bc_bLength) {
374 // Changes on both sides up to the same position. Emit & reset.
375 addReplace(pending_aLength + aLength, pending_cLength + cLength);
376 pending_aLength = pending_cLength = 0;
377 ab_bLength = bc_bLength = 0;
378 continue;
379 }
380 }
381 // Accumulate the a->c change, reset the shorter side,
382 // keep a remainder of the longer one.
383 pending_aLength += aLength;
384 pending_cLength += cLength;
385 if (ab_bLength < bc_bLength) {
386 bc_bLength -= ab_bLength;
387 cLength = ab_bLength = 0;
388 } else { // ab_bLength > bc_bLength
389 ab_bLength -= bc_bLength;
390 aLength = bc_bLength = 0;
391 }
392 }
393 if (pending_aLength != 0 || pending_cLength != 0) {
394 addReplace(pending_aLength, pending_cLength);
395 }
396 copyErrorTo(errorCode);
397 return *this;
398 }
399
Iterator(const uint16_t * a,int32_t len,UBool oc,UBool crs)400 Edits::Iterator::Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs) :
401 array(a), index(0), length(len), remaining(0),
402 onlyChanges_(oc), coarse(crs),
403 dir(0), changed(false), oldLength_(0), newLength_(0),
404 srcIndex(0), replIndex(0), destIndex(0) {}
405
readLength(int32_t head)406 int32_t Edits::Iterator::readLength(int32_t head) {
407 if (head < LENGTH_IN_1TRAIL) {
408 return head;
409 } else if (head < LENGTH_IN_2TRAIL) {
410 U_ASSERT(index < length);
411 U_ASSERT(array[index] >= 0x8000);
412 return array[index++] & 0x7fff;
413 } else {
414 U_ASSERT((index + 2) <= length);
415 U_ASSERT(array[index] >= 0x8000);
416 U_ASSERT(array[index + 1] >= 0x8000);
417 int32_t len = ((head & 1) << 30) |
418 ((int32_t)(array[index] & 0x7fff) << 15) |
419 (array[index + 1] & 0x7fff);
420 index += 2;
421 return len;
422 }
423 }
424
updateNextIndexes()425 void Edits::Iterator::updateNextIndexes() {
426 srcIndex += oldLength_;
427 if (changed) {
428 replIndex += newLength_;
429 }
430 destIndex += newLength_;
431 }
432
updatePreviousIndexes()433 void Edits::Iterator::updatePreviousIndexes() {
434 srcIndex -= oldLength_;
435 if (changed) {
436 replIndex -= newLength_;
437 }
438 destIndex -= newLength_;
439 }
440
noNext()441 UBool Edits::Iterator::noNext() {
442 // No change before or beyond the string.
443 dir = 0;
444 changed = false;
445 oldLength_ = newLength_ = 0;
446 return false;
447 }
448
next(UBool onlyChanges,UErrorCode & errorCode)449 UBool Edits::Iterator::next(UBool onlyChanges, UErrorCode &errorCode) {
450 // Forward iteration: Update the string indexes to the limit of the current span,
451 // and post-increment-read array units to assemble a new span.
452 // Leaves the array index one after the last unit of that span.
453 if (U_FAILURE(errorCode)) { return false; }
454 // We have an errorCode in case we need to start guarding against integer overflows.
455 // It is also convenient for caller loops if we bail out when an error was set elsewhere.
456 if (dir > 0) {
457 updateNextIndexes();
458 } else {
459 if (dir < 0) {
460 // Turn around from previous() to next().
461 // Post-increment-read the same span again.
462 if (remaining > 0) {
463 // Fine-grained iterator:
464 // Stay on the current one of a sequence of compressed changes.
465 ++index; // next() rests on the index after the sequence unit.
466 dir = 1;
467 return true;
468 }
469 }
470 dir = 1;
471 }
472 if (remaining >= 1) {
473 // Fine-grained iterator: Continue a sequence of compressed changes.
474 if (remaining > 1) {
475 --remaining;
476 return true;
477 }
478 remaining = 0;
479 }
480 if (index >= length) {
481 return noNext();
482 }
483 int32_t u = array[index++];
484 if (u <= MAX_UNCHANGED) {
485 // Combine adjacent unchanged ranges.
486 changed = false;
487 oldLength_ = u + 1;
488 while (index < length && (u = array[index]) <= MAX_UNCHANGED) {
489 ++index;
490 oldLength_ += u + 1;
491 }
492 newLength_ = oldLength_;
493 if (onlyChanges) {
494 updateNextIndexes();
495 if (index >= length) {
496 return noNext();
497 }
498 // already fetched u > MAX_UNCHANGED at index
499 ++index;
500 } else {
501 return true;
502 }
503 }
504 changed = true;
505 if (u <= MAX_SHORT_CHANGE) {
506 int32_t oldLen = u >> 12;
507 int32_t newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH;
508 int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;
509 if (coarse) {
510 oldLength_ = num * oldLen;
511 newLength_ = num * newLen;
512 } else {
513 // Split a sequence of changes that was compressed into one unit.
514 oldLength_ = oldLen;
515 newLength_ = newLen;
516 if (num > 1) {
517 remaining = num; // This is the first of two or more changes.
518 }
519 return true;
520 }
521 } else {
522 U_ASSERT(u <= 0x7fff);
523 oldLength_ = readLength((u >> 6) & 0x3f);
524 newLength_ = readLength(u & 0x3f);
525 if (!coarse) {
526 return true;
527 }
528 }
529 // Combine adjacent changes.
530 while (index < length && (u = array[index]) > MAX_UNCHANGED) {
531 ++index;
532 if (u <= MAX_SHORT_CHANGE) {
533 int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;
534 oldLength_ += (u >> 12) * num;
535 newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num;
536 } else {
537 U_ASSERT(u <= 0x7fff);
538 oldLength_ += readLength((u >> 6) & 0x3f);
539 newLength_ += readLength(u & 0x3f);
540 }
541 }
542 return true;
543 }
544
previous(UErrorCode & errorCode)545 UBool Edits::Iterator::previous(UErrorCode &errorCode) {
546 // Backward iteration: Pre-decrement-read array units to assemble a new span,
547 // then update the string indexes to the start of that span.
548 // Leaves the array index on the head unit of that span.
549 if (U_FAILURE(errorCode)) { return false; }
550 // We have an errorCode in case we need to start guarding against integer overflows.
551 // It is also convenient for caller loops if we bail out when an error was set elsewhere.
552 if (dir >= 0) {
553 if (dir > 0) {
554 // Turn around from next() to previous().
555 // Set the string indexes to the span limit and
556 // pre-decrement-read the same span again.
557 if (remaining > 0) {
558 // Fine-grained iterator:
559 // Stay on the current one of a sequence of compressed changes.
560 --index; // previous() rests on the sequence unit.
561 dir = -1;
562 return true;
563 }
564 updateNextIndexes();
565 }
566 dir = -1;
567 }
568 if (remaining > 0) {
569 // Fine-grained iterator: Continue a sequence of compressed changes.
570 int32_t u = array[index];
571 U_ASSERT(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE);
572 if (remaining <= (u & SHORT_CHANGE_NUM_MASK)) {
573 ++remaining;
574 updatePreviousIndexes();
575 return true;
576 }
577 remaining = 0;
578 }
579 if (index <= 0) {
580 return noNext();
581 }
582 int32_t u = array[--index];
583 if (u <= MAX_UNCHANGED) {
584 // Combine adjacent unchanged ranges.
585 changed = false;
586 oldLength_ = u + 1;
587 while (index > 0 && (u = array[index - 1]) <= MAX_UNCHANGED) {
588 --index;
589 oldLength_ += u + 1;
590 }
591 newLength_ = oldLength_;
592 // No need to handle onlyChanges as long as previous() is called only from findIndex().
593 updatePreviousIndexes();
594 return true;
595 }
596 changed = true;
597 if (u <= MAX_SHORT_CHANGE) {
598 int32_t oldLen = u >> 12;
599 int32_t newLen = (u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH;
600 int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;
601 if (coarse) {
602 oldLength_ = num * oldLen;
603 newLength_ = num * newLen;
604 } else {
605 // Split a sequence of changes that was compressed into one unit.
606 oldLength_ = oldLen;
607 newLength_ = newLen;
608 if (num > 1) {
609 remaining = 1; // This is the last of two or more changes.
610 }
611 updatePreviousIndexes();
612 return true;
613 }
614 } else {
615 if (u <= 0x7fff) {
616 // The change is encoded in u alone.
617 oldLength_ = readLength((u >> 6) & 0x3f);
618 newLength_ = readLength(u & 0x3f);
619 } else {
620 // Back up to the head of the change, read the lengths,
621 // and reset the index to the head again.
622 U_ASSERT(index > 0);
623 while ((u = array[--index]) > 0x7fff) {}
624 U_ASSERT(u > MAX_SHORT_CHANGE);
625 int32_t headIndex = index++;
626 oldLength_ = readLength((u >> 6) & 0x3f);
627 newLength_ = readLength(u & 0x3f);
628 index = headIndex;
629 }
630 if (!coarse) {
631 updatePreviousIndexes();
632 return true;
633 }
634 }
635 // Combine adjacent changes.
636 while (index > 0 && (u = array[index - 1]) > MAX_UNCHANGED) {
637 --index;
638 if (u <= MAX_SHORT_CHANGE) {
639 int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1;
640 oldLength_ += (u >> 12) * num;
641 newLength_ += ((u >> 9) & MAX_SHORT_CHANGE_NEW_LENGTH) * num;
642 } else if (u <= 0x7fff) {
643 // Read the lengths, and reset the index to the head again.
644 int32_t headIndex = index++;
645 oldLength_ += readLength((u >> 6) & 0x3f);
646 newLength_ += readLength(u & 0x3f);
647 index = headIndex;
648 }
649 }
650 updatePreviousIndexes();
651 return true;
652 }
653
findIndex(int32_t i,UBool findSource,UErrorCode & errorCode)654 int32_t Edits::Iterator::findIndex(int32_t i, UBool findSource, UErrorCode &errorCode) {
655 if (U_FAILURE(errorCode) || i < 0) { return -1; }
656 int32_t spanStart, spanLength;
657 if (findSource) { // find source index
658 spanStart = srcIndex;
659 spanLength = oldLength_;
660 } else { // find destination index
661 spanStart = destIndex;
662 spanLength = newLength_;
663 }
664 if (i < spanStart) {
665 if (i >= (spanStart / 2)) {
666 // Search backwards.
667 for (;;) {
668 UBool hasPrevious = previous(errorCode);
669 U_ASSERT(hasPrevious); // because i>=0 and the first span starts at 0
670 (void)hasPrevious; // avoid unused-variable warning
671 spanStart = findSource ? srcIndex : destIndex;
672 if (i >= spanStart) {
673 // The index is in the current span.
674 return 0;
675 }
676 if (remaining > 0) {
677 // Is the index in one of the remaining compressed edits?
678 // spanStart is the start of the current span, first of the remaining ones.
679 spanLength = findSource ? oldLength_ : newLength_;
680 int32_t u = array[index];
681 U_ASSERT(MAX_UNCHANGED < u && u <= MAX_SHORT_CHANGE);
682 int32_t num = (u & SHORT_CHANGE_NUM_MASK) + 1 - remaining;
683 int32_t len = num * spanLength;
684 if (i >= (spanStart - len)) {
685 int32_t n = ((spanStart - i - 1) / spanLength) + 1;
686 // 1 <= n <= num
687 srcIndex -= n * oldLength_;
688 replIndex -= n * newLength_;
689 destIndex -= n * newLength_;
690 remaining += n;
691 return 0;
692 }
693 // Skip all of these edits at once.
694 srcIndex -= num * oldLength_;
695 replIndex -= num * newLength_;
696 destIndex -= num * newLength_;
697 remaining = 0;
698 }
699 }
700 }
701 // Reset the iterator to the start.
702 dir = 0;
703 index = remaining = oldLength_ = newLength_ = srcIndex = replIndex = destIndex = 0;
704 } else if (i < (spanStart + spanLength)) {
705 // The index is in the current span.
706 return 0;
707 }
708 while (next(false, errorCode)) {
709 if (findSource) {
710 spanStart = srcIndex;
711 spanLength = oldLength_;
712 } else {
713 spanStart = destIndex;
714 spanLength = newLength_;
715 }
716 if (i < (spanStart + spanLength)) {
717 // The index is in the current span.
718 return 0;
719 }
720 if (remaining > 1) {
721 // Is the index in one of the remaining compressed edits?
722 // spanStart is the start of the current span, first of the remaining ones.
723 int32_t len = remaining * spanLength;
724 if (i < (spanStart + len)) {
725 int32_t n = (i - spanStart) / spanLength; // 1 <= n <= remaining - 1
726 srcIndex += n * oldLength_;
727 replIndex += n * newLength_;
728 destIndex += n * newLength_;
729 remaining -= n;
730 return 0;
731 }
732 // Make next() skip all of these edits at once.
733 oldLength_ *= remaining;
734 newLength_ *= remaining;
735 remaining = 0;
736 }
737 }
738 return 1;
739 }
740
destinationIndexFromSourceIndex(int32_t i,UErrorCode & errorCode)741 int32_t Edits::Iterator::destinationIndexFromSourceIndex(int32_t i, UErrorCode &errorCode) {
742 int32_t where = findIndex(i, true, errorCode);
743 if (where < 0) {
744 // Error or before the string.
745 return 0;
746 }
747 if (where > 0 || i == srcIndex) {
748 // At or after string length, or at start of the found span.
749 return destIndex;
750 }
751 if (changed) {
752 // In a change span, map to its end.
753 return destIndex + newLength_;
754 } else {
755 // In an unchanged span, offset 1:1 within it.
756 return destIndex + (i - srcIndex);
757 }
758 }
759
sourceIndexFromDestinationIndex(int32_t i,UErrorCode & errorCode)760 int32_t Edits::Iterator::sourceIndexFromDestinationIndex(int32_t i, UErrorCode &errorCode) {
761 int32_t where = findIndex(i, false, errorCode);
762 if (where < 0) {
763 // Error or before the string.
764 return 0;
765 }
766 if (where > 0 || i == destIndex) {
767 // At or after string length, or at start of the found span.
768 return srcIndex;
769 }
770 if (changed) {
771 // In a change span, map to its end.
772 return srcIndex + oldLength_;
773 } else {
774 // In an unchanged span, offset within it.
775 return srcIndex + (i - destIndex);
776 }
777 }
778
toString(UnicodeString & sb) const779 UnicodeString& Edits::Iterator::toString(UnicodeString& sb) const {
780 sb.append(u"{ src[", -1);
781 ICU_Utility::appendNumber(sb, srcIndex);
782 sb.append(u"..", -1);
783 ICU_Utility::appendNumber(sb, srcIndex + oldLength_);
784 if (changed) {
785 sb.append(u"] ⇝ dest[", -1);
786 } else {
787 sb.append(u"] ≡ dest[", -1);
788 }
789 ICU_Utility::appendNumber(sb, destIndex);
790 sb.append(u"..", -1);
791 ICU_Utility::appendNumber(sb, destIndex + newLength_);
792 if (changed) {
793 sb.append(u"], repl[", -1);
794 ICU_Utility::appendNumber(sb, replIndex);
795 sb.append(u"..", -1);
796 ICU_Utility::appendNumber(sb, replIndex + newLength_);
797 sb.append(u"] }", -1);
798 } else {
799 sb.append(u"] (no-change) }", -1);
800 }
801 return sb;
802 }
803
804 U_NAMESPACE_END
805