1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * 6 * Copyright (C) 2009-2015, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ****************************************************************************** 10 */ 11 12 package com.ibm.icu.impl; 13 14 import java.util.ArrayList; 15 16 import com.ibm.icu.text.UnicodeSet; 17 import com.ibm.icu.text.UnicodeSet.SpanCondition; 18 import com.ibm.icu.util.OutputInt; 19 20 /* 21 * Implement span() etc. for a set with strings. 22 * Avoid recursion because of its exponential complexity. 23 * Instead, try multiple paths at once and track them with an IndexList. 24 */ 25 public class UnicodeSetStringSpan { 26 27 /* 28 * Which span() variant will be used? The object is either built for one variant and used once, 29 * or built for all and may be used many times. 30 */ 31 public static final int WITH_COUNT = 0x40; // spanAndCount() may be called 32 public static final int FWD = 0x20; 33 public static final int BACK = 0x10; 34 // public static final int UTF16 = 8; 35 public static final int CONTAINED = 2; 36 public static final int NOT_CONTAINED = 1; 37 38 public static final int ALL = 0x7f; 39 40 public static final int FWD_UTF16_CONTAINED = FWD | /* UTF16 | */ CONTAINED; 41 public static final int FWD_UTF16_NOT_CONTAINED = FWD | /* UTF16 | */NOT_CONTAINED; 42 public static final int BACK_UTF16_CONTAINED = BACK | /* UTF16 | */ CONTAINED; 43 public static final int BACK_UTF16_NOT_CONTAINED = BACK | /* UTF16 | */NOT_CONTAINED; 44 45 /** 46 * Special spanLength short values. (since Java has not unsigned byte type) 47 * All code points in the string are contained in the parent set. 48 */ 49 static final short ALL_CP_CONTAINED = 0xff; 50 /** The spanLength is >=0xfe. */ 51 static final short LONG_SPAN = ALL_CP_CONTAINED - 1; 52 53 /** Set for span(). Same as parent but without strings. */ 54 private UnicodeSet spanSet; 55 56 /** 57 * Set for span(not contained). 58 * Same as spanSet, plus characters that start or end strings. 59 */ 60 private UnicodeSet spanNotSet; 61 62 /** The strings of the parent set. */ 63 private ArrayList<String> strings; 64 65 /** The lengths of span(), spanBack() etc. for each string. */ 66 private short[] spanLengths; 67 68 /** Maximum lengths of relevant strings. */ 69 private final int maxLength16; 70 71 /** Are there strings that are not fully contained in the code point set? */ 72 private boolean someRelevant; 73 74 /** Set up for all variants of span()? */ 75 private boolean all; 76 77 /** Span helper */ 78 private OffsetList offsets; 79 80 /** 81 * Constructs for all variants of span(), or only for any one variant. 82 * Initializes as little as possible, for single use. 83 */ UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which)84 public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) { 85 spanSet = new UnicodeSet(0, 0x10ffff); 86 // TODO: With Java 6, just take the parent set's strings as is, 87 // as a NavigableSet<String>, rather than as an ArrayList copy of the set of strings. 88 // Then iterate via the first() and higher() methods. 89 // (We do not want to create multiple Iterator objects in each span().) 90 // See ICU ticket #7454. 91 strings = setStrings; 92 all = (which == ALL); 93 spanSet.retainAll(set); 94 if (0 != (which & NOT_CONTAINED)) { 95 // Default to the same sets. 96 // addToSpanNotSet() will create a separate set if necessary. 97 spanNotSet = spanSet; 98 } 99 offsets = new OffsetList(); 100 101 // Determine if the strings even need to be taken into account at all for span() etc. 102 // If any string is relevant, then all strings need to be used for 103 // span(longest match) but only the relevant ones for span(while contained). 104 // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH 105 // and do not store UTF-8 strings if !thisRelevant and CONTAINED. 106 // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.) 107 // Also count the lengths of the UTF-8 versions of the strings for memory allocation. 108 int stringsLength = strings.size(); 109 110 int i, spanLength; 111 int maxLength16 = 0; 112 someRelevant = false; 113 for (i = 0; i < stringsLength;) { 114 String string = strings.get(i); 115 int length16 = string.length(); 116 if (length16 == 0) { 117 // Remove the empty string. 118 strings.remove(i); 119 --stringsLength; 120 continue; 121 } 122 spanLength = spanSet.span(string, SpanCondition.CONTAINED); 123 if (spanLength < length16) { // Relevant string. 124 someRelevant = true; 125 } 126 if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) { 127 maxLength16 = length16; 128 } 129 ++i; 130 } 131 this.maxLength16 = maxLength16; 132 if (!someRelevant && (which & WITH_COUNT) == 0) { 133 return; 134 } 135 136 // Freeze after checking for the need to use strings at all because freezing 137 // a set takes some time and memory which are wasted if there are no relevant strings. 138 if (all) { 139 spanSet.freeze(); 140 } 141 142 int spanBackLengthsOffset; 143 144 // Allocate a block of meta data. 145 int allocSize; 146 if (all) { 147 // 2 sets of span lengths 148 allocSize = stringsLength * (2); 149 } else { 150 allocSize = stringsLength; // One set of span lengths. 151 } 152 spanLengths = new short[allocSize]; 153 154 if (all) { 155 // Store span lengths for all span() variants. 156 spanBackLengthsOffset = stringsLength; 157 } else { 158 // Store span lengths for only one span() variant. 159 spanBackLengthsOffset = 0; 160 } 161 162 // Set the meta data and spanNotSet and write the UTF-8 strings. 163 164 for (i = 0; i < stringsLength; ++i) { 165 String string = strings.get(i); 166 int length16 = string.length(); 167 spanLength = spanSet.span(string, SpanCondition.CONTAINED); 168 if (spanLength < length16) { // Relevant string. 169 if (true /* 0 != (which & UTF16) */) { 170 if (0 != (which & CONTAINED)) { 171 if (0 != (which & FWD)) { 172 spanLengths[i] = makeSpanLengthByte(spanLength); 173 } 174 if (0 != (which & BACK)) { 175 spanLength = length16 176 - spanSet.spanBack(string, length16, SpanCondition.CONTAINED); 177 spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength); 178 } 179 } else /* not CONTAINED, not all, but NOT_CONTAINED */{ 180 spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant 181 // flag. 182 } 183 } 184 if (0 != (which & NOT_CONTAINED)) { 185 // Add string start and end code points to the spanNotSet so that 186 // a span(while not contained) stops before any string. 187 int c; 188 if (0 != (which & FWD)) { 189 c = string.codePointAt(0); 190 addToSpanNotSet(c); 191 } 192 if (0 != (which & BACK)) { 193 c = string.codePointBefore(length16); 194 addToSpanNotSet(c); 195 } 196 } 197 } else { // Irrelevant string. 198 if (all) { 199 spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED; 200 } else { 201 // All spanXYZLengths pointers contain the same address. 202 spanLengths[i] = ALL_CP_CONTAINED; 203 } 204 } 205 } 206 207 // Finish. 208 if (all) { 209 spanNotSet.freeze(); 210 } 211 } 212 213 /** 214 * Constructs a copy of an existing UnicodeSetStringSpan. 215 * Assumes which==ALL for a frozen set. 216 */ UnicodeSetStringSpan(final UnicodeSetStringSpan otherStringSpan, final ArrayList<String> newParentSetStrings)217 public UnicodeSetStringSpan(final UnicodeSetStringSpan otherStringSpan, 218 final ArrayList<String> newParentSetStrings) { 219 spanSet = otherStringSpan.spanSet; 220 strings = newParentSetStrings; 221 maxLength16 = otherStringSpan.maxLength16; 222 someRelevant = otherStringSpan.someRelevant; 223 all = true; 224 if (Utility.sameObjects(otherStringSpan.spanNotSet, otherStringSpan.spanSet)) { 225 spanNotSet = spanSet; 226 } else { 227 spanNotSet = (UnicodeSet) otherStringSpan.spanNotSet.clone(); 228 } 229 offsets = new OffsetList(); 230 231 spanLengths = otherStringSpan.spanLengths.clone(); 232 } 233 234 /** 235 * Do the strings need to be checked in span() etc.? 236 * 237 * @return true if strings need to be checked (call span() here), 238 * false if not (use a BMPSet for best performance). 239 */ needsStringSpanUTF16()240 public boolean needsStringSpanUTF16() { 241 return someRelevant; 242 } 243 244 /** For fast UnicodeSet::contains(c). */ contains(int c)245 public boolean contains(int c) { 246 return spanSet.contains(c); 247 } 248 249 /** 250 * Adds a starting or ending string character to the spanNotSet 251 * so that a character span ends before any string. 252 */ addToSpanNotSet(int c)253 private void addToSpanNotSet(int c) { 254 if (Utility.sameObjects(spanNotSet, null) || Utility.sameObjects(spanNotSet, spanSet)) { 255 if (spanSet.contains(c)) { 256 return; // Nothing to do. 257 } 258 spanNotSet = spanSet.cloneAsThawed(); 259 } 260 spanNotSet.add(c); 261 } 262 263 /* 264 * Note: In span() when spanLength==0 265 * (after a string match, or at the beginning after an empty code point span) 266 * and in spanNot() and spanNotUTF8(), 267 * string matching could use a binary search because all string matches are done 268 * from the same start index. 269 * 270 * For UTF-8, this would require a comparison function that returns UTF-16 order. 271 * 272 * This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets 273 * with strings have very few very short strings. For cases with many strings, it might be better to use a different 274 * API and implementation with a DFA (state machine). 275 */ 276 277 /* 278 * Algorithm for span(SpanCondition.CONTAINED) 279 * 280 * Theoretical algorithm: 281 * - Iterate through the string, and at each code point boundary: 282 * + If the code point there is in the set, then remember to continue after it. 283 * + If a set string matches at the current position, then remember to continue after it. 284 * + Either recursively span for each code point or string match, or recursively span 285 * for all but the shortest one and iteratively continue the span with the shortest local match. 286 * + Remember the longest recursive span (the farthest end point). 287 * + If there is no match at the current position, 288 * neither for the code point there nor for any set string, 289 * then stop and return the longest recursive span length. 290 * 291 * Optimized implementation: 292 * 293 * (We assume that most sets will have very few very short strings. 294 * A span using a string-less set is extremely fast.) 295 * 296 * Create and cache a spanSet which contains all of the single code points of the original set 297 * but none of its strings. 298 * 299 * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). 300 * - Loop: 301 * + Try to match each set string at the end of the spanLength. 302 * ~ Set strings that start with set-contained code points 303 * must be matched with a partial overlap 304 * because the recursive algorithm would have tried to match them at every position. 305 * ~ Set strings that entirely consist of set-contained code points 306 * are irrelevant for span(SpanCondition.CONTAINED) 307 * because the recursive algorithm would continue after them anyway and 308 * find the longest recursive match from their end. 309 * ~ Rather than recursing, note each end point of a set string match. 310 * + If no set string matched after spanSet.span(), 311 * then return with where the spanSet.span() ended. 312 * + If at least one set string matched after spanSet.span(), 313 * then pop the shortest string match end point and continue the loop, 314 * trying to match all set strings from there. 315 * + If at least one more set string matched after a previous string match, then test if the 316 * code point after the previous string match is also contained in the set. 317 * Continue the loop with the shortest end point of 318 * either this code point or a matching set string. 319 * + If no more set string matched after a previous string match, 320 * then try another spanLength=spanSet.span(SpanCondition.CONTAINED). 321 * Stop if spanLength==0, otherwise continue the loop. 322 * 323 * By noting each end point of a set string match, the function visits each string position at most once and 324 * finishes in linear time. 325 * 326 * The recursive algorithm may visit the same string position many times 327 * if multiple paths lead to it and finishes in exponential time. 328 */ 329 330 /* 331 * Algorithm for span(SIMPLE) 332 * 333 * Theoretical algorithm: 334 * - Iterate through the string, and at each code point boundary: 335 * + If the code point there is in the set, then remember to continue after it. 336 * + If a set string matches at the current position, then remember to continue after it. 337 * + Continue from the farthest match position and ignore all others. 338 * + If there is no match at the current position, then stop and return the current position. 339 * 340 * Optimized implementation: 341 * 342 * (Same assumption and spanSet as above.) 343 * 344 * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). 345 * - Loop: 346 * + Try to match each set string at the end of the spanLength. 347 * ~ Set strings that start with set-contained code points 348 * must be matched with a partial overlap 349 * because the standard algorithm would have tried to match them earlier. 350 * ~ Set strings that entirely consist of set-contained code points 351 * must be matched with a full overlap because the longest-match algorithm 352 * would hide set string matches that end earlier. 353 * Such set strings need not be matched earlier inside the code point span 354 * because the standard algorithm would then have 355 * continued after the set string match anyway. 356 * ~ Remember the longest set string match (farthest end point) 357 * from the earliest starting point. 358 * + If no set string matched after spanSet.span(), 359 * then return with where the spanSet.span() ended. 360 * + If at least one set string matched, 361 * then continue the loop after the longest match from the earliest position. 362 * + If no more set string matched after a previous string match, 363 * then try another spanLength=spanSet.span(SpanCondition.CONTAINED). 364 * Stop if spanLength==0, otherwise continue the loop. 365 */ 366 /** 367 * Spans a string. 368 * 369 * @param s The string to be spanned 370 * @param start The start index that the span begins 371 * @param spanCondition The span condition 372 * @return the limit (exclusive end) of the span 373 */ span(CharSequence s, int start, SpanCondition spanCondition)374 public int span(CharSequence s, int start, SpanCondition spanCondition) { 375 if (spanCondition == SpanCondition.NOT_CONTAINED) { 376 return spanNot(s, start, null); 377 } 378 int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED); 379 if (spanLimit == s.length()) { 380 return spanLimit; 381 } 382 return spanWithStrings(s, start, spanLimit, spanCondition); 383 } 384 385 /** 386 * Synchronized method for complicated spans using the offsets. 387 * Avoids synchronization for simple cases. 388 * 389 * @param spanLimit = spanSet.span(s, start, CONTAINED) 390 */ spanWithStrings(CharSequence s, int start, int spanLimit, SpanCondition spanCondition)391 private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit, 392 SpanCondition spanCondition) { 393 // Consider strings; they may overlap with the span. 394 int initSize = 0; 395 if (spanCondition == SpanCondition.CONTAINED) { 396 // Use offset list to try all possibilities. 397 initSize = maxLength16; 398 } 399 offsets.setMaxLength(initSize); 400 int length = s.length(); 401 int pos = spanLimit, rest = length - spanLimit; 402 int spanLength = spanLimit - start; 403 int i, stringsLength = strings.size(); 404 for (;;) { 405 if (spanCondition == SpanCondition.CONTAINED) { 406 for (i = 0; i < stringsLength; ++i) { 407 int overlap = spanLengths[i]; 408 if (overlap == ALL_CP_CONTAINED) { 409 continue; // Irrelevant string. 410 } 411 String string = strings.get(i); 412 413 int length16 = string.length(); 414 415 // Try to match this string at pos-overlap..pos. 416 if (overlap >= LONG_SPAN) { 417 overlap = length16; 418 // While contained: No point matching fully inside the code point span. 419 overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code 420 // point. 421 } 422 if (overlap > spanLength) { 423 overlap = spanLength; 424 } 425 int inc = length16 - overlap; // Keep overlap+inc==length16. 426 for (;;) { 427 if (inc > rest) { 428 break; 429 } 430 // Try to match if the increment is not listed already. 431 if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) { 432 if (inc == rest) { 433 return length; // Reached the end of the string. 434 } 435 offsets.addOffset(inc); 436 } 437 if (overlap == 0) { 438 break; 439 } 440 --overlap; 441 ++inc; 442 } 443 } 444 } else /* SIMPLE */{ 445 int maxInc = 0, maxOverlap = 0; 446 for (i = 0; i < stringsLength; ++i) { 447 int overlap = spanLengths[i]; 448 // For longest match, we do need to try to match even an all-contained string 449 // to find the match from the earliest start. 450 451 String string = strings.get(i); 452 453 int length16 = string.length(); 454 455 // Try to match this string at pos-overlap..pos. 456 if (overlap >= LONG_SPAN) { 457 overlap = length16; 458 // Longest match: Need to match fully inside the code point span 459 // to find the match from the earliest start. 460 } 461 if (overlap > spanLength) { 462 overlap = spanLength; 463 } 464 int inc = length16 - overlap; // Keep overlap+inc==length16. 465 for (;;) { 466 if (inc > rest || overlap < maxOverlap) { 467 break; 468 } 469 // Try to match if the string is longer or starts earlier. 470 if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc) 471 && matches16CPB(s, pos - overlap, length, string, length16)) { 472 maxInc = inc; // Longest match from earliest start. 473 maxOverlap = overlap; 474 break; 475 } 476 --overlap; 477 ++inc; 478 } 479 } 480 481 if (maxInc != 0 || maxOverlap != 0) { 482 // Longest-match algorithm, and there was a string match. 483 // Simply continue after it. 484 pos += maxInc; 485 rest -= maxInc; 486 if (rest == 0) { 487 return length; // Reached the end of the string. 488 } 489 spanLength = 0; // Match strings from after a string match. 490 continue; 491 } 492 } 493 // Finished trying to match all strings at pos. 494 495 if (spanLength != 0 || pos == 0) { 496 // The position is after an unlimited code point span (spanLength!=0), 497 // not after a string match. 498 // The only position where spanLength==0 after a span is pos==0. 499 // Otherwise, an unlimited code point span is only tried again when no 500 // strings match, and if such a non-initial span fails we stop. 501 if (offsets.isEmpty()) { 502 return pos; // No strings matched after a span. 503 } 504 // Match strings from after the next string match. 505 } else { 506 // The position is after a string match (or a single code point). 507 if (offsets.isEmpty()) { 508 // No more strings matched after a previous string match. 509 // Try another code point span from after the last string match. 510 spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED); 511 spanLength = spanLimit - pos; 512 if (spanLength == rest || // Reached the end of the string, or 513 spanLength == 0 // neither strings nor span progressed. 514 ) { 515 return spanLimit; 516 } 517 pos += spanLength; 518 rest -= spanLength; 519 continue; // spanLength>0: Match strings from after a span. 520 } else { 521 // Try to match only one code point from after a string match if some 522 // string matched beyond it, so that we try all possible positions 523 // and don't overshoot. 524 spanLength = spanOne(spanSet, s, pos, rest); 525 if (spanLength > 0) { 526 if (spanLength == rest) { 527 return length; // Reached the end of the string. 528 } 529 // Match strings after this code point. 530 // There cannot be any increments below it because UnicodeSet strings 531 // contain multiple code points. 532 pos += spanLength; 533 rest -= spanLength; 534 offsets.shift(spanLength); 535 spanLength = 0; 536 continue; // Match strings from after a single code point. 537 } 538 // Match strings from after the next string match. 539 } 540 } 541 int minOffset = offsets.popMinimum(null); 542 pos += minOffset; 543 rest -= minOffset; 544 spanLength = 0; // Match strings from after a string match. 545 } 546 } 547 548 /** 549 * Spans a string and counts the smallest number of set elements on any path across the span. 550 * 551 * <p>For proper counting, we cannot ignore strings that are fully contained in code point spans. 552 * 553 * <p>If the set does not have any fully-contained strings, then we could optimize this 554 * like span(), but such sets are likely rare, and this is at least still linear. 555 * 556 * @param s The string to be spanned 557 * @param start The start index that the span begins 558 * @param spanCondition The span condition 559 * @param outCount The count 560 * @return the limit (exclusive end) of the span 561 */ spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount)562 public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, 563 OutputInt outCount) { 564 if (spanCondition == SpanCondition.NOT_CONTAINED) { 565 return spanNot(s, start, outCount); 566 } 567 // Consider strings; they may overlap with the span, 568 // and they may result in a smaller count that with just code points. 569 if (spanCondition == SpanCondition.CONTAINED) { 570 return spanContainedAndCount(s, start, outCount); 571 } 572 // SIMPLE (not synchronized, does not use offsets) 573 int stringsLength = strings.size(); 574 int length = s.length(); 575 int pos = start; 576 int rest = length - start; 577 int count = 0; 578 while (rest != 0) { 579 // Try to match the next code point. 580 int cpLength = spanOne(spanSet, s, pos, rest); 581 int maxInc = (cpLength > 0) ? cpLength : 0; 582 // Try to match all of the strings. 583 for (int i = 0; i < stringsLength; ++i) { 584 String string = strings.get(i); 585 int length16 = string.length(); 586 if (maxInc < length16 && length16 <= rest && 587 matches16CPB(s, pos, length, string, length16)) { 588 maxInc = length16; 589 } 590 } 591 // We are done if there is no match beyond pos. 592 if (maxInc == 0) { 593 outCount.value = count; 594 return pos; 595 } 596 // Continue from the longest match. 597 ++count; 598 pos += maxInc; 599 rest -= maxInc; 600 } 601 outCount.value = count; 602 return pos; 603 } 604 spanContainedAndCount(CharSequence s, int start, OutputInt outCount)605 private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) { 606 // Use offset list to try all possibilities. 607 offsets.setMaxLength(maxLength16); 608 int stringsLength = strings.size(); 609 int length = s.length(); 610 int pos = start; 611 int rest = length - start; 612 int count = 0; 613 while (rest != 0) { 614 // Try to match the next code point. 615 int cpLength = spanOne(spanSet, s, pos, rest); 616 if (cpLength > 0) { 617 offsets.addOffsetAndCount(cpLength, count + 1); 618 } 619 // Try to match all of the strings. 620 for (int i = 0; i < stringsLength; ++i) { 621 String string = strings.get(i); 622 int length16 = string.length(); 623 // Note: If the strings were sorted by length, then we could also 624 // avoid trying to match if there is already a match of the same length. 625 if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) && 626 matches16CPB(s, pos, length, string, length16)) { 627 offsets.addOffsetAndCount(length16, count + 1); 628 } 629 } 630 // We are done if there is no match beyond pos. 631 if (offsets.isEmpty()) { 632 outCount.value = count; 633 return pos; 634 } 635 // Continue from the nearest match. 636 int minOffset = offsets.popMinimum(outCount); 637 count = outCount.value; 638 pos += minOffset; 639 rest -= minOffset; 640 } 641 outCount.value = count; 642 return pos; 643 } 644 645 /** 646 * Span a string backwards. 647 * 648 * @param s The string to be spanned 649 * @param spanCondition The span condition 650 * @return The string index which starts the span (i.e. inclusive). 651 */ spanBack(CharSequence s, int length, SpanCondition spanCondition)652 public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) { 653 if (spanCondition == SpanCondition.NOT_CONTAINED) { 654 return spanNotBack(s, length); 655 } 656 int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED); 657 if (pos == 0) { 658 return 0; 659 } 660 int spanLength = length - pos; 661 662 // Consider strings; they may overlap with the span. 663 int initSize = 0; 664 if (spanCondition == SpanCondition.CONTAINED) { 665 // Use offset list to try all possibilities. 666 initSize = maxLength16; 667 } 668 offsets.setMaxLength(initSize); 669 int i, stringsLength = strings.size(); 670 int spanBackLengthsOffset = 0; 671 if (all) { 672 spanBackLengthsOffset = stringsLength; 673 } 674 for (;;) { 675 if (spanCondition == SpanCondition.CONTAINED) { 676 for (i = 0; i < stringsLength; ++i) { 677 int overlap = spanLengths[spanBackLengthsOffset + i]; 678 if (overlap == ALL_CP_CONTAINED) { 679 continue; // Irrelevant string. 680 } 681 String string = strings.get(i); 682 683 int length16 = string.length(); 684 685 // Try to match this string at pos-(length16-overlap)..pos-length16. 686 if (overlap >= LONG_SPAN) { 687 overlap = length16; 688 // While contained: No point matching fully inside the code point span. 689 int len1 = 0; 690 len1 = string.offsetByCodePoints(0, 1); 691 overlap -= len1; // Length of the string minus the first code point. 692 } 693 if (overlap > spanLength) { 694 overlap = spanLength; 695 } 696 int dec = length16 - overlap; // Keep dec+overlap==length16. 697 for (;;) { 698 if (dec > pos) { 699 break; 700 } 701 // Try to match if the decrement is not listed already. 702 if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) { 703 if (dec == pos) { 704 return 0; // Reached the start of the string. 705 } 706 offsets.addOffset(dec); 707 } 708 if (overlap == 0) { 709 break; 710 } 711 --overlap; 712 ++dec; 713 } 714 } 715 } else /* SIMPLE */{ 716 int maxDec = 0, maxOverlap = 0; 717 for (i = 0; i < stringsLength; ++i) { 718 int overlap = spanLengths[spanBackLengthsOffset + i]; 719 // For longest match, we do need to try to match even an all-contained string 720 // to find the match from the latest end. 721 722 String string = strings.get(i); 723 724 int length16 = string.length(); 725 726 // Try to match this string at pos-(length16-overlap)..pos-length16. 727 if (overlap >= LONG_SPAN) { 728 overlap = length16; 729 // Longest match: Need to match fully inside the code point span 730 // to find the match from the latest end. 731 } 732 if (overlap > spanLength) { 733 overlap = spanLength; 734 } 735 int dec = length16 - overlap; // Keep dec+overlap==length16. 736 for (;;) { 737 if (dec > pos || overlap < maxOverlap) { 738 break; 739 } 740 // Try to match if the string is longer or ends later. 741 if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec) 742 && matches16CPB(s, pos - dec, length, string, length16)) { 743 maxDec = dec; // Longest match from latest end. 744 maxOverlap = overlap; 745 break; 746 } 747 --overlap; 748 ++dec; 749 } 750 } 751 752 if (maxDec != 0 || maxOverlap != 0) { 753 // Longest-match algorithm, and there was a string match. 754 // Simply continue before it. 755 pos -= maxDec; 756 if (pos == 0) { 757 return 0; // Reached the start of the string. 758 } 759 spanLength = 0; // Match strings from before a string match. 760 continue; 761 } 762 } 763 // Finished trying to match all strings at pos. 764 765 if (spanLength != 0 || pos == length) { 766 // The position is before an unlimited code point span (spanLength!=0), 767 // not before a string match. 768 // The only position where spanLength==0 before a span is pos==length. 769 // Otherwise, an unlimited code point span is only tried again when no 770 // strings match, and if such a non-initial span fails we stop. 771 if (offsets.isEmpty()) { 772 return pos; // No strings matched before a span. 773 } 774 // Match strings from before the next string match. 775 } else { 776 // The position is before a string match (or a single code point). 777 if (offsets.isEmpty()) { 778 // No more strings matched before a previous string match. 779 // Try another code point span from before the last string match. 780 int oldPos = pos; 781 pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED); 782 spanLength = oldPos - pos; 783 if (pos == 0 || // Reached the start of the string, or 784 spanLength == 0 // neither strings nor span progressed. 785 ) { 786 return pos; 787 } 788 continue; // spanLength>0: Match strings from before a span. 789 } else { 790 // Try to match only one code point from before a string match if some 791 // string matched beyond it, so that we try all possible positions 792 // and don't overshoot. 793 spanLength = spanOneBack(spanSet, s, pos); 794 if (spanLength > 0) { 795 if (spanLength == pos) { 796 return 0; // Reached the start of the string. 797 } 798 // Match strings before this code point. 799 // There cannot be any decrements below it because UnicodeSet strings 800 // contain multiple code points. 801 pos -= spanLength; 802 offsets.shift(spanLength); 803 spanLength = 0; 804 continue; // Match strings from before a single code point. 805 } 806 // Match strings from before the next string match. 807 } 808 } 809 pos -= offsets.popMinimum(null); 810 spanLength = 0; // Match strings from before a string match. 811 } 812 } 813 814 /** 815 * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED) 816 * 817 * Theoretical algorithm: 818 * - Iterate through the string, and at each code point boundary: 819 * + If the code point there is in the set, then return with the current position. 820 * + If a set string matches at the current position, then return with the current position. 821 * 822 * Optimized implementation: 823 * 824 * (Same assumption as for span() above.) 825 * 826 * Create and cache a spanNotSet which contains 827 * all of the single code points of the original set but none of its strings. 828 * For each set string add its initial code point to the spanNotSet. 829 * (Also add its final code point for spanNotBack().) 830 * 831 * - Loop: 832 * + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED). 833 * + If the current code point is in the original set, then return the current position. 834 * + If any set string matches at the current position, then return the current position. 835 * + If there is no match at the current position, neither for the code point 836 * there nor for any set string, then skip this code point and continue the loop. 837 * This happens for set-string-initial code points that were added to spanNotSet 838 * when there is not actually a match for such a set string. 839 * 840 * @param s The string to be spanned 841 * @param start The start index that the span begins 842 * @param outCount If not null: Receives the number of code points across the span. 843 * @return the limit (exclusive end) of the span 844 */ spanNot(CharSequence s, int start, OutputInt outCount)845 private int spanNot(CharSequence s, int start, OutputInt outCount) { 846 int length = s.length(); 847 int pos = start, rest = length - start; 848 int stringsLength = strings.size(); 849 int count = 0; 850 do { 851 // Span until we find a code point from the set, 852 // or a code point that starts or ends some string. 853 int spanLimit; 854 if (outCount == null) { 855 spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED); 856 } else { 857 spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount); 858 outCount.value = count = count + outCount.value; 859 } 860 if (spanLimit == length) { 861 return length; // Reached the end of the string. 862 } 863 pos = spanLimit; 864 rest = length - spanLimit; 865 866 // Check whether the current code point is in the original set, 867 // without the string starts and ends. 868 int cpLength = spanOne(spanSet, s, pos, rest); 869 if (cpLength > 0) { 870 return pos; // There is a set element at pos. 871 } 872 873 // Try to match the strings at pos. 874 for (int i = 0; i < stringsLength; ++i) { 875 if (spanLengths[i] == ALL_CP_CONTAINED) { 876 continue; // Irrelevant string. 877 } 878 String string = strings.get(i); 879 880 int length16 = string.length(); 881 if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) { 882 return pos; // There is a set element at pos. 883 } 884 } 885 886 // The span(while not contained) ended on a string start/end which is 887 // not in the original set. Skip this code point and continue. 888 // cpLength<0 889 pos -= cpLength; 890 rest += cpLength; 891 ++count; 892 } while (rest != 0); 893 if (outCount != null) { 894 outCount.value = count; 895 } 896 return length; // Reached the end of the string. 897 } 898 spanNotBack(CharSequence s, int length)899 private int spanNotBack(CharSequence s, int length) { 900 int pos = length; 901 int i, stringsLength = strings.size(); 902 do { 903 // Span until we find a code point from the set, 904 // or a code point that starts or ends some string. 905 pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED); 906 if (pos == 0) { 907 return 0; // Reached the start of the string. 908 } 909 910 // Check whether the current code point is in the original set, 911 // without the string starts and ends. 912 int cpLength = spanOneBack(spanSet, s, pos); 913 if (cpLength > 0) { 914 return pos; // There is a set element at pos. 915 } 916 917 // Try to match the strings at pos. 918 for (i = 0; i < stringsLength; ++i) { 919 // Use spanLengths rather than a spanLengths pointer because 920 // it is easier and we only need to know whether the string is irrelevant 921 // which is the same in either array. 922 if (spanLengths[i] == ALL_CP_CONTAINED) { 923 continue; // Irrelevant string. 924 } 925 String string = strings.get(i); 926 927 int length16 = string.length(); 928 if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) { 929 return pos; // There is a set element at pos. 930 } 931 } 932 933 // The span(while not contained) ended on a string start/end which is 934 // not in the original set. Skip this code point and continue. 935 // cpLength<0 936 pos += cpLength; 937 } while (pos != 0); 938 return 0; // Reached the start of the string. 939 } 940 makeSpanLengthByte(int spanLength)941 static short makeSpanLengthByte(int spanLength) { 942 // 0xfe==UnicodeSetStringSpan::LONG_SPAN 943 return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN; 944 } 945 946 // Compare strings without any argument checks. Requires length>0. matches16(CharSequence s, int start, final String t, int length)947 private static boolean matches16(CharSequence s, int start, final String t, int length) { 948 int end = start + length; 949 while (length-- > 0) { 950 if (s.charAt(--end) != t.charAt(length)) { 951 return false; 952 } 953 } 954 return true; 955 } 956 957 /** 958 * Compare 16-bit Unicode strings (which may be malformed UTF-16) 959 * at code point boundaries. 960 * That is, each edge of a match must not be in the middle of a surrogate pair. 961 * @param s The string to match in. 962 * @param start The start index of s. 963 * @param limit The limit of the subsequence of s being spanned. 964 * @param t The substring to be matched in s. 965 * @param tlength The length of t. 966 */ matches16CPB(CharSequence s, int start, int limit, final String t, int tlength)967 static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) { 968 return matches16(s, start, t, tlength) 969 && !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) && 970 Character.isLowSurrogate(s.charAt(start))) 971 && !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) && 972 Character.isLowSurrogate(s.charAt(start + tlength))); 973 } 974 975 /** 976 * Does the set contain the next code point? 977 * If so, return its length; otherwise return its negative length. 978 */ spanOne(final UnicodeSet set, CharSequence s, int start, int length)979 static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) { 980 char c = s.charAt(start); 981 if (c >= 0xd800 && c <= 0xdbff && length >= 2) { 982 char c2 = s.charAt(start + 1); 983 if (com.ibm.icu.text.UTF16.isTrailSurrogate(c2)) { 984 int supplementary = Character.toCodePoint(c, c2); 985 return set.contains(supplementary) ? 2 : -2; 986 } 987 } 988 return set.contains(c) ? 1 : -1; 989 } 990 spanOneBack(final UnicodeSet set, CharSequence s, int length)991 static int spanOneBack(final UnicodeSet set, CharSequence s, int length) { 992 char c = s.charAt(length - 1); 993 if (c >= 0xdc00 && c <= 0xdfff && length >= 2) { 994 char c2 = s.charAt(length - 2); 995 if (com.ibm.icu.text.UTF16.isLeadSurrogate(c2)) { 996 int supplementary = Character.toCodePoint(c2, c); 997 return set.contains(supplementary) ? 2 : -2; 998 } 999 } 1000 return set.contains(c) ? 1 : -1; 1001 } 1002 1003 /** 1004 * Helper class for UnicodeSetStringSpan. 1005 * 1006 * <p>List of offsets from the current position from where to try matching 1007 * a code point or a string. 1008 * Stores offsets rather than indexes to simplify the code and use the same list 1009 * for both increments (in span()) and decrements (in spanBack()). 1010 * 1011 * <p>Assumption: The maximum offset is limited, and the offsets that are stored at any one time 1012 * are relatively dense, that is, 1013 * there are normally no gaps of hundreds or thousands of offset values. 1014 * 1015 * <p>This class optionally also tracks the minimum non-negative count for each position, 1016 * intended to count the smallest number of elements of any path leading to that position. 1017 * 1018 * <p>The implementation uses a circular buffer of count integers, 1019 * each indicating whether the corresponding offset is in the list, 1020 * and its path element count. 1021 * This avoids inserting into a sorted list of offsets (or absolute indexes) 1022 * and physically moving part of the list. 1023 * 1024 * <p>Note: In principle, the caller should setMaxLength() to 1025 * the maximum of the max string length and U16_LENGTH/U8_LENGTH 1026 * to account for "long" single code points. 1027 * 1028 * <p>Note: An earlier version did not track counts and stored only byte flags. 1029 * With boolean flags, if maxLength were guaranteed to be no more than 32 or 64, 1030 * the list could be stored as bit flags in a single integer. 1031 * Rather than handling a circular buffer with a start list index, 1032 * the integer would simply be shifted when lower offsets are removed. 1033 * UnicodeSet does not have a limit on the lengths of strings. 1034 */ 1035 private static final class OffsetList { 1036 private int[] list; 1037 private int length; 1038 private int start; 1039 OffsetList()1040 public OffsetList() { 1041 list = new int[16]; // default size 1042 } 1043 setMaxLength(int maxLength)1044 public void setMaxLength(int maxLength) { 1045 if (maxLength > list.length) { 1046 list = new int[maxLength]; 1047 } 1048 clear(); 1049 } 1050 clear()1051 public void clear() { 1052 for (int i = list.length; i-- > 0;) { 1053 list[i] = 0; 1054 } 1055 start = length = 0; 1056 } 1057 isEmpty()1058 public boolean isEmpty() { 1059 return (length == 0); 1060 } 1061 1062 /** 1063 * Reduces all stored offsets by delta, used when the current position moves by delta. 1064 * There must not be any offsets lower than delta. 1065 * If there is an offset equal to delta, it is removed. 1066 * 1067 * @param delta [1..maxLength] 1068 */ shift(int delta)1069 public void shift(int delta) { 1070 int i = start + delta; 1071 if (i >= list.length) { 1072 i -= list.length; 1073 } 1074 if (list[i] != 0) { 1075 list[i] = 0; 1076 --length; 1077 } 1078 start = i; 1079 } 1080 1081 /** 1082 * Adds an offset. The list must not contain it yet. 1083 * @param offset [1..maxLength] 1084 */ addOffset(int offset)1085 public void addOffset(int offset) { 1086 int i = start + offset; 1087 if (i >= list.length) { 1088 i -= list.length; 1089 } 1090 assert list[i] == 0; 1091 list[i] = 1; 1092 ++length; 1093 } 1094 1095 /** 1096 * Adds an offset and updates its count. 1097 * The list may already contain the offset. 1098 * @param offset [1..maxLength] 1099 */ addOffsetAndCount(int offset, int count)1100 public void addOffsetAndCount(int offset, int count) { 1101 assert count > 0; 1102 int i = start + offset; 1103 if (i >= list.length) { 1104 i -= list.length; 1105 } 1106 if (list[i] == 0) { 1107 list[i] = count; 1108 ++length; 1109 } else if (count < list[i]) { 1110 list[i] = count; 1111 } 1112 } 1113 1114 /** 1115 * @param offset [1..maxLength] 1116 */ containsOffset(int offset)1117 public boolean containsOffset(int offset) { 1118 int i = start + offset; 1119 if (i >= list.length) { 1120 i -= list.length; 1121 } 1122 return list[i] != 0; 1123 } 1124 1125 /** 1126 * @param offset [1..maxLength] 1127 */ hasCountAtOffset(int offset, int count)1128 public boolean hasCountAtOffset(int offset, int count) { 1129 int i = start + offset; 1130 if (i >= list.length) { 1131 i -= list.length; 1132 } 1133 int oldCount = list[i]; 1134 return oldCount != 0 && oldCount <= count; 1135 } 1136 1137 /** 1138 * Finds the lowest stored offset from a non-empty list, removes it, 1139 * and reduces all other offsets by this minimum. 1140 * @return min=[1..maxLength] 1141 */ popMinimum(OutputInt outCount)1142 public int popMinimum(OutputInt outCount) { 1143 // Look for the next offset in list[start+1..list.length-1]. 1144 int i = start, result; 1145 while (++i < list.length) { 1146 int count = list[i]; 1147 if (count != 0) { 1148 list[i] = 0; 1149 --length; 1150 result = i - start; 1151 start = i; 1152 if (outCount != null) { outCount.value = count; } 1153 return result; 1154 } 1155 } 1156 // i==list.length 1157 1158 // Wrap around and look for the next offset in list[0..start]. 1159 // Since the list is not empty, there will be one. 1160 result = list.length - start; 1161 i = 0; 1162 int count; 1163 while ((count = list[i]) == 0) { 1164 ++i; 1165 } 1166 list[i] = 0; 1167 --length; 1168 start = i; 1169 if (outCount != null) { outCount.value = count; } 1170 return result + i; 1171 } 1172 } 1173 } 1174