1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ****************************************************************************** 6 * 7 * Copyright (C) 2009-2015, International Business Machines 8 * Corporation and others. All Rights Reserved. 9 * 10 ****************************************************************************** 11 */ 12 13 package ohos.global.icu.impl; 14 15 import java.util.ArrayList; 16 17 import ohos.global.icu.text.UnicodeSet; 18 import ohos.global.icu.text.UnicodeSet.SpanCondition; 19 import ohos.global.icu.util.OutputInt; 20 21 /* 22 * Implement span() etc. for a set with strings. 23 * Avoid recursion because of its exponential complexity. 24 * Instead, try multiple paths at once and track them with an IndexList. 25 */ 26 /** 27 * @hide exposed on OHOS 28 */ 29 public class UnicodeSetStringSpan { 30 31 /* 32 * Which span() variant will be used? The object is either built for one variant and used once, 33 * or built for all and may be used many times. 34 */ 35 public static final int WITH_COUNT = 0x40; // spanAndCount() may be called 36 public static final int FWD = 0x20; 37 public static final int BACK = 0x10; 38 // public static final int UTF16 = 8; 39 public static final int CONTAINED = 2; 40 public static final int NOT_CONTAINED = 1; 41 42 public static final int ALL = 0x7f; 43 44 public static final int FWD_UTF16_CONTAINED = FWD | /* UTF16 | */ CONTAINED; 45 public static final int FWD_UTF16_NOT_CONTAINED = FWD | /* UTF16 | */NOT_CONTAINED; 46 public static final int BACK_UTF16_CONTAINED = BACK | /* UTF16 | */ CONTAINED; 47 public static final int BACK_UTF16_NOT_CONTAINED = BACK | /* UTF16 | */NOT_CONTAINED; 48 49 /** 50 * Special spanLength short values. (since Java has not unsigned byte type) 51 * All code points in the string are contained in the parent set. 52 */ 53 static final short ALL_CP_CONTAINED = 0xff; 54 /** The spanLength is >=0xfe. */ 55 static final short LONG_SPAN = ALL_CP_CONTAINED - 1; 56 57 /** Set for span(). Same as parent but without strings. */ 58 private UnicodeSet spanSet; 59 60 /** 61 * Set for span(not contained). 62 * Same as spanSet, plus characters that start or end strings. 63 */ 64 private UnicodeSet spanNotSet; 65 66 /** The strings of the parent set. */ 67 private ArrayList<String> strings; 68 69 /** The lengths of span(), spanBack() etc. for each string. */ 70 private short[] spanLengths; 71 72 /** Maximum lengths of relevant strings. */ 73 private final int maxLength16; 74 75 /** Are there strings that are not fully contained in the code point set? */ 76 private boolean someRelevant; 77 78 /** Set up for all variants of span()? */ 79 private boolean all; 80 81 /** Span helper */ 82 private OffsetList offsets; 83 84 /** 85 * Constructs for all variants of span(), or only for any one variant. 86 * Initializes as little as possible, for single use. 87 */ UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which)88 public UnicodeSetStringSpan(final UnicodeSet set, final ArrayList<String> setStrings, int which) { 89 spanSet = new UnicodeSet(0, 0x10ffff); 90 // TODO: With Java 6, just take the parent set's strings as is, 91 // as a NavigableSet<String>, rather than as an ArrayList copy of the set of strings. 92 // Then iterate via the first() and higher() methods. 93 // (We do not want to create multiple Iterator objects in each span().) 94 // See ICU ticket #7454. 95 strings = setStrings; 96 all = (which == ALL); 97 spanSet.retainAll(set); 98 if (0 != (which & NOT_CONTAINED)) { 99 // Default to the same sets. 100 // addToSpanNotSet() will create a separate set if necessary. 101 spanNotSet = spanSet; 102 } 103 offsets = new OffsetList(); 104 105 // Determine if the strings even need to be taken into account at all for span() etc. 106 // If any string is relevant, then all strings need to be used for 107 // span(longest match) but only the relevant ones for span(while contained). 108 // TODO: Possible optimization: Distinguish CONTAINED vs. LONGEST_MATCH 109 // and do not store UTF-8 strings if !thisRelevant and CONTAINED. 110 // (Only store irrelevant UTF-8 strings for LONGEST_MATCH where they are relevant after all.) 111 // Also count the lengths of the UTF-8 versions of the strings for memory allocation. 112 int stringsLength = strings.size(); 113 114 int i, spanLength; 115 int maxLength16 = 0; 116 someRelevant = false; 117 for (i = 0; i < stringsLength; ++i) { 118 String string = strings.get(i); 119 int length16 = string.length(); 120 spanLength = spanSet.span(string, SpanCondition.CONTAINED); 121 if (spanLength < length16) { // Relevant string. 122 someRelevant = true; 123 } 124 if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) { 125 maxLength16 = length16; 126 } 127 } 128 this.maxLength16 = maxLength16; 129 if (!someRelevant && (which & WITH_COUNT) == 0) { 130 return; 131 } 132 133 // Freeze after checking for the need to use strings at all because freezing 134 // a set takes some time and memory which are wasted if there are no relevant strings. 135 if (all) { 136 spanSet.freeze(); 137 } 138 139 int spanBackLengthsOffset; 140 141 // Allocate a block of meta data. 142 int allocSize; 143 if (all) { 144 // 2 sets of span lengths 145 allocSize = stringsLength * (2); 146 } else { 147 allocSize = stringsLength; // One set of span lengths. 148 } 149 spanLengths = new short[allocSize]; 150 151 if (all) { 152 // Store span lengths for all span() variants. 153 spanBackLengthsOffset = stringsLength; 154 } else { 155 // Store span lengths for only one span() variant. 156 spanBackLengthsOffset = 0; 157 } 158 159 // Set the meta data and spanNotSet and write the UTF-8 strings. 160 161 for (i = 0; i < stringsLength; ++i) { 162 String string = strings.get(i); 163 int length16 = string.length(); 164 spanLength = spanSet.span(string, SpanCondition.CONTAINED); 165 if (spanLength < length16) { // Relevant string. 166 if (true /* 0 != (which & UTF16) */) { 167 if (0 != (which & CONTAINED)) { 168 if (0 != (which & FWD)) { 169 spanLengths[i] = makeSpanLengthByte(spanLength); 170 } 171 if (0 != (which & BACK)) { 172 spanLength = length16 173 - spanSet.spanBack(string, length16, SpanCondition.CONTAINED); 174 spanLengths[spanBackLengthsOffset + i] = makeSpanLengthByte(spanLength); 175 } 176 } else /* not CONTAINED, not all, but NOT_CONTAINED */{ 177 spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = 0; // Only store a relevant/irrelevant 178 // flag. 179 } 180 } 181 if (0 != (which & NOT_CONTAINED)) { 182 // Add string start and end code points to the spanNotSet so that 183 // a span(while not contained) stops before any string. 184 int c; 185 if (0 != (which & FWD)) { 186 c = string.codePointAt(0); 187 addToSpanNotSet(c); 188 } 189 if (0 != (which & BACK)) { 190 c = string.codePointBefore(length16); 191 addToSpanNotSet(c); 192 } 193 } 194 } else { // Irrelevant string. 195 if (all) { 196 spanLengths[i] = spanLengths[spanBackLengthsOffset + i] = ALL_CP_CONTAINED; 197 } else { 198 // All spanXYZLengths pointers contain the same address. 199 spanLengths[i] = ALL_CP_CONTAINED; 200 } 201 } 202 } 203 204 // Finish. 205 if (all) { 206 spanNotSet.freeze(); 207 } 208 } 209 210 /** 211 * Constructs a copy of an existing UnicodeSetStringSpan. 212 * Assumes which==ALL for a frozen set. 213 */ UnicodeSetStringSpan(final UnicodeSetStringSpan otherStringSpan, final ArrayList<String> newParentSetStrings)214 public UnicodeSetStringSpan(final UnicodeSetStringSpan otherStringSpan, 215 final ArrayList<String> newParentSetStrings) { 216 spanSet = otherStringSpan.spanSet; 217 strings = newParentSetStrings; 218 maxLength16 = otherStringSpan.maxLength16; 219 someRelevant = otherStringSpan.someRelevant; 220 all = true; 221 if (Utility.sameObjects(otherStringSpan.spanNotSet, otherStringSpan.spanSet)) { 222 spanNotSet = spanSet; 223 } else { 224 spanNotSet = (UnicodeSet) otherStringSpan.spanNotSet.clone(); 225 } 226 offsets = new OffsetList(); 227 228 spanLengths = otherStringSpan.spanLengths.clone(); 229 } 230 231 /** 232 * Do the strings need to be checked in span() etc.? 233 * 234 * @return true if strings need to be checked (call span() here), 235 * false if not (use a BMPSet for best performance). 236 */ needsStringSpanUTF16()237 public boolean needsStringSpanUTF16() { 238 return someRelevant; 239 } 240 241 /** For fast UnicodeSet::contains(c). */ contains(int c)242 public boolean contains(int c) { 243 return spanSet.contains(c); 244 } 245 246 /** 247 * Adds a starting or ending string character to the spanNotSet 248 * so that a character span ends before any string. 249 */ addToSpanNotSet(int c)250 private void addToSpanNotSet(int c) { 251 if (Utility.sameObjects(spanNotSet, null) || Utility.sameObjects(spanNotSet, spanSet)) { 252 if (spanSet.contains(c)) { 253 return; // Nothing to do. 254 } 255 spanNotSet = spanSet.cloneAsThawed(); 256 } 257 spanNotSet.add(c); 258 } 259 260 /* 261 * Note: In span() when spanLength==0 262 * (after a string match, or at the beginning after an empty code point span) 263 * and in spanNot() and spanNotUTF8(), 264 * string matching could use a binary search because all string matches are done 265 * from the same start index. 266 * 267 * For UTF-8, this would require a comparison function that returns UTF-16 order. 268 * 269 * This optimization should not be necessary for normal UnicodeSets because most sets have no strings, and most sets 270 * with strings have very few very short strings. For cases with many strings, it might be better to use a different 271 * API and implementation with a DFA (state machine). 272 */ 273 274 /* 275 * Algorithm for span(SpanCondition.CONTAINED) 276 * 277 * Theoretical algorithm: 278 * - Iterate through the string, and at each code point boundary: 279 * + If the code point there is in the set, then remember to continue after it. 280 * + If a set string matches at the current position, then remember to continue after it. 281 * + Either recursively span for each code point or string match, or recursively span 282 * for all but the shortest one and iteratively continue the span with the shortest local match. 283 * + Remember the longest recursive span (the farthest end point). 284 * + If there is no match at the current position, 285 * neither for the code point there nor for any set string, 286 * then stop and return the longest recursive span length. 287 * 288 * Optimized implementation: 289 * 290 * (We assume that most sets will have very few very short strings. 291 * A span using a string-less set is extremely fast.) 292 * 293 * Create and cache a spanSet which contains all of the single code points of the original set 294 * but none of its strings. 295 * 296 * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). 297 * - Loop: 298 * + Try to match each set string at the end of the spanLength. 299 * ~ Set strings that start with set-contained code points 300 * must be matched with a partial overlap 301 * because the recursive algorithm would have tried to match them at every position. 302 * ~ Set strings that entirely consist of set-contained code points 303 * are irrelevant for span(SpanCondition.CONTAINED) 304 * because the recursive algorithm would continue after them anyway and 305 * find the longest recursive match from their end. 306 * ~ Rather than recursing, note each end point of a set string match. 307 * + If no set string matched after spanSet.span(), 308 * then return with where the spanSet.span() ended. 309 * + If at least one set string matched after spanSet.span(), 310 * then pop the shortest string match end point and continue the loop, 311 * trying to match all set strings from there. 312 * + If at least one more set string matched after a previous string match, then test if the 313 * code point after the previous string match is also contained in the set. 314 * Continue the loop with the shortest end point of 315 * either this code point or a matching set string. 316 * + If no more set string matched after a previous string match, 317 * then try another spanLength=spanSet.span(SpanCondition.CONTAINED). 318 * Stop if spanLength==0, otherwise continue the loop. 319 * 320 * By noting each end point of a set string match, the function visits each string position at most once and 321 * finishes in linear time. 322 * 323 * The recursive algorithm may visit the same string position many times 324 * if multiple paths lead to it and finishes in exponential time. 325 */ 326 327 /* 328 * Algorithm for span(SIMPLE) 329 * 330 * Theoretical algorithm: 331 * - Iterate through the string, and at each code point boundary: 332 * + If the code point there is in the set, then remember to continue after it. 333 * + If a set string matches at the current position, then remember to continue after it. 334 * + Continue from the farthest match position and ignore all others. 335 * + If there is no match at the current position, then stop and return the current position. 336 * 337 * Optimized implementation: 338 * 339 * (Same assumption and spanSet as above.) 340 * 341 * - Start with spanLength=spanSet.span(SpanCondition.CONTAINED). 342 * - Loop: 343 * + Try to match each set string at the end of the spanLength. 344 * ~ Set strings that start with set-contained code points 345 * must be matched with a partial overlap 346 * because the standard algorithm would have tried to match them earlier. 347 * ~ Set strings that entirely consist of set-contained code points 348 * must be matched with a full overlap because the longest-match algorithm 349 * would hide set string matches that end earlier. 350 * Such set strings need not be matched earlier inside the code point span 351 * because the standard algorithm would then have 352 * continued after the set string match anyway. 353 * ~ Remember the longest set string match (farthest end point) 354 * from the earliest starting point. 355 * + If no set string matched after spanSet.span(), 356 * then return with where the spanSet.span() ended. 357 * + If at least one set string matched, 358 * then continue the loop after the longest match from the earliest position. 359 * + If no more set string matched after a previous string match, 360 * then try another spanLength=spanSet.span(SpanCondition.CONTAINED). 361 * Stop if spanLength==0, otherwise continue the loop. 362 */ 363 /** 364 * Spans a string. 365 * 366 * @param s The string to be spanned 367 * @param start The start index that the span begins 368 * @param spanCondition The span condition 369 * @return the limit (exclusive end) of the span 370 */ span(CharSequence s, int start, SpanCondition spanCondition)371 public int span(CharSequence s, int start, SpanCondition spanCondition) { 372 if (spanCondition == SpanCondition.NOT_CONTAINED) { 373 return spanNot(s, start, null); 374 } 375 int spanLimit = spanSet.span(s, start, SpanCondition.CONTAINED); 376 if (spanLimit == s.length()) { 377 return spanLimit; 378 } 379 return spanWithStrings(s, start, spanLimit, spanCondition); 380 } 381 382 /** 383 * Synchronized method for complicated spans using the offsets. 384 * Avoids synchronization for simple cases. 385 * 386 * @param spanLimit = spanSet.span(s, start, CONTAINED) 387 */ spanWithStrings(CharSequence s, int start, int spanLimit, SpanCondition spanCondition)388 private synchronized int spanWithStrings(CharSequence s, int start, int spanLimit, 389 SpanCondition spanCondition) { 390 // Consider strings; they may overlap with the span. 391 int initSize = 0; 392 if (spanCondition == SpanCondition.CONTAINED) { 393 // Use offset list to try all possibilities. 394 initSize = maxLength16; 395 } 396 offsets.setMaxLength(initSize); 397 int length = s.length(); 398 int pos = spanLimit, rest = length - spanLimit; 399 int spanLength = spanLimit - start; 400 int i, stringsLength = strings.size(); 401 for (;;) { 402 if (spanCondition == SpanCondition.CONTAINED) { 403 for (i = 0; i < stringsLength; ++i) { 404 int overlap = spanLengths[i]; 405 if (overlap == ALL_CP_CONTAINED) { 406 continue; // Irrelevant string. 407 } 408 String string = strings.get(i); 409 410 int length16 = string.length(); 411 412 // Try to match this string at pos-overlap..pos. 413 if (overlap >= LONG_SPAN) { 414 overlap = length16; 415 // While contained: No point matching fully inside the code point span. 416 overlap = string.offsetByCodePoints(overlap, -1); // Length of the string minus the last code 417 // point. 418 } 419 if (overlap > spanLength) { 420 overlap = spanLength; 421 } 422 int inc = length16 - overlap; // Keep overlap+inc==length16. 423 for (;;) { 424 if (inc > rest) { 425 break; 426 } 427 // Try to match if the increment is not listed already. 428 if (!offsets.containsOffset(inc) && matches16CPB(s, pos - overlap, length, string, length16)) { 429 if (inc == rest) { 430 return length; // Reached the end of the string. 431 } 432 offsets.addOffset(inc); 433 } 434 if (overlap == 0) { 435 break; 436 } 437 --overlap; 438 ++inc; 439 } 440 } 441 } else /* SIMPLE */{ 442 int maxInc = 0, maxOverlap = 0; 443 for (i = 0; i < stringsLength; ++i) { 444 int overlap = spanLengths[i]; 445 // For longest match, we do need to try to match even an all-contained string 446 // to find the match from the earliest start. 447 448 String string = strings.get(i); 449 450 int length16 = string.length(); 451 452 // Try to match this string at pos-overlap..pos. 453 if (overlap >= LONG_SPAN) { 454 overlap = length16; 455 // Longest match: Need to match fully inside the code point span 456 // to find the match from the earliest start. 457 } 458 if (overlap > spanLength) { 459 overlap = spanLength; 460 } 461 int inc = length16 - overlap; // Keep overlap+inc==length16. 462 for (;;) { 463 if (inc > rest || overlap < maxOverlap) { 464 break; 465 } 466 // Try to match if the string is longer or starts earlier. 467 if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */inc > maxInc) 468 && matches16CPB(s, pos - overlap, length, string, length16)) { 469 maxInc = inc; // Longest match from earliest start. 470 maxOverlap = overlap; 471 break; 472 } 473 --overlap; 474 ++inc; 475 } 476 } 477 478 if (maxInc != 0 || maxOverlap != 0) { 479 // Longest-match algorithm, and there was a string match. 480 // Simply continue after it. 481 pos += maxInc; 482 rest -= maxInc; 483 if (rest == 0) { 484 return length; // Reached the end of the string. 485 } 486 spanLength = 0; // Match strings from after a string match. 487 continue; 488 } 489 } 490 // Finished trying to match all strings at pos. 491 492 if (spanLength != 0 || pos == 0) { 493 // The position is after an unlimited code point span (spanLength!=0), 494 // not after a string match. 495 // The only position where spanLength==0 after a span is pos==0. 496 // Otherwise, an unlimited code point span is only tried again when no 497 // strings match, and if such a non-initial span fails we stop. 498 if (offsets.isEmpty()) { 499 return pos; // No strings matched after a span. 500 } 501 // Match strings from after the next string match. 502 } else { 503 // The position is after a string match (or a single code point). 504 if (offsets.isEmpty()) { 505 // No more strings matched after a previous string match. 506 // Try another code point span from after the last string match. 507 spanLimit = spanSet.span(s, pos, SpanCondition.CONTAINED); 508 spanLength = spanLimit - pos; 509 if (spanLength == rest || // Reached the end of the string, or 510 spanLength == 0 // neither strings nor span progressed. 511 ) { 512 return spanLimit; 513 } 514 pos += spanLength; 515 rest -= spanLength; 516 continue; // spanLength>0: Match strings from after a span. 517 } else { 518 // Try to match only one code point from after a string match if some 519 // string matched beyond it, so that we try all possible positions 520 // and don't overshoot. 521 spanLength = spanOne(spanSet, s, pos, rest); 522 if (spanLength > 0) { 523 if (spanLength == rest) { 524 return length; // Reached the end of the string. 525 } 526 // Match strings after this code point. 527 // There cannot be any increments below it because UnicodeSet strings 528 // contain multiple code points. 529 pos += spanLength; 530 rest -= spanLength; 531 offsets.shift(spanLength); 532 spanLength = 0; 533 continue; // Match strings from after a single code point. 534 } 535 // Match strings from after the next string match. 536 } 537 } 538 int minOffset = offsets.popMinimum(null); 539 pos += minOffset; 540 rest -= minOffset; 541 spanLength = 0; // Match strings from after a string match. 542 } 543 } 544 545 /** 546 * Spans a string and counts the smallest number of set elements on any path across the span. 547 * 548 * <p>For proper counting, we cannot ignore strings that are fully contained in code point spans. 549 * 550 * <p>If the set does not have any fully-contained strings, then we could optimize this 551 * like span(), but such sets are likely rare, and this is at least still linear. 552 * 553 * @param s The string to be spanned 554 * @param start The start index that the span begins 555 * @param spanCondition The span condition 556 * @param outCount The count 557 * @return the limit (exclusive end) of the span 558 */ spanAndCount(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount)559 public int spanAndCount(CharSequence s, int start, SpanCondition spanCondition, 560 OutputInt outCount) { 561 if (spanCondition == SpanCondition.NOT_CONTAINED) { 562 return spanNot(s, start, outCount); 563 } 564 // Consider strings; they may overlap with the span, 565 // and they may result in a smaller count that with just code points. 566 if (spanCondition == SpanCondition.CONTAINED) { 567 return spanContainedAndCount(s, start, outCount); 568 } 569 // SIMPLE (not synchronized, does not use offsets) 570 int stringsLength = strings.size(); 571 int length = s.length(); 572 int pos = start; 573 int rest = length - start; 574 int count = 0; 575 while (rest != 0) { 576 // Try to match the next code point. 577 int cpLength = spanOne(spanSet, s, pos, rest); 578 int maxInc = (cpLength > 0) ? cpLength : 0; 579 // Try to match all of the strings. 580 for (int i = 0; i < stringsLength; ++i) { 581 String string = strings.get(i); 582 int length16 = string.length(); 583 if (maxInc < length16 && length16 <= rest && 584 matches16CPB(s, pos, length, string, length16)) { 585 maxInc = length16; 586 } 587 } 588 // We are done if there is no match beyond pos. 589 if (maxInc == 0) { 590 outCount.value = count; 591 return pos; 592 } 593 // Continue from the longest match. 594 ++count; 595 pos += maxInc; 596 rest -= maxInc; 597 } 598 outCount.value = count; 599 return pos; 600 } 601 spanContainedAndCount(CharSequence s, int start, OutputInt outCount)602 private synchronized int spanContainedAndCount(CharSequence s, int start, OutputInt outCount) { 603 // Use offset list to try all possibilities. 604 offsets.setMaxLength(maxLength16); 605 int stringsLength = strings.size(); 606 int length = s.length(); 607 int pos = start; 608 int rest = length - start; 609 int count = 0; 610 while (rest != 0) { 611 // Try to match the next code point. 612 int cpLength = spanOne(spanSet, s, pos, rest); 613 if (cpLength > 0) { 614 offsets.addOffsetAndCount(cpLength, count + 1); 615 } 616 // Try to match all of the strings. 617 for (int i = 0; i < stringsLength; ++i) { 618 String string = strings.get(i); 619 int length16 = string.length(); 620 // Note: If the strings were sorted by length, then we could also 621 // avoid trying to match if there is already a match of the same length. 622 if (length16 <= rest && !offsets.hasCountAtOffset(length16, count + 1) && 623 matches16CPB(s, pos, length, string, length16)) { 624 offsets.addOffsetAndCount(length16, count + 1); 625 } 626 } 627 // We are done if there is no match beyond pos. 628 if (offsets.isEmpty()) { 629 outCount.value = count; 630 return pos; 631 } 632 // Continue from the nearest match. 633 int minOffset = offsets.popMinimum(outCount); 634 count = outCount.value; 635 pos += minOffset; 636 rest -= minOffset; 637 } 638 outCount.value = count; 639 return pos; 640 } 641 642 /** 643 * Span a string backwards. 644 * 645 * @param s The string to be spanned 646 * @param spanCondition The span condition 647 * @return The string index which starts the span (i.e. inclusive). 648 */ spanBack(CharSequence s, int length, SpanCondition spanCondition)649 public synchronized int spanBack(CharSequence s, int length, SpanCondition spanCondition) { 650 if (spanCondition == SpanCondition.NOT_CONTAINED) { 651 return spanNotBack(s, length); 652 } 653 int pos = spanSet.spanBack(s, length, SpanCondition.CONTAINED); 654 if (pos == 0) { 655 return 0; 656 } 657 int spanLength = length - pos; 658 659 // Consider strings; they may overlap with the span. 660 int initSize = 0; 661 if (spanCondition == SpanCondition.CONTAINED) { 662 // Use offset list to try all possibilities. 663 initSize = maxLength16; 664 } 665 offsets.setMaxLength(initSize); 666 int i, stringsLength = strings.size(); 667 int spanBackLengthsOffset = 0; 668 if (all) { 669 spanBackLengthsOffset = stringsLength; 670 } 671 for (;;) { 672 if (spanCondition == SpanCondition.CONTAINED) { 673 for (i = 0; i < stringsLength; ++i) { 674 int overlap = spanLengths[spanBackLengthsOffset + i]; 675 if (overlap == ALL_CP_CONTAINED) { 676 continue; // Irrelevant string. 677 } 678 String string = strings.get(i); 679 680 int length16 = string.length(); 681 682 // Try to match this string at pos-(length16-overlap)..pos-length16. 683 if (overlap >= LONG_SPAN) { 684 overlap = length16; 685 // While contained: No point matching fully inside the code point span. 686 int len1 = 0; 687 len1 = string.offsetByCodePoints(0, 1); 688 overlap -= len1; // Length of the string minus the first code point. 689 } 690 if (overlap > spanLength) { 691 overlap = spanLength; 692 } 693 int dec = length16 - overlap; // Keep dec+overlap==length16. 694 for (;;) { 695 if (dec > pos) { 696 break; 697 } 698 // Try to match if the decrement is not listed already. 699 if (!offsets.containsOffset(dec) && matches16CPB(s, pos - dec, length, string, length16)) { 700 if (dec == pos) { 701 return 0; // Reached the start of the string. 702 } 703 offsets.addOffset(dec); 704 } 705 if (overlap == 0) { 706 break; 707 } 708 --overlap; 709 ++dec; 710 } 711 } 712 } else /* SIMPLE */{ 713 int maxDec = 0, maxOverlap = 0; 714 for (i = 0; i < stringsLength; ++i) { 715 int overlap = spanLengths[spanBackLengthsOffset + i]; 716 // For longest match, we do need to try to match even an all-contained string 717 // to find the match from the latest end. 718 719 String string = strings.get(i); 720 721 int length16 = string.length(); 722 723 // Try to match this string at pos-(length16-overlap)..pos-length16. 724 if (overlap >= LONG_SPAN) { 725 overlap = length16; 726 // Longest match: Need to match fully inside the code point span 727 // to find the match from the latest end. 728 } 729 if (overlap > spanLength) { 730 overlap = spanLength; 731 } 732 int dec = length16 - overlap; // Keep dec+overlap==length16. 733 for (;;) { 734 if (dec > pos || overlap < maxOverlap) { 735 break; 736 } 737 // Try to match if the string is longer or ends later. 738 if ((overlap > maxOverlap || /* redundant overlap==maxOverlap && */dec > maxDec) 739 && matches16CPB(s, pos - dec, length, string, length16)) { 740 maxDec = dec; // Longest match from latest end. 741 maxOverlap = overlap; 742 break; 743 } 744 --overlap; 745 ++dec; 746 } 747 } 748 749 if (maxDec != 0 || maxOverlap != 0) { 750 // Longest-match algorithm, and there was a string match. 751 // Simply continue before it. 752 pos -= maxDec; 753 if (pos == 0) { 754 return 0; // Reached the start of the string. 755 } 756 spanLength = 0; // Match strings from before a string match. 757 continue; 758 } 759 } 760 // Finished trying to match all strings at pos. 761 762 if (spanLength != 0 || pos == length) { 763 // The position is before an unlimited code point span (spanLength!=0), 764 // not before a string match. 765 // The only position where spanLength==0 before a span is pos==length. 766 // Otherwise, an unlimited code point span is only tried again when no 767 // strings match, and if such a non-initial span fails we stop. 768 if (offsets.isEmpty()) { 769 return pos; // No strings matched before a span. 770 } 771 // Match strings from before the next string match. 772 } else { 773 // The position is before a string match (or a single code point). 774 if (offsets.isEmpty()) { 775 // No more strings matched before a previous string match. 776 // Try another code point span from before the last string match. 777 int oldPos = pos; 778 pos = spanSet.spanBack(s, oldPos, SpanCondition.CONTAINED); 779 spanLength = oldPos - pos; 780 if (pos == 0 || // Reached the start of the string, or 781 spanLength == 0 // neither strings nor span progressed. 782 ) { 783 return pos; 784 } 785 continue; // spanLength>0: Match strings from before a span. 786 } else { 787 // Try to match only one code point from before a string match if some 788 // string matched beyond it, so that we try all possible positions 789 // and don't overshoot. 790 spanLength = spanOneBack(spanSet, s, pos); 791 if (spanLength > 0) { 792 if (spanLength == pos) { 793 return 0; // Reached the start of the string. 794 } 795 // Match strings before this code point. 796 // There cannot be any decrements below it because UnicodeSet strings 797 // contain multiple code points. 798 pos -= spanLength; 799 offsets.shift(spanLength); 800 spanLength = 0; 801 continue; // Match strings from before a single code point. 802 } 803 // Match strings from before the next string match. 804 } 805 } 806 pos -= offsets.popMinimum(null); 807 spanLength = 0; // Match strings from before a string match. 808 } 809 } 810 811 /** 812 * Algorithm for spanNot()==span(SpanCondition.NOT_CONTAINED) 813 * 814 * Theoretical algorithm: 815 * - Iterate through the string, and at each code point boundary: 816 * + If the code point there is in the set, then return with the current position. 817 * + If a set string matches at the current position, then return with the current position. 818 * 819 * Optimized implementation: 820 * 821 * (Same assumption as for span() above.) 822 * 823 * Create and cache a spanNotSet which contains 824 * all of the single code points of the original set but none of its strings. 825 * For each set string add its initial code point to the spanNotSet. 826 * (Also add its final code point for spanNotBack().) 827 * 828 * - Loop: 829 * + Do spanLength=spanNotSet.span(SpanCondition.NOT_CONTAINED). 830 * + If the current code point is in the original set, then return the current position. 831 * + If any set string matches at the current position, then return the current position. 832 * + If there is no match at the current position, neither for the code point 833 * there nor for any set string, then skip this code point and continue the loop. 834 * This happens for set-string-initial code points that were added to spanNotSet 835 * when there is not actually a match for such a set string. 836 * 837 * @param s The string to be spanned 838 * @param start The start index that the span begins 839 * @param outCount If not null: Receives the number of code points across the span. 840 * @return the limit (exclusive end) of the span 841 */ spanNot(CharSequence s, int start, OutputInt outCount)842 private int spanNot(CharSequence s, int start, OutputInt outCount) { 843 int length = s.length(); 844 int pos = start, rest = length - start; 845 int stringsLength = strings.size(); 846 int count = 0; 847 do { 848 // Span until we find a code point from the set, 849 // or a code point that starts or ends some string. 850 int spanLimit; 851 if (outCount == null) { 852 spanLimit = spanNotSet.span(s, pos, SpanCondition.NOT_CONTAINED); 853 } else { 854 spanLimit = spanNotSet.spanAndCount(s, pos, SpanCondition.NOT_CONTAINED, outCount); 855 outCount.value = count = count + outCount.value; 856 } 857 if (spanLimit == length) { 858 return length; // Reached the end of the string. 859 } 860 pos = spanLimit; 861 rest = length - spanLimit; 862 863 // Check whether the current code point is in the original set, 864 // without the string starts and ends. 865 int cpLength = spanOne(spanSet, s, pos, rest); 866 if (cpLength > 0) { 867 return pos; // There is a set element at pos. 868 } 869 870 // Try to match the strings at pos. 871 for (int i = 0; i < stringsLength; ++i) { 872 if (spanLengths[i] == ALL_CP_CONTAINED) { 873 continue; // Irrelevant string. 874 } 875 String string = strings.get(i); 876 877 int length16 = string.length(); 878 if (length16 <= rest && matches16CPB(s, pos, length, string, length16)) { 879 return pos; // There is a set element at pos. 880 } 881 } 882 883 // The span(while not contained) ended on a string start/end which is 884 // not in the original set. Skip this code point and continue. 885 // cpLength<0 886 pos -= cpLength; 887 rest += cpLength; 888 ++count; 889 } while (rest != 0); 890 if (outCount != null) { 891 outCount.value = count; 892 } 893 return length; // Reached the end of the string. 894 } 895 spanNotBack(CharSequence s, int length)896 private int spanNotBack(CharSequence s, int length) { 897 int pos = length; 898 int i, stringsLength = strings.size(); 899 do { 900 // Span until we find a code point from the set, 901 // or a code point that starts or ends some string. 902 pos = spanNotSet.spanBack(s, pos, SpanCondition.NOT_CONTAINED); 903 if (pos == 0) { 904 return 0; // Reached the start of the string. 905 } 906 907 // Check whether the current code point is in the original set, 908 // without the string starts and ends. 909 int cpLength = spanOneBack(spanSet, s, pos); 910 if (cpLength > 0) { 911 return pos; // There is a set element at pos. 912 } 913 914 // Try to match the strings at pos. 915 for (i = 0; i < stringsLength; ++i) { 916 // Use spanLengths rather than a spanLengths pointer because 917 // it is easier and we only need to know whether the string is irrelevant 918 // which is the same in either array. 919 if (spanLengths[i] == ALL_CP_CONTAINED) { 920 continue; // Irrelevant string. 921 } 922 String string = strings.get(i); 923 924 int length16 = string.length(); 925 if (length16 <= pos && matches16CPB(s, pos - length16, length, string, length16)) { 926 return pos; // There is a set element at pos. 927 } 928 } 929 930 // The span(while not contained) ended on a string start/end which is 931 // not in the original set. Skip this code point and continue. 932 // cpLength<0 933 pos += cpLength; 934 } while (pos != 0); 935 return 0; // Reached the start of the string. 936 } 937 makeSpanLengthByte(int spanLength)938 static short makeSpanLengthByte(int spanLength) { 939 // 0xfe==UnicodeSetStringSpan::LONG_SPAN 940 return spanLength < LONG_SPAN ? (short) spanLength : LONG_SPAN; 941 } 942 943 // Compare strings without any argument checks. Requires length>0. matches16(CharSequence s, int start, final String t, int length)944 private static boolean matches16(CharSequence s, int start, final String t, int length) { 945 int end = start + length; 946 while (length-- > 0) { 947 if (s.charAt(--end) != t.charAt(length)) { 948 return false; 949 } 950 } 951 return true; 952 } 953 954 /** 955 * Compare 16-bit Unicode strings (which may be malformed UTF-16) 956 * at code point boundaries. 957 * That is, each edge of a match must not be in the middle of a surrogate pair. 958 * @param s The string to match in. 959 * @param start The start index of s. 960 * @param limit The limit of the subsequence of s being spanned. 961 * @param t The substring to be matched in s. 962 * @param tlength The length of t. 963 */ matches16CPB(CharSequence s, int start, int limit, final String t, int tlength)964 static boolean matches16CPB(CharSequence s, int start, int limit, final String t, int tlength) { 965 return matches16(s, start, t, tlength) 966 && !(0 < start && Character.isHighSurrogate(s.charAt(start - 1)) && 967 Character.isLowSurrogate(s.charAt(start))) 968 && !((start + tlength) < limit && Character.isHighSurrogate(s.charAt(start + tlength - 1)) && 969 Character.isLowSurrogate(s.charAt(start + tlength))); 970 } 971 972 /** 973 * Does the set contain the next code point? 974 * If so, return its length; otherwise return its negative length. 975 */ spanOne(final UnicodeSet set, CharSequence s, int start, int length)976 static int spanOne(final UnicodeSet set, CharSequence s, int start, int length) { 977 char c = s.charAt(start); 978 if (c >= 0xd800 && c <= 0xdbff && length >= 2) { 979 char c2 = s.charAt(start + 1); 980 if (ohos.global.icu.text.UTF16.isTrailSurrogate(c2)) { 981 int supplementary = Character.toCodePoint(c, c2); 982 return set.contains(supplementary) ? 2 : -2; 983 } 984 } 985 return set.contains(c) ? 1 : -1; 986 } 987 spanOneBack(final UnicodeSet set, CharSequence s, int length)988 static int spanOneBack(final UnicodeSet set, CharSequence s, int length) { 989 char c = s.charAt(length - 1); 990 if (c >= 0xdc00 && c <= 0xdfff && length >= 2) { 991 char c2 = s.charAt(length - 2); 992 if (ohos.global.icu.text.UTF16.isLeadSurrogate(c2)) { 993 int supplementary = Character.toCodePoint(c2, c); 994 return set.contains(supplementary) ? 2 : -2; 995 } 996 } 997 return set.contains(c) ? 1 : -1; 998 } 999 1000 /** 1001 * Helper class for UnicodeSetStringSpan. 1002 * 1003 * <p>List of offsets from the current position from where to try matching 1004 * a code point or a string. 1005 * Stores offsets rather than indexes to simplify the code and use the same list 1006 * for both increments (in span()) and decrements (in spanBack()). 1007 * 1008 * <p>Assumption: The maximum offset is limited, and the offsets that are stored at any one time 1009 * are relatively dense, that is, 1010 * there are normally no gaps of hundreds or thousands of offset values. 1011 * 1012 * <p>This class optionally also tracks the minimum non-negative count for each position, 1013 * intended to count the smallest number of elements of any path leading to that position. 1014 * 1015 * <p>The implementation uses a circular buffer of count integers, 1016 * each indicating whether the corresponding offset is in the list, 1017 * and its path element count. 1018 * This avoids inserting into a sorted list of offsets (or absolute indexes) 1019 * and physically moving part of the list. 1020 * 1021 * <p>Note: In principle, the caller should setMaxLength() to 1022 * the maximum of the max string length and U16_LENGTH/U8_LENGTH 1023 * to account for "long" single code points. 1024 * 1025 * <p>Note: An earlier version did not track counts and stored only byte flags. 1026 * With boolean flags, if maxLength were guaranteed to be no more than 32 or 64, 1027 * the list could be stored as bit flags in a single integer. 1028 * Rather than handling a circular buffer with a start list index, 1029 * the integer would simply be shifted when lower offsets are removed. 1030 * UnicodeSet does not have a limit on the lengths of strings. 1031 */ 1032 private static final class OffsetList { 1033 private int[] list; 1034 private int length; 1035 private int start; 1036 OffsetList()1037 public OffsetList() { 1038 list = new int[16]; // default size 1039 } 1040 setMaxLength(int maxLength)1041 public void setMaxLength(int maxLength) { 1042 if (maxLength > list.length) { 1043 list = new int[maxLength]; 1044 } 1045 clear(); 1046 } 1047 clear()1048 public void clear() { 1049 for (int i = list.length; i-- > 0;) { 1050 list[i] = 0; 1051 } 1052 start = length = 0; 1053 } 1054 isEmpty()1055 public boolean isEmpty() { 1056 return (length == 0); 1057 } 1058 1059 /** 1060 * Reduces all stored offsets by delta, used when the current position moves by delta. 1061 * There must not be any offsets lower than delta. 1062 * If there is an offset equal to delta, it is removed. 1063 * 1064 * @param delta [1..maxLength] 1065 */ shift(int delta)1066 public void shift(int delta) { 1067 int i = start + delta; 1068 if (i >= list.length) { 1069 i -= list.length; 1070 } 1071 if (list[i] != 0) { 1072 list[i] = 0; 1073 --length; 1074 } 1075 start = i; 1076 } 1077 1078 /** 1079 * Adds an offset. The list must not contain it yet. 1080 * @param offset [1..maxLength] 1081 */ addOffset(int offset)1082 public void addOffset(int offset) { 1083 int i = start + offset; 1084 if (i >= list.length) { 1085 i -= list.length; 1086 } 1087 assert list[i] == 0; 1088 list[i] = 1; 1089 ++length; 1090 } 1091 1092 /** 1093 * Adds an offset and updates its count. 1094 * The list may already contain the offset. 1095 * @param offset [1..maxLength] 1096 */ addOffsetAndCount(int offset, int count)1097 public void addOffsetAndCount(int offset, int count) { 1098 assert count > 0; 1099 int i = start + offset; 1100 if (i >= list.length) { 1101 i -= list.length; 1102 } 1103 if (list[i] == 0) { 1104 list[i] = count; 1105 ++length; 1106 } else if (count < list[i]) { 1107 list[i] = count; 1108 } 1109 } 1110 1111 /** 1112 * @param offset [1..maxLength] 1113 */ containsOffset(int offset)1114 public boolean containsOffset(int offset) { 1115 int i = start + offset; 1116 if (i >= list.length) { 1117 i -= list.length; 1118 } 1119 return list[i] != 0; 1120 } 1121 1122 /** 1123 * @param offset [1..maxLength] 1124 */ hasCountAtOffset(int offset, int count)1125 public boolean hasCountAtOffset(int offset, int count) { 1126 int i = start + offset; 1127 if (i >= list.length) { 1128 i -= list.length; 1129 } 1130 int oldCount = list[i]; 1131 return oldCount != 0 && oldCount <= count; 1132 } 1133 1134 /** 1135 * Finds the lowest stored offset from a non-empty list, removes it, 1136 * and reduces all other offsets by this minimum. 1137 * @return min=[1..maxLength] 1138 */ popMinimum(OutputInt outCount)1139 public int popMinimum(OutputInt outCount) { 1140 // Look for the next offset in list[start+1..list.length-1]. 1141 int i = start, result; 1142 while (++i < list.length) { 1143 int count = list[i]; 1144 if (count != 0) { 1145 list[i] = 0; 1146 --length; 1147 result = i - start; 1148 start = i; 1149 if (outCount != null) { outCount.value = count; } 1150 return result; 1151 } 1152 } 1153 // i==list.length 1154 1155 // Wrap around and look for the next offset in list[0..start]. 1156 // Since the list is not empty, there will be one. 1157 result = list.length - start; 1158 i = 0; 1159 int count; 1160 while ((count = list[i]) == 0) { 1161 ++i; 1162 } 1163 list[i] = 0; 1164 --length; 1165 start = i; 1166 if (outCount != null) { outCount.value = count; } 1167 return result + i; 1168 } 1169 } 1170 } 1171