1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2018 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 5 // created: 2018may04 Markus W. Scherer 6 7 package ohos.global.icu.util; 8 9 import java.util.Arrays; 10 11 /** 12 * Mutable Unicode code point trie. 13 * Fast map from Unicode code points (U+0000..U+10FFFF) to 32-bit integer values. 14 * For details see http://site.icu-project.org/design/struct/utrie 15 * 16 * <p>Setting values (especially ranges) and lookup is fast. 17 * The mutable trie is only somewhat space-efficient. 18 * It builds a compacted, immutable {@link CodePointTrie}. 19 * 20 * <p>This trie can be modified while iterating over its contents. 21 * For example, it is possible to merge its values with those from another 22 * set of ranges (e.g., another @{link CodePointMap}): 23 * Iterate over those source ranges; for each of them iterate over this trie; 24 * add the source value into the value of each trie range. 25 * 26 * @hide exposed on OHOS 27 */ 28 public final class MutableCodePointTrie extends CodePointMap implements Cloneable { 29 /** 30 * Constructs a mutable trie that initially maps each Unicode code point to the same value. 31 * It uses 32-bit data values until 32 * {@link #buildImmutable(ohos.global.icu.util.CodePointTrie.Type, ohos.global.icu.util.CodePointTrie.ValueWidth)} 33 * is called. 34 * buildImmutable() takes a valueWidth parameter which 35 * determines the number of bits in the data value in the resulting {@link CodePointTrie}. 36 * 37 * @param initialValue the initial value that is set for all code points 38 * @param errorValue the value for out-of-range code points and ill-formed UTF-8/16 39 */ MutableCodePointTrie(int initialValue, int errorValue)40 public MutableCodePointTrie(int initialValue, int errorValue) { 41 index = new int[BMP_I_LIMIT]; 42 index3NullOffset = -1; 43 data = new int[INITIAL_DATA_LENGTH]; 44 dataNullOffset = -1; 45 origInitialValue = initialValue; 46 this.initialValue = initialValue; 47 this.errorValue = errorValue; 48 highValue = initialValue; 49 } 50 51 /** 52 * Clones this mutable trie. 53 * 54 * @return the clone 55 */ 56 @Override clone()57 public MutableCodePointTrie clone() { 58 try { 59 MutableCodePointTrie builder = (MutableCodePointTrie) super.clone(); 60 int iCapacity = highStart <= BMP_LIMIT ? BMP_I_LIMIT : I_LIMIT; 61 builder.index = new int[iCapacity]; 62 builder.flags = new byte[UNICODE_LIMIT >> CodePointTrie.SHIFT_3]; 63 for (int i = 0, iLimit = highStart >> CodePointTrie.SHIFT_3; i < iLimit; ++i) { 64 builder.index[i] = index[i]; 65 builder.flags[i] = flags[i]; 66 } 67 builder.index3NullOffset = index3NullOffset; 68 builder.data = data.clone(); 69 builder.dataLength = dataLength; 70 builder.dataNullOffset = dataNullOffset; 71 builder.origInitialValue = origInitialValue; 72 builder.initialValue = initialValue; 73 builder.errorValue = errorValue; 74 builder.highStart = highStart; 75 builder.highValue = highValue; 76 assert index16 == null; 77 return builder; 78 } catch (CloneNotSupportedException ignored) { 79 // Unreachable: Cloning *is* supported. 80 return null; 81 } 82 } 83 84 /** 85 * Creates a mutable trie with the same contents as the {@link CodePointMap}. 86 * 87 * @param map the source map or trie 88 * @return the mutable trie 89 */ fromCodePointMap(CodePointMap map)90 public static MutableCodePointTrie fromCodePointMap(CodePointMap map) { 91 // TODO: Consider special code branch for map instanceof CodePointTrie? 92 // Use the highValue as the initialValue to reduce the highStart. 93 int errorValue = map.get(-1); 94 int initialValue = map.get(MAX_UNICODE); 95 MutableCodePointTrie mutableTrie = new MutableCodePointTrie(initialValue, errorValue); 96 CodePointMap.Range range = new CodePointMap.Range(); 97 int start = 0; 98 while (map.getRange(start, null, range)) { 99 int end = range.getEnd(); 100 int value = range.getValue(); 101 if (value != initialValue) { 102 if (start == end) { 103 mutableTrie.set(start, value); 104 } else { 105 mutableTrie.setRange(start, end, value); 106 } 107 } 108 start = end + 1; 109 } 110 return mutableTrie; 111 } 112 clear()113 private void clear() { 114 index3NullOffset = dataNullOffset = -1; 115 dataLength = 0; 116 highValue = initialValue = origInitialValue; 117 highStart = 0; 118 index16 = null; 119 } 120 121 /** 122 * {@inheritDoc} 123 */ 124 @Override get(int c)125 public int get(int c) { 126 if (c < 0 || MAX_UNICODE < c) { 127 return errorValue; 128 } 129 if (c >= highStart) { 130 return highValue; 131 } 132 int i = c >> CodePointTrie.SHIFT_3; 133 if (flags[i] == ALL_SAME) { 134 return index[i]; 135 } else { 136 return data[index[i] + (c & CodePointTrie.SMALL_DATA_MASK)]; 137 } 138 } 139 maybeFilterValue(int value, int initialValue, int nullValue, ValueFilter filter)140 private static final int maybeFilterValue(int value, int initialValue, int nullValue, 141 ValueFilter filter) { 142 if (value == initialValue) { 143 value = nullValue; 144 } else if (filter != null) { 145 value = filter.apply(value); 146 } 147 return value; 148 } 149 150 /** 151 * {@inheritDoc} 152 * 153 * <p>The trie can be modified between calls to this function. 154 */ 155 @Override getRange(int start, CodePointTrie.ValueFilter filter, CodePointTrie.Range range)156 public boolean getRange(int start, CodePointTrie.ValueFilter filter, 157 CodePointTrie.Range range) { 158 if (start < 0 || MAX_UNICODE < start) { 159 return false; 160 } 161 if (start >= highStart) { 162 int value = highValue; 163 if (filter != null) { value = filter.apply(value); } 164 range.set(start, MAX_UNICODE, value); 165 return true; 166 } 167 int nullValue = initialValue; 168 if (filter != null) { nullValue = filter.apply(nullValue); } 169 int c = start; 170 // Initialize to make compiler happy. Real value when haveValue is true. 171 int trieValue = 0, value = 0; 172 boolean haveValue = false; 173 int i = c >> CodePointTrie.SHIFT_3; 174 do { 175 if (flags[i] == ALL_SAME) { 176 int trieValue2 = index[i]; 177 if (haveValue) { 178 if (trieValue2 != trieValue) { 179 if (filter == null || 180 maybeFilterValue(trieValue2, initialValue, nullValue, 181 filter) != value) { 182 range.set(start, c - 1, value); 183 return true; 184 } 185 trieValue = trieValue2; // may or may not help 186 } 187 } else { 188 trieValue = trieValue2; 189 value = maybeFilterValue(trieValue2, initialValue, nullValue, filter); 190 haveValue = true; 191 } 192 c = (c + CodePointTrie.SMALL_DATA_BLOCK_LENGTH) & ~CodePointTrie.SMALL_DATA_MASK; 193 } else /* MIXED */ { 194 int di = index[i] + (c & CodePointTrie.SMALL_DATA_MASK); 195 int trieValue2 = data[di]; 196 if (haveValue) { 197 if (trieValue2 != trieValue) { 198 if (filter == null || 199 maybeFilterValue(trieValue2, initialValue, nullValue, 200 filter) != value) { 201 range.set(start, c - 1, value); 202 return true; 203 } 204 trieValue = trieValue2; // may or may not help 205 } 206 } else { 207 trieValue = trieValue2; 208 value = maybeFilterValue(trieValue2, initialValue, nullValue, filter); 209 haveValue = true; 210 } 211 while ((++c & CodePointTrie.SMALL_DATA_MASK) != 0) { 212 trieValue2 = data[++di]; 213 if (trieValue2 != trieValue) { 214 if (filter == null || 215 maybeFilterValue(trieValue2, initialValue, nullValue, 216 filter) != value) { 217 range.set(start, c - 1, value); 218 return true; 219 } 220 trieValue = trieValue2; // may or may not help 221 } 222 } 223 } 224 ++i; 225 } while (c < highStart); 226 assert(haveValue); 227 if (maybeFilterValue(highValue, initialValue, nullValue, filter) != value) { 228 range.set(start, c - 1, value); 229 } else { 230 range.set(start, MAX_UNICODE, value); 231 } 232 return true; 233 } 234 writeBlock(int block, int value)235 private void writeBlock(int block, int value) { 236 int limit = block + CodePointTrie.SMALL_DATA_BLOCK_LENGTH; 237 Arrays.fill(data, block, limit, value); 238 } 239 240 /** 241 * Sets a value for a code point. 242 * 243 * @param c the code point 244 * @param value the value 245 */ set(int c, int value)246 public void set(int c, int value) { 247 if (c < 0 || MAX_UNICODE < c) { 248 throw new IllegalArgumentException("invalid code point"); 249 } 250 251 ensureHighStart(c); 252 int block = getDataBlock(c >> CodePointTrie.SHIFT_3); 253 data[block + (c & CodePointTrie.SMALL_DATA_MASK)] = value; 254 } 255 fillBlock(int block, int start, int limit, int value)256 private void fillBlock(int block, int start, int limit, int value) { 257 Arrays.fill(data, block + start, block + limit, value); 258 } 259 260 /** 261 * Sets a value for each code point [start..end]. 262 * Faster and more space-efficient than setting the value for each code point separately. 263 * 264 * @param start the first code point to get the value 265 * @param end the last code point to get the value (inclusive) 266 * @param value the value 267 */ setRange(int start, int end, int value)268 public void setRange(int start, int end, int value) { 269 if (start < 0 || MAX_UNICODE < start || end < 0 || MAX_UNICODE < end || start > end) { 270 throw new IllegalArgumentException("invalid code point range"); 271 } 272 ensureHighStart(end); 273 274 int limit = end + 1; 275 if ((start & CodePointTrie.SMALL_DATA_MASK) != 0) { 276 // Set partial block at [start..following block boundary[. 277 int block = getDataBlock(start >> CodePointTrie.SHIFT_3); 278 int nextStart = (start + CodePointTrie.SMALL_DATA_MASK) & ~CodePointTrie.SMALL_DATA_MASK; 279 if (nextStart <= limit) { 280 fillBlock(block, start & CodePointTrie.SMALL_DATA_MASK, 281 CodePointTrie.SMALL_DATA_BLOCK_LENGTH, value); 282 start = nextStart; 283 } else { 284 fillBlock(block, start & CodePointTrie.SMALL_DATA_MASK, 285 limit & CodePointTrie.SMALL_DATA_MASK, value); 286 return; 287 } 288 } 289 290 // Number of positions in the last, partial block. 291 int rest = limit & CodePointTrie.SMALL_DATA_MASK; 292 293 // Round down limit to a block boundary. 294 limit &= ~CodePointTrie.SMALL_DATA_MASK; 295 296 // Iterate over all-value blocks. 297 while (start < limit) { 298 int i = start >> CodePointTrie.SHIFT_3; 299 if (flags[i] == ALL_SAME) { 300 index[i] = value; 301 } else /* MIXED */ { 302 fillBlock(index[i], 0, CodePointTrie.SMALL_DATA_BLOCK_LENGTH, value); 303 } 304 start += CodePointTrie.SMALL_DATA_BLOCK_LENGTH; 305 } 306 307 if (rest > 0) { 308 // Set partial block at [last block boundary..limit[. 309 int block = getDataBlock(start >> CodePointTrie.SHIFT_3); 310 fillBlock(block, 0, rest, value); 311 } 312 } 313 314 /** 315 * Compacts the data and builds an immutable {@link CodePointTrie} according to the parameters. 316 * After this, the mutable trie will be empty. 317 * 318 * <p>The mutable trie stores 32-bit values until buildImmutable() is called. 319 * If values shorter than 32 bits are to be stored in the immutable trie, 320 * then the upper bits are discarded. 321 * For example, when the mutable trie contains values 0x81, -0x7f, and 0xa581, 322 * and the value width is 8 bits, then each of these is stored as 0x81 323 * and the immutable trie will return that as an unsigned value. 324 * (Some implementations may want to make productive temporary use of the upper bits 325 * until buildImmutable() discards them.) 326 * 327 * <p>Not every possible set of mappings can be built into a CodePointTrie, 328 * because of limitations resulting from speed and space optimizations. 329 * Every Unicode assigned character can be mapped to a unique value. 330 * Typical data yields data structures far smaller than the limitations. 331 * 332 * <p>It is possible to construct extremely unusual mappings that exceed the 333 * data structure limits. 334 * In such a case this function will throw an exception. 335 * 336 * @param type selects the trie type 337 * @param valueWidth selects the number of bits in a trie data value; if smaller than 32 bits, 338 * then the values stored in the trie will be truncated first 339 * 340 * @see #fromCodePointMap(CodePointMap) 341 */ buildImmutable(CodePointTrie.Type type, CodePointTrie.ValueWidth valueWidth)342 public CodePointTrie buildImmutable(CodePointTrie.Type type, CodePointTrie.ValueWidth valueWidth) { 343 if (type == null || valueWidth == null) { 344 throw new IllegalArgumentException("The type and valueWidth must be specified."); 345 } 346 347 try { 348 return build(type, valueWidth); 349 } finally { 350 clear(); 351 } 352 } 353 354 private static final int MAX_UNICODE = 0x10ffff; 355 356 private static final int UNICODE_LIMIT = 0x110000; 357 private static final int BMP_LIMIT = 0x10000; 358 private static final int ASCII_LIMIT = 0x80; 359 360 private static final int I_LIMIT = UNICODE_LIMIT >> CodePointTrie.SHIFT_3; 361 private static final int BMP_I_LIMIT = BMP_LIMIT >> CodePointTrie.SHIFT_3; 362 private static final int ASCII_I_LIMIT = ASCII_LIMIT >> CodePointTrie.SHIFT_3; 363 364 private static final int SMALL_DATA_BLOCKS_PER_BMP_BLOCK = (1 << (CodePointTrie.FAST_SHIFT - CodePointTrie.SHIFT_3)); 365 366 // Flag values for data blocks. 367 private static final byte ALL_SAME = 0; 368 private static final byte MIXED = 1; 369 private static final byte SAME_AS = 2; 370 371 /** Start with allocation of 16k data entries. */ 372 private static final int INITIAL_DATA_LENGTH = (1 << 14); 373 374 /** Grow about 8x each time. */ 375 private static final int MEDIUM_DATA_LENGTH = (1 << 17); 376 377 /** 378 * Maximum length of the build-time data array. 379 * One entry per 0x110000 code points. 380 */ 381 private static final int MAX_DATA_LENGTH = UNICODE_LIMIT; 382 383 // Flag values for index-3 blocks while compacting/building. 384 private static final byte I3_NULL = 0; 385 private static final byte I3_BMP = 1; 386 private static final byte I3_16 = 2; 387 private static final byte I3_18 = 3; 388 389 private static final int INDEX_3_18BIT_BLOCK_LENGTH = CodePointTrie.INDEX_3_BLOCK_LENGTH + CodePointTrie.INDEX_3_BLOCK_LENGTH / 8; 390 391 private int[] index; 392 private int index3NullOffset; 393 private int[] data; 394 private int dataLength; 395 private int dataNullOffset; 396 397 private int origInitialValue; 398 private int initialValue; 399 private int errorValue; 400 private int highStart; 401 private int highValue; 402 403 /** Temporary array while building the final data. */ 404 private char[] index16; 405 private byte[] flags = new byte[UNICODE_LIMIT >> CodePointTrie.SHIFT_3]; 406 ensureHighStart(int c)407 private void ensureHighStart(int c) { 408 if (c >= highStart) { 409 // Round up to a CodePointTrie.CP_PER_INDEX_2_ENTRY boundary to simplify compaction. 410 c = (c + CodePointTrie.CP_PER_INDEX_2_ENTRY) & ~(CodePointTrie.CP_PER_INDEX_2_ENTRY - 1); 411 int i = highStart >> CodePointTrie.SHIFT_3; 412 int iLimit = c >> CodePointTrie.SHIFT_3; 413 if (iLimit > index.length) { 414 int[] newIndex = new int[I_LIMIT]; 415 for (int j = 0; j < i; ++j) { newIndex[j] = index[j]; } 416 index = newIndex; 417 } 418 do { 419 flags[i] = ALL_SAME; 420 index[i] = initialValue; 421 } while(++i < iLimit); 422 highStart = c; 423 } 424 } 425 allocDataBlock(int blockLength)426 private int allocDataBlock(int blockLength) { 427 int newBlock = dataLength; 428 int newTop = newBlock + blockLength; 429 if (newTop > data.length) { 430 int capacity; 431 if (data.length < MEDIUM_DATA_LENGTH) { 432 capacity = MEDIUM_DATA_LENGTH; 433 } else if (data.length < MAX_DATA_LENGTH) { 434 capacity = MAX_DATA_LENGTH; 435 } else { 436 // Should never occur. 437 // Either MAX_DATA_LENGTH is incorrect, 438 // or the code writes more values than should be possible. 439 throw new AssertionError(); 440 } 441 int[] newData = new int[capacity]; 442 for (int j = 0; j < dataLength; ++j) { newData[j] = data[j]; } 443 data = newData; 444 } 445 dataLength = newTop; 446 return newBlock; 447 } 448 449 /** 450 * No error checking for illegal arguments. 451 * The Java version always returns non-negative values. 452 */ getDataBlock(int i)453 private int getDataBlock(int i) { 454 if (flags[i] == MIXED) { 455 return index[i]; 456 } 457 if (i < BMP_I_LIMIT) { 458 int newBlock = allocDataBlock(CodePointTrie.FAST_DATA_BLOCK_LENGTH); 459 int iStart = i & ~(SMALL_DATA_BLOCKS_PER_BMP_BLOCK -1); 460 int iLimit = iStart + SMALL_DATA_BLOCKS_PER_BMP_BLOCK; 461 do { 462 assert(flags[iStart] == ALL_SAME); 463 writeBlock(newBlock, index[iStart]); 464 flags[iStart] = MIXED; 465 index[iStart++] = newBlock; 466 newBlock += CodePointTrie.SMALL_DATA_BLOCK_LENGTH; 467 } while (iStart < iLimit); 468 return index[i]; 469 } else { 470 int newBlock = allocDataBlock(CodePointTrie.SMALL_DATA_BLOCK_LENGTH); 471 if (newBlock < 0) { return newBlock; } 472 writeBlock(newBlock, index[i]); 473 flags[i] = MIXED; 474 index[i] = newBlock; 475 return newBlock; 476 } 477 } 478 479 // compaction -------------------------------------------------------------- 480 maskValues(int mask)481 private void maskValues(int mask) { 482 initialValue &= mask; 483 errorValue &= mask; 484 highValue &= mask; 485 int iLimit = highStart >> CodePointTrie.SHIFT_3; 486 for (int i = 0; i < iLimit; ++i) { 487 if (flags[i] == ALL_SAME) { 488 index[i] &= mask; 489 } 490 } 491 for (int i = 0; i < dataLength; ++i) { 492 data[i] &= mask; 493 } 494 } 495 equalBlocks(int[] s, int si, int[] t, int ti, int length)496 private static boolean equalBlocks(int[] s, int si, int[] t, int ti, int length) { 497 while (length > 0 && s[si] == t[ti]) { 498 ++si; 499 ++ti; 500 --length; 501 } 502 return length == 0; 503 } 504 equalBlocks(char[] s, int si, int[] t, int ti, int length)505 private static boolean equalBlocks(char[] s, int si, int[] t, int ti, int length) { 506 while (length > 0 && s[si] == t[ti]) { 507 ++si; 508 ++ti; 509 --length; 510 } 511 return length == 0; 512 } 513 equalBlocks(char[] s, int si, char[] t, int ti, int length)514 private static boolean equalBlocks(char[] s, int si, char[] t, int ti, int length) { 515 while (length > 0 && s[si] == t[ti]) { 516 ++si; 517 ++ti; 518 --length; 519 } 520 return length == 0; 521 } 522 allValuesSameAs(int[] p, int pi, int length, int value)523 private static boolean allValuesSameAs(int[] p, int pi, int length, int value) { 524 int pLimit = pi + length; 525 while (pi < pLimit && p[pi] == value) { ++pi; } 526 return pi == pLimit; 527 } 528 529 /** Search for an identical block. */ findSameBlock(char[] p, int pStart, int length, char[] q, int qStart, int blockLength)530 private static int findSameBlock(char[] p, int pStart, int length, 531 char[] q, int qStart, int blockLength) { 532 // Ensure that we do not even partially get past length. 533 length -= blockLength; 534 535 while (pStart <= length) { 536 if (equalBlocks(p, pStart, q, qStart, blockLength)) { 537 return pStart; 538 } 539 ++pStart; 540 } 541 return -1; 542 } 543 findAllSameBlock(int[] p, int start, int limit, int value, int blockLength)544 private static int findAllSameBlock(int[] p, int start, int limit, 545 int value, int blockLength) { 546 // Ensure that we do not even partially get past limit. 547 limit -= blockLength; 548 549 for (int block = start; block <= limit; ++block) { 550 if (p[block] == value) { 551 for (int i = 1;; ++i) { 552 if (i == blockLength) { 553 return block; 554 } 555 if (p[block + i] != value) { 556 block += i; 557 break; 558 } 559 } 560 } 561 } 562 return -1; 563 } 564 565 /** 566 * Look for maximum overlap of the beginning of the other block 567 * with the previous, adjacent block. 568 */ getOverlap(int[] p, int length, int[] q, int qStart, int blockLength)569 private static int getOverlap(int[] p, int length, int[] q, int qStart, int blockLength) { 570 int overlap = blockLength - 1; 571 assert(overlap <= length); 572 while (overlap > 0 && !equalBlocks(p, length - overlap, q, qStart, overlap)) { 573 --overlap; 574 } 575 return overlap; 576 } 577 getOverlap(char[] p, int length, int[] q, int qStart, int blockLength)578 private static int getOverlap(char[] p, int length, int[] q, int qStart, int blockLength) { 579 int overlap = blockLength - 1; 580 assert(overlap <= length); 581 while (overlap > 0 && !equalBlocks(p, length - overlap, q, qStart, overlap)) { 582 --overlap; 583 } 584 return overlap; 585 } 586 getOverlap(char[] p, int length, char[] q, int qStart, int blockLength)587 private static int getOverlap(char[] p, int length, char[] q, int qStart, int blockLength) { 588 int overlap = blockLength - 1; 589 assert(overlap <= length); 590 while (overlap > 0 && !equalBlocks(p, length - overlap, q, qStart, overlap)) { 591 --overlap; 592 } 593 return overlap; 594 } 595 getAllSameOverlap(int[] p, int length, int value, int blockLength)596 private static int getAllSameOverlap(int[] p, int length, int value, int blockLength) { 597 int min = length - (blockLength - 1); 598 int i = length; 599 while (min < i && p[i - 1] == value) { --i; } 600 return length - i; 601 } 602 isStartOfSomeFastBlock(int dataOffset, int[] index, int fastILimit)603 private static boolean isStartOfSomeFastBlock(int dataOffset, int[] index, int fastILimit) { 604 for (int i = 0; i < fastILimit; i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK) { 605 if (index[i] == dataOffset) { 606 return true; 607 } 608 } 609 return false; 610 } 611 612 /** 613 * Finds the start of the last range in the trie by enumerating backward. 614 * Indexes for code points higher than this will be omitted. 615 */ findHighStart()616 private int findHighStart() { 617 int i = highStart >> CodePointTrie.SHIFT_3; 618 while (i > 0) { 619 boolean match; 620 if (flags[--i] == ALL_SAME) { 621 match = index[i] == highValue; 622 } else /* MIXED */ { 623 int p = index[i]; 624 for (int j = 0;; ++j) { 625 if (j == CodePointTrie.SMALL_DATA_BLOCK_LENGTH) { 626 match = true; 627 break; 628 } 629 if (data[p + j] != highValue) { 630 match = false; 631 break; 632 } 633 } 634 } 635 if (!match) { 636 return (i + 1) << CodePointTrie.SHIFT_3; 637 } 638 } 639 return 0; 640 } 641 642 private static final class AllSameBlocks { 643 static final int NEW_UNIQUE = -1; 644 static final int OVERFLOW = -2; 645 AllSameBlocks()646 AllSameBlocks() { 647 mostRecent = -1; 648 } 649 findOrAdd(int index, int count, int value)650 int findOrAdd(int index, int count, int value) { 651 if (mostRecent >= 0 && values[mostRecent] == value) { 652 refCounts[mostRecent] += count; 653 return indexes[mostRecent]; 654 } 655 for (int i = 0; i < length; ++i) { 656 if (values[i] == value) { 657 mostRecent = i; 658 refCounts[i] += count; 659 return indexes[i]; 660 } 661 } 662 if (length == CAPACITY) { 663 return OVERFLOW; 664 } 665 mostRecent = length; 666 indexes[length] = index; 667 values[length] = value; 668 refCounts[length++] = count; 669 return NEW_UNIQUE; 670 } 671 672 /** Replaces the block which has the lowest reference count. */ add(int index, int count, int value)673 void add(int index, int count, int value) { 674 assert(length == CAPACITY); 675 int least = -1; 676 int leastCount = I_LIMIT; 677 for (int i = 0; i < length; ++i) { 678 assert(values[i] != value); 679 if (refCounts[i] < leastCount) { 680 least = i; 681 leastCount = refCounts[i]; 682 } 683 } 684 assert(least >= 0); 685 mostRecent = least; 686 indexes[least] = index; 687 values[least] = value; 688 refCounts[least] = count; 689 } 690 findMostUsed()691 int findMostUsed() { 692 if (length == 0) { return -1; } 693 int max = -1; 694 int maxCount = 0; 695 for (int i = 0; i < length; ++i) { 696 if (refCounts[i] > maxCount) { 697 max = i; 698 maxCount = refCounts[i]; 699 } 700 } 701 return indexes[max]; 702 } 703 704 private static final int CAPACITY = 32; 705 706 private int length; 707 private int mostRecent; 708 709 private int[] indexes = new int[CAPACITY]; 710 private int[] values = new int[CAPACITY]; 711 private int[] refCounts = new int[CAPACITY]; 712 } 713 714 // Custom hash table for mixed-value blocks to be found anywhere in the 715 // compacted data or index so far. 716 private static final class MixedBlocks { init(int maxLength, int newBlockLength)717 void init(int maxLength, int newBlockLength) { 718 // We store actual data indexes + 1 to reserve 0 for empty entries. 719 int maxDataIndex = maxLength - newBlockLength + 1; 720 int newLength; 721 if (maxDataIndex <= 0xfff) { // 4k 722 newLength = 6007; 723 shift = 12; 724 mask = 0xfff; 725 } else if (maxDataIndex <= 0x7fff) { // 32k 726 newLength = 50021; 727 shift = 15; 728 mask = 0x7fff; 729 } else if (maxDataIndex <= 0x1ffff) { // 128k 730 newLength = 200003; 731 shift = 17; 732 mask = 0x1ffff; 733 } else { 734 // maxDataIndex up to around MAX_DATA_LENGTH, ca. 1.1M 735 newLength = 1500007; 736 shift = 21; 737 mask = 0x1fffff; 738 } 739 if (table == null || newLength > table.length) { 740 table = new int[newLength]; 741 } else { 742 Arrays.fill(table, 0, newLength, 0); 743 } 744 length = newLength; 745 746 blockLength = newBlockLength; 747 } 748 extend(int[] data, int minStart, int prevDataLength, int newDataLength)749 void extend(int[] data, int minStart, int prevDataLength, int newDataLength) { 750 int start = prevDataLength - blockLength; 751 if (start >= minStart) { 752 ++start; // Skip the last block that we added last time. 753 } else { 754 start = minStart; // Begin with the first full block. 755 } 756 for (int end = newDataLength - blockLength; start <= end; ++start) { 757 int hashCode = makeHashCode(data, start); 758 addEntry(data, null, start, hashCode, start); 759 } 760 } 761 extend(char[] data, int minStart, int prevDataLength, int newDataLength)762 void extend(char[] data, int minStart, int prevDataLength, int newDataLength) { 763 int start = prevDataLength - blockLength; 764 if (start >= minStart) { 765 ++start; // Skip the last block that we added last time. 766 } else { 767 start = minStart; // Begin with the first full block. 768 } 769 for (int end = newDataLength - blockLength; start <= end; ++start) { 770 int hashCode = makeHashCode(data, start); 771 addEntry(null, data, start, hashCode, start); 772 } 773 } 774 findBlock(int[] data, int[] blockData, int blockStart)775 int findBlock(int[] data, int[] blockData, int blockStart) { 776 int hashCode = makeHashCode(blockData, blockStart); 777 int entryIndex = findEntry(data, null, blockData, null, blockStart, hashCode); 778 if (entryIndex >= 0) { 779 return (table[entryIndex] & mask) - 1; 780 } else { 781 return -1; 782 } 783 } 784 findBlock(char[] data, int[] blockData, int blockStart)785 int findBlock(char[] data, int[] blockData, int blockStart) { 786 int hashCode = makeHashCode(blockData, blockStart); 787 int entryIndex = findEntry(null, data, blockData, null, blockStart, hashCode); 788 if (entryIndex >= 0) { 789 return (table[entryIndex] & mask) - 1; 790 } else { 791 return -1; 792 } 793 } 794 findBlock(char[] data, char[] blockData, int blockStart)795 int findBlock(char[] data, char[] blockData, int blockStart) { 796 int hashCode = makeHashCode(blockData, blockStart); 797 int entryIndex = findEntry(null, data, null, blockData, blockStart, hashCode); 798 if (entryIndex >= 0) { 799 return (table[entryIndex] & mask) - 1; 800 } else { 801 return -1; 802 } 803 } 804 findAllSameBlock(int[] data, int blockValue)805 int findAllSameBlock(int[] data, int blockValue) { 806 int hashCode = makeHashCode(blockValue); 807 int entryIndex = findEntry(data, blockValue, hashCode); 808 if (entryIndex >= 0) { 809 return (table[entryIndex] & mask) - 1; 810 } else { 811 return -1; 812 } 813 } 814 makeHashCode(int[] blockData, int blockStart)815 private int makeHashCode(int[] blockData, int blockStart) { 816 int blockLimit = blockStart + blockLength; 817 int hashCode = blockData[blockStart++]; 818 do { 819 hashCode = 37 * hashCode + blockData[blockStart++]; 820 } while (blockStart < blockLimit); 821 return hashCode; 822 } 823 makeHashCode(char[] blockData, int blockStart)824 private int makeHashCode(char[] blockData, int blockStart) { 825 int blockLimit = blockStart + blockLength; 826 int hashCode = blockData[blockStart++]; 827 do { 828 hashCode = 37 * hashCode + blockData[blockStart++]; 829 } while (blockStart < blockLimit); 830 return hashCode; 831 } 832 makeHashCode(int blockValue)833 private int makeHashCode(int blockValue) { 834 int hashCode = blockValue; 835 for (int i = 1; i < blockLength; ++i) { 836 hashCode = 37 * hashCode + blockValue; 837 } 838 return hashCode; 839 } 840 addEntry(int[] data32, char[] data16, int blockStart, int hashCode, int dataIndex)841 private void addEntry(int[] data32, char[] data16, int blockStart, int hashCode, int dataIndex) { 842 assert(0 <= dataIndex && dataIndex < mask); 843 int entryIndex = findEntry(data32, data16, data32, data16, blockStart, hashCode); 844 if (entryIndex < 0) { 845 table[~entryIndex] = (hashCode << shift) | (dataIndex + 1); 846 } 847 } 848 findEntry(int[] data32, char[] data16, int[] blockData32, char[] blockData16, int blockStart, int hashCode)849 private int findEntry(int[] data32, char[] data16, 850 int[] blockData32, char[] blockData16, int blockStart, int hashCode) { 851 int shiftedHashCode = hashCode << shift; 852 int initialEntryIndex = modulo(hashCode, length - 1) + 1; // 1..length-1 853 for (int entryIndex = initialEntryIndex;;) { 854 int entry = table[entryIndex]; 855 if (entry == 0) { 856 return ~entryIndex; 857 } 858 if ((entry & ~mask) == shiftedHashCode) { 859 int dataIndex = (entry & mask) - 1; 860 if (data32 != null ? 861 equalBlocks(data32, dataIndex, blockData32, blockStart, blockLength) : 862 blockData32 != null ? 863 equalBlocks(data16, dataIndex, blockData32, blockStart, blockLength) : 864 equalBlocks(data16, dataIndex, blockData16, blockStart, blockLength)) { 865 return entryIndex; 866 } 867 } 868 entryIndex = nextIndex(initialEntryIndex, entryIndex); 869 } 870 } 871 findEntry(int[] data, int blockValue, int hashCode)872 private int findEntry(int[] data, int blockValue, int hashCode) { 873 int shiftedHashCode = hashCode << shift; 874 int initialEntryIndex = modulo(hashCode, length - 1) + 1; // 1..length-1 875 for (int entryIndex = initialEntryIndex;;) { 876 int entry = table[entryIndex]; 877 if (entry == 0) { 878 return ~entryIndex; 879 } 880 if ((entry & ~mask) == shiftedHashCode) { 881 int dataIndex = (entry & mask) - 1; 882 if (allValuesSameAs(data, dataIndex, blockLength, blockValue)) { 883 return entryIndex; 884 } 885 } 886 entryIndex = nextIndex(initialEntryIndex, entryIndex); 887 } 888 } 889 nextIndex(int initialEntryIndex, int entryIndex)890 private int nextIndex(int initialEntryIndex, int entryIndex) { 891 // U_ASSERT(0 < initialEntryIndex && initialEntryIndex < length); 892 return (entryIndex + initialEntryIndex) % length; 893 } 894 895 /** Ensures non-negative n % m (that is 0..m-1). */ modulo(int n, int m)896 private int modulo(int n, int m) { 897 int i = n % m; 898 if (i < 0) { 899 i += m; 900 } 901 return i; 902 } 903 904 // Hash table. 905 // The length is a prime number, larger than the maximum data length. 906 // The "shift" lower bits store a data index + 1. 907 // The remaining upper bits store a partial hashCode of the block data values. 908 private int[] table; 909 private int length; 910 private int shift; 911 private int mask; 912 913 private int blockLength; 914 } 915 compactWholeDataBlocks(int fastILimit, AllSameBlocks allSameBlocks)916 private int compactWholeDataBlocks(int fastILimit, AllSameBlocks allSameBlocks) { 917 // ASCII data will be stored as a linear table, even if the following code 918 // does not yet count it that way. 919 int newDataCapacity = ASCII_LIMIT; 920 // Add room for a small data null block in case it would match the start of 921 // a fast data block where dataNullOffset must not be set in that case. 922 newDataCapacity += CodePointTrie.SMALL_DATA_BLOCK_LENGTH; 923 // Add room for special values (errorValue, highValue) and padding. 924 newDataCapacity += 4; 925 int iLimit = highStart >> CodePointTrie.SHIFT_3; 926 int blockLength = CodePointTrie.FAST_DATA_BLOCK_LENGTH; 927 int inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK; 928 for (int i = 0; i < iLimit; i += inc) { 929 if (i == fastILimit) { 930 blockLength = CodePointTrie.SMALL_DATA_BLOCK_LENGTH; 931 inc = 1; 932 } 933 int value = index[i]; 934 if (flags[i] == MIXED) { 935 // Really mixed? 936 int p = value; 937 value = data[p]; 938 if (allValuesSameAs(data, p + 1, blockLength - 1, value)) { 939 flags[i] = ALL_SAME; 940 index[i] = value; 941 // Fall through to ALL_SAME handling. 942 } else { 943 newDataCapacity += blockLength; 944 continue; 945 } 946 } else { 947 assert(flags[i] == ALL_SAME); 948 if (inc > 1) { 949 // Do all of the fast-range data block's ALL_SAME parts have the same value? 950 boolean allSame = true; 951 int next_i = i + inc; 952 for (int j = i + 1; j < next_i; ++j) { 953 assert(flags[j] == ALL_SAME); 954 if (index[j] != value) { 955 allSame = false; 956 break; 957 } 958 } 959 if (!allSame) { 960 // Turn it into a MIXED block. 961 if (getDataBlock(i) < 0) { 962 return -1; 963 } 964 newDataCapacity += blockLength; 965 continue; 966 } 967 } 968 } 969 // Is there another ALL_SAME block with the same value? 970 int other = allSameBlocks.findOrAdd(i, inc, value); 971 if (other == AllSameBlocks.OVERFLOW) { 972 // The fixed-size array overflowed. Slow check for a duplicate block. 973 int jInc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK; 974 for (int j = 0;; j += jInc) { 975 if (j == i) { 976 allSameBlocks.add(i, inc, value); 977 break; 978 } 979 if (j == fastILimit) { 980 jInc = 1; 981 } 982 if (flags[j] == ALL_SAME && index[j] == value) { 983 allSameBlocks.add(j, jInc + inc, value); 984 other = j; 985 break; 986 // We could keep counting blocks with the same value 987 // before we add the first one, which may improve compaction in rare cases, 988 // but it would make it slower. 989 } 990 } 991 } 992 if (other >= 0) { 993 flags[i] = SAME_AS; 994 index[i] = other; 995 } else { 996 // New unique same-value block. 997 newDataCapacity += blockLength; 998 } 999 } 1000 return newDataCapacity; 1001 } 1002 1003 /** 1004 * Compacts a build-time trie. 1005 * 1006 * The compaction 1007 * - removes blocks that are identical with earlier ones 1008 * - overlaps each new non-duplicate block as much as possible with the previously-written one 1009 * - works with fast-range data blocks whose length is a multiple of that of 1010 * higher-code-point data blocks 1011 * 1012 * It does not try to find an optimal order of writing, deduplicating, and overlapping blocks. 1013 */ compactData( int fastILimit, int[] newData, int dataNullIndex, MixedBlocks mixedBlocks)1014 private int compactData( 1015 int fastILimit, int[] newData, int dataNullIndex, MixedBlocks mixedBlocks) { 1016 // The linear ASCII data has been copied into newData already. 1017 int newDataLength = 0; 1018 for (int i = 0; newDataLength < ASCII_LIMIT; 1019 newDataLength += CodePointTrie.FAST_DATA_BLOCK_LENGTH, i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK) { 1020 index[i] = newDataLength; 1021 } 1022 1023 int blockLength = CodePointTrie.FAST_DATA_BLOCK_LENGTH; 1024 mixedBlocks.init(newData.length, blockLength); 1025 mixedBlocks.extend(newData, 0, 0, newDataLength); 1026 1027 int iLimit = highStart >> CodePointTrie.SHIFT_3; 1028 int inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK; 1029 int fastLength = 0; 1030 for (int i = ASCII_I_LIMIT; i < iLimit; i += inc) { 1031 if (i == fastILimit) { 1032 blockLength = CodePointTrie.SMALL_DATA_BLOCK_LENGTH; 1033 inc = 1; 1034 fastLength = newDataLength; 1035 mixedBlocks.init(newData.length, blockLength); 1036 mixedBlocks.extend(newData, 0, 0, newDataLength); 1037 } 1038 if (flags[i] == ALL_SAME) { 1039 int value = index[i]; 1040 // Find an earlier part of the data array of length blockLength 1041 // that is filled with this value. 1042 int n = mixedBlocks.findAllSameBlock(newData, value); 1043 // If we find a match, and the current block is the data null block, 1044 // and it is not a fast block but matches the start of a fast block, 1045 // then we need to continue looking. 1046 // This is because this small block is shorter than the fast block, 1047 // and not all of the rest of the fast block is filled with this value. 1048 // Otherwise trie.getRange() would detect that the fast block starts at 1049 // dataNullOffset and assume incorrectly that it is filled with the null value. 1050 while (n >= 0 && i == dataNullIndex && i >= fastILimit && n < fastLength && 1051 isStartOfSomeFastBlock(n, index, fastILimit)) { 1052 n = findAllSameBlock(newData, n + 1, newDataLength, value, blockLength); 1053 } 1054 if (n >= 0) { 1055 index[i] = n; 1056 } else { 1057 n = getAllSameOverlap(newData, newDataLength, value, blockLength); 1058 index[i] = newDataLength - n; 1059 int prevDataLength = newDataLength; 1060 while (n < blockLength) { 1061 newData[newDataLength++] = value; 1062 ++n; 1063 } 1064 mixedBlocks.extend(newData, 0, prevDataLength, newDataLength); 1065 } 1066 } else if (flags[i] == MIXED) { 1067 int block = index[i]; 1068 int n = mixedBlocks.findBlock(newData, data, block); 1069 if (n >= 0) { 1070 index[i] = n; 1071 } else { 1072 n = getOverlap(newData, newDataLength, data, block, blockLength); 1073 index[i] = newDataLength - n; 1074 int prevDataLength = newDataLength; 1075 while (n < blockLength) { 1076 newData[newDataLength++] = data[block + n++]; 1077 } 1078 mixedBlocks.extend(newData, 0, prevDataLength, newDataLength); 1079 } 1080 } else /* SAME_AS */ { 1081 int j = index[i]; 1082 index[i] = index[j]; 1083 } 1084 } 1085 1086 return newDataLength; 1087 } 1088 compactIndex(int fastILimit, MixedBlocks mixedBlocks)1089 private int compactIndex(int fastILimit, MixedBlocks mixedBlocks) { 1090 int fastIndexLength = fastILimit >> (CodePointTrie.FAST_SHIFT - CodePointTrie.SHIFT_3); 1091 if ((highStart >> CodePointTrie.FAST_SHIFT) <= fastIndexLength) { 1092 // Only the linear fast index, no multi-stage index tables. 1093 index3NullOffset = CodePointTrie.NO_INDEX3_NULL_OFFSET; 1094 return fastIndexLength; 1095 } 1096 1097 // Condense the fast index table. 1098 // Also, does it contain an index-3 block with all dataNullOffset? 1099 char[] fastIndex = new char[fastIndexLength]; 1100 int i3FirstNull = -1; 1101 for (int i = 0, j = 0; i < fastILimit; ++j) { 1102 int i3 = index[i]; 1103 fastIndex[j] = (char)i3; 1104 if (i3 == dataNullOffset) { 1105 if (i3FirstNull < 0) { 1106 i3FirstNull = j; 1107 } else if (index3NullOffset < 0 && 1108 (j - i3FirstNull + 1) == CodePointTrie.INDEX_3_BLOCK_LENGTH) { 1109 index3NullOffset = i3FirstNull; 1110 } 1111 } else { 1112 i3FirstNull = -1; 1113 } 1114 // Set the index entries that compactData() skipped. 1115 // Needed when the multi-stage index covers the fast index range as well. 1116 int iNext = i + SMALL_DATA_BLOCKS_PER_BMP_BLOCK; 1117 while (++i < iNext) { 1118 i3 += CodePointTrie.SMALL_DATA_BLOCK_LENGTH; 1119 index[i] = i3; 1120 } 1121 } 1122 1123 mixedBlocks.init(fastIndexLength, CodePointTrie.INDEX_3_BLOCK_LENGTH); 1124 mixedBlocks.extend(fastIndex, 0, 0, fastIndexLength); 1125 1126 // Examine index-3 blocks. For each determine one of: 1127 // - same as the index-3 null block 1128 // - same as a fast-index block 1129 // - 16-bit indexes 1130 // - 18-bit indexes 1131 // We store this in the first flags entry for the index-3 block. 1132 // 1133 // Also determine an upper limit for the index-3 table length. 1134 int index3Capacity = 0; 1135 i3FirstNull = index3NullOffset; 1136 boolean hasLongI3Blocks = false; 1137 // If the fast index covers the whole BMP, then 1138 // the multi-stage index is only for supplementary code points. 1139 // Otherwise, the multi-stage index covers all of Unicode. 1140 int iStart = fastILimit < BMP_I_LIMIT ? 0 : BMP_I_LIMIT; 1141 int iLimit = highStart >> CodePointTrie.SHIFT_3; 1142 for (int i = iStart; i < iLimit;) { 1143 int j = i; 1144 int jLimit = i + CodePointTrie.INDEX_3_BLOCK_LENGTH; 1145 int oredI3 = 0; 1146 boolean isNull = true; 1147 do { 1148 int i3 = index[j]; 1149 oredI3 |= i3; 1150 if (i3 != dataNullOffset) { 1151 isNull = false; 1152 } 1153 } while (++j < jLimit); 1154 if (isNull) { 1155 flags[i] = I3_NULL; 1156 if (i3FirstNull < 0) { 1157 if (oredI3 <= 0xffff) { 1158 index3Capacity += CodePointTrie.INDEX_3_BLOCK_LENGTH; 1159 } else { 1160 index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH; 1161 hasLongI3Blocks = true; 1162 } 1163 i3FirstNull = 0; 1164 } 1165 } else { 1166 if (oredI3 <= 0xffff) { 1167 int n = mixedBlocks.findBlock(fastIndex, index, i); 1168 if (n >= 0) { 1169 flags[i] = I3_BMP; 1170 index[i] = n; 1171 } else { 1172 flags[i] = I3_16; 1173 index3Capacity += CodePointTrie.INDEX_3_BLOCK_LENGTH; 1174 } 1175 } else { 1176 flags[i] = I3_18; 1177 index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH; 1178 hasLongI3Blocks = true; 1179 } 1180 } 1181 i = j; 1182 } 1183 1184 int index2Capacity = (iLimit - iStart) >> CodePointTrie.SHIFT_2_3; 1185 1186 // Length of the index-1 table, rounded up. 1187 int index1Length = (index2Capacity + CodePointTrie.INDEX_2_MASK) >> CodePointTrie.SHIFT_1_2; 1188 1189 // Index table: Fast index, index-1, index-3, index-2. 1190 // +1 for possible index table padding. 1191 int index16Capacity = fastIndexLength + index1Length + index3Capacity + index2Capacity + 1; 1192 index16 = Arrays.copyOf(fastIndex, index16Capacity); 1193 1194 mixedBlocks.init(index16Capacity, CodePointTrie.INDEX_3_BLOCK_LENGTH); 1195 MixedBlocks longI3Blocks = null; 1196 if (hasLongI3Blocks) { 1197 longI3Blocks = new MixedBlocks(); 1198 longI3Blocks.init(index16Capacity, INDEX_3_18BIT_BLOCK_LENGTH); 1199 } 1200 1201 // Compact the index-3 table and write an uncompacted version of the index-2 table. 1202 char[] index2 = new char[index2Capacity]; 1203 int i2Length = 0; 1204 i3FirstNull = index3NullOffset; 1205 int index3Start = fastIndexLength + index1Length; 1206 int indexLength = index3Start; 1207 for (int i = iStart; i < iLimit; i += CodePointTrie.INDEX_3_BLOCK_LENGTH) { 1208 int i3; 1209 byte f = flags[i]; 1210 if (f == I3_NULL && i3FirstNull < 0) { 1211 // First index-3 null block. Write & overlap it like a normal block, then remember it. 1212 f = dataNullOffset <= 0xffff ? I3_16 : I3_18; 1213 i3FirstNull = 0; 1214 } 1215 if (f == I3_NULL) { 1216 i3 = index3NullOffset; 1217 } else if (f == I3_BMP) { 1218 i3 = index[i]; 1219 } else if (f == I3_16) { 1220 int n = mixedBlocks.findBlock(index16, index, i); 1221 if (n >= 0) { 1222 i3 = n; 1223 } else { 1224 if (indexLength == index3Start) { 1225 // No overlap at the boundary between the index-1 and index-3 tables. 1226 n = 0; 1227 } else { 1228 n = getOverlap(index16, indexLength, 1229 index, i, CodePointTrie.INDEX_3_BLOCK_LENGTH); 1230 } 1231 i3 = indexLength - n; 1232 int prevIndexLength = indexLength; 1233 while (n < CodePointTrie.INDEX_3_BLOCK_LENGTH) { 1234 index16[indexLength++] = (char)index[i + n++]; 1235 } 1236 mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength); 1237 if (hasLongI3Blocks) { 1238 longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength); 1239 } 1240 } 1241 } else { 1242 assert(f == I3_18); 1243 assert(hasLongI3Blocks); 1244 // Encode an index-3 block that contains one or more data indexes exceeding 16 bits. 1245 int j = i; 1246 int jLimit = i + CodePointTrie.INDEX_3_BLOCK_LENGTH; 1247 int k = indexLength; 1248 do { 1249 ++k; 1250 int v = index[j++]; 1251 int upperBits = (v & 0x30000) >> 2; 1252 index16[k++] = (char)v; 1253 v = index[j++]; 1254 upperBits |= (v & 0x30000) >> 4; 1255 index16[k++] = (char)v; 1256 v = index[j++]; 1257 upperBits |= (v & 0x30000) >> 6; 1258 index16[k++] = (char)v; 1259 v = index[j++]; 1260 upperBits |= (v & 0x30000) >> 8; 1261 index16[k++] = (char)v; 1262 v = index[j++]; 1263 upperBits |= (v & 0x30000) >> 10; 1264 index16[k++] = (char)v; 1265 v = index[j++]; 1266 upperBits |= (v & 0x30000) >> 12; 1267 index16[k++] = (char)v; 1268 v = index[j++]; 1269 upperBits |= (v & 0x30000) >> 14; 1270 index16[k++] = (char)v; 1271 v = index[j++]; 1272 upperBits |= (v & 0x30000) >> 16; 1273 index16[k++] = (char)v; 1274 index16[k - 9] = (char)upperBits; 1275 } while (j < jLimit); 1276 int n = longI3Blocks.findBlock(index16, index16, indexLength); 1277 if (n >= 0) { 1278 i3 = n | 0x8000; 1279 } else { 1280 if (indexLength == index3Start) { 1281 // No overlap at the boundary between the index-1 and index-3 tables. 1282 n = 0; 1283 } else { 1284 n = getOverlap(index16, indexLength, 1285 index16, indexLength, INDEX_3_18BIT_BLOCK_LENGTH); 1286 } 1287 i3 = (indexLength - n) | 0x8000; 1288 int prevIndexLength = indexLength; 1289 if (n > 0) { 1290 int start = indexLength; 1291 while (n < INDEX_3_18BIT_BLOCK_LENGTH) { 1292 index16[indexLength++] = index16[start + n++]; 1293 } 1294 } else { 1295 indexLength += INDEX_3_18BIT_BLOCK_LENGTH; 1296 } 1297 mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength); 1298 if (hasLongI3Blocks) { 1299 longI3Blocks.extend(index16, index3Start, prevIndexLength, indexLength); 1300 } 1301 } 1302 } 1303 if (index3NullOffset < 0 && i3FirstNull >= 0) { 1304 index3NullOffset = i3; 1305 } 1306 // Set the index-2 table entry. 1307 index2[i2Length++] = (char)i3; 1308 } 1309 assert(i2Length == index2Capacity); 1310 assert(indexLength <= index3Start + index3Capacity); 1311 1312 if (index3NullOffset < 0) { 1313 index3NullOffset = CodePointTrie.NO_INDEX3_NULL_OFFSET; 1314 } 1315 if (indexLength >= (CodePointTrie.NO_INDEX3_NULL_OFFSET + CodePointTrie.INDEX_3_BLOCK_LENGTH)) { 1316 // The index-3 offsets exceed 15 bits, or 1317 // the last one cannot be distinguished from the no-null-block value. 1318 throw new IndexOutOfBoundsException( 1319 "The trie data exceeds limitations of the data structure."); 1320 } 1321 1322 // Compact the index-2 table and write the index-1 table. 1323 // assert(CodePointTrie.INDEX_2_BLOCK_LENGTH == CodePointTrie.INDEX_3_BLOCK_LENGTH) : 1324 // "must re-init mixedBlocks"; 1325 int blockLength = CodePointTrie.INDEX_2_BLOCK_LENGTH; 1326 int i1 = fastIndexLength; 1327 for (int i = 0; i < i2Length; i += blockLength) { 1328 int n; 1329 if ((i2Length - i) >= blockLength) { 1330 // normal block 1331 assert(blockLength == CodePointTrie.INDEX_2_BLOCK_LENGTH); 1332 n = mixedBlocks.findBlock(index16, index2, i); 1333 } else { 1334 // highStart is inside the last index-2 block. Shorten it. 1335 blockLength = i2Length - i; 1336 n = findSameBlock(index16, index3Start, indexLength, 1337 index2, i, blockLength); 1338 } 1339 int i2; 1340 if (n >= 0) { 1341 i2 = n; 1342 } else { 1343 if (indexLength == index3Start) { 1344 // No overlap at the boundary between the index-1 and index-3/2 tables. 1345 n = 0; 1346 } else { 1347 n = getOverlap(index16, indexLength, index2, i, blockLength); 1348 } 1349 i2 = indexLength - n; 1350 int prevIndexLength = indexLength; 1351 while (n < blockLength) { 1352 index16[indexLength++] = index2[i + n++]; 1353 } 1354 mixedBlocks.extend(index16, index3Start, prevIndexLength, indexLength); 1355 } 1356 // Set the index-1 table entry. 1357 index16[i1++] = (char)i2; 1358 } 1359 assert(i1 == index3Start); 1360 assert(indexLength <= index16Capacity); 1361 1362 return indexLength; 1363 } 1364 compactTrie(int fastILimit)1365 private int compactTrie(int fastILimit) { 1366 // Find the real highStart and round it up. 1367 assert((highStart & (CodePointTrie.CP_PER_INDEX_2_ENTRY - 1)) == 0); 1368 highValue = get(MAX_UNICODE); 1369 int realHighStart = findHighStart(); 1370 realHighStart = (realHighStart + (CodePointTrie.CP_PER_INDEX_2_ENTRY - 1)) & 1371 ~(CodePointTrie.CP_PER_INDEX_2_ENTRY - 1); 1372 if (realHighStart == UNICODE_LIMIT) { 1373 highValue = initialValue; 1374 } 1375 1376 // We always store indexes and data values for the fast range. 1377 // Pin highStart to the top of that range while building. 1378 int fastLimit = fastILimit << CodePointTrie.SHIFT_3; 1379 if (realHighStart < fastLimit) { 1380 for (int i = (realHighStart >> CodePointTrie.SHIFT_3); i < fastILimit; ++i) { 1381 flags[i] = ALL_SAME; 1382 index[i] = highValue; 1383 } 1384 highStart = fastLimit; 1385 } else { 1386 highStart = realHighStart; 1387 } 1388 1389 int[] asciiData = new int[ASCII_LIMIT]; 1390 for (int i = 0; i < ASCII_LIMIT; ++i) { 1391 asciiData[i] = get(i); 1392 } 1393 1394 // First we look for which data blocks have the same value repeated over the whole block, 1395 // deduplicate such blocks, find a good null data block (for faster enumeration), 1396 // and get an upper bound for the necessary data array length. 1397 AllSameBlocks allSameBlocks = new AllSameBlocks(); 1398 int newDataCapacity = compactWholeDataBlocks(fastILimit, allSameBlocks); 1399 int[] newData = Arrays.copyOf(asciiData, newDataCapacity); 1400 1401 int dataNullIndex = allSameBlocks.findMostUsed(); 1402 1403 MixedBlocks mixedBlocks = new MixedBlocks(); 1404 int newDataLength = compactData(fastILimit, newData, dataNullIndex, mixedBlocks); 1405 assert(newDataLength <= newDataCapacity); 1406 data = newData; 1407 dataLength = newDataLength; 1408 if (dataLength > (0x3ffff + CodePointTrie.SMALL_DATA_BLOCK_LENGTH)) { 1409 // The offset of the last data block is too high to be stored in the index table. 1410 throw new IndexOutOfBoundsException( 1411 "The trie data exceeds limitations of the data structure."); 1412 } 1413 1414 if (dataNullIndex >= 0) { 1415 dataNullOffset = index[dataNullIndex]; 1416 initialValue = data[dataNullOffset]; 1417 } else { 1418 dataNullOffset = CodePointTrie.NO_DATA_NULL_OFFSET; 1419 } 1420 1421 int indexLength = compactIndex(fastILimit, mixedBlocks); 1422 highStart = realHighStart; 1423 return indexLength; 1424 } 1425 build(CodePointTrie.Type type, CodePointTrie.ValueWidth valueWidth)1426 private CodePointTrie build(CodePointTrie.Type type, CodePointTrie.ValueWidth valueWidth) { 1427 // The mutable trie always stores 32-bit values. 1428 // When we build a UCPTrie for a smaller value width, we first mask off unused bits 1429 // before compacting the data. 1430 switch (valueWidth) { 1431 case BITS_32: 1432 break; 1433 case BITS_16: 1434 maskValues(0xffff); 1435 break; 1436 case BITS_8: 1437 maskValues(0xff); 1438 break; 1439 default: 1440 // Should be unreachable. 1441 throw new IllegalArgumentException(); 1442 } 1443 1444 int fastLimit = type == CodePointTrie.Type.FAST ? BMP_LIMIT : CodePointTrie.SMALL_LIMIT; 1445 int indexLength = compactTrie(fastLimit >> CodePointTrie.SHIFT_3); 1446 1447 // Ensure data table alignment: The index length must be even for uint32_t data. 1448 if (valueWidth == CodePointTrie.ValueWidth.BITS_32 && (indexLength & 1) != 0) { 1449 index16[indexLength++] = 0xffee; // arbitrary value 1450 } 1451 1452 // Make the total trie structure length a multiple of 4 bytes by padding the data table, 1453 // and store special values as the last two data values. 1454 int length = indexLength * 2; 1455 if (valueWidth == CodePointTrie.ValueWidth.BITS_16) { 1456 if (((indexLength ^ dataLength) & 1) != 0) { 1457 // padding 1458 data[dataLength++] = errorValue; 1459 } 1460 if (data[dataLength - 1] != errorValue || data[dataLength - 2] != highValue) { 1461 data[dataLength++] = highValue; 1462 data[dataLength++] = errorValue; 1463 } 1464 length += dataLength * 2; 1465 } else if (valueWidth == CodePointTrie.ValueWidth.BITS_32) { 1466 // 32-bit data words never need padding to a multiple of 4 bytes. 1467 if (data[dataLength - 1] != errorValue || data[dataLength - 2] != highValue) { 1468 if (data[dataLength - 1] != highValue) { 1469 data[dataLength++] = highValue; 1470 } 1471 data[dataLength++] = errorValue; 1472 } 1473 length += dataLength * 4; 1474 } else { 1475 int and3 = (length + dataLength) & 3; 1476 if (and3 == 0 && data[dataLength - 1] == errorValue && data[dataLength - 2] == highValue) { 1477 // all set 1478 } else if(and3 == 3 && data[dataLength - 1] == highValue) { 1479 data[dataLength++] = errorValue; 1480 } else { 1481 while (and3 != 2) { 1482 data[dataLength++] = highValue; 1483 and3 = (and3 + 1) & 3; 1484 } 1485 data[dataLength++] = highValue; 1486 data[dataLength++] = errorValue; 1487 } 1488 length += dataLength; 1489 } 1490 assert((length & 3) == 0); 1491 1492 // Fill the index and data arrays. 1493 char[] trieIndex; 1494 if (highStart <= fastLimit) { 1495 // Condense only the fast index from the mutable-trie index. 1496 trieIndex = new char[indexLength]; 1497 for (int i = 0, j = 0; j < indexLength; i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK, ++j) { 1498 trieIndex[j] = (char)index[i]; 1499 } 1500 } else { 1501 if (indexLength == index16.length) { 1502 trieIndex = index16; 1503 index16 = null; 1504 } else { 1505 trieIndex = Arrays.copyOf(index16, indexLength); 1506 } 1507 } 1508 1509 // Write the data array. 1510 switch (valueWidth) { 1511 case BITS_16: { 1512 // Write 16-bit data values. 1513 char[] data16 = new char[dataLength]; 1514 for (int i = 0; i < dataLength; ++i) { data16[i] = (char)data[i]; } 1515 return type == CodePointTrie.Type.FAST ? 1516 new CodePointTrie.Fast16(trieIndex, data16, highStart, 1517 index3NullOffset, dataNullOffset) : 1518 new CodePointTrie.Small16(trieIndex, data16, highStart, 1519 index3NullOffset, dataNullOffset); 1520 } 1521 case BITS_32: { 1522 // Write 32-bit data values. 1523 int[] data32 = Arrays.copyOf(data, dataLength); 1524 return type == CodePointTrie.Type.FAST ? 1525 new CodePointTrie.Fast32(trieIndex, data32, highStart, 1526 index3NullOffset, dataNullOffset) : 1527 new CodePointTrie.Small32(trieIndex, data32, highStart, 1528 index3NullOffset, dataNullOffset); 1529 } 1530 case BITS_8: { 1531 // Write 8-bit data values. 1532 byte[] data8 = new byte[dataLength]; 1533 for (int i = 0; i < dataLength; ++i) { data8[i] = (byte)data[i]; } 1534 return type == CodePointTrie.Type.FAST ? 1535 new CodePointTrie.Fast8(trieIndex, data8, highStart, 1536 index3NullOffset, dataNullOffset) : 1537 new CodePointTrie.Small8(trieIndex, data8, highStart, 1538 index3NullOffset, dataNullOffset); 1539 } 1540 default: 1541 // Should be unreachable. 1542 throw new IllegalArgumentException(); 1543 } 1544 } 1545 } 1546