1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ****************************************************************************** 6 * 7 * Copyright (C) 2009-2015, International Business Machines 8 * Corporation and others. All Rights Reserved. 9 * 10 ****************************************************************************** 11 */ 12 13 package ohos.global.icu.impl; 14 15 import ohos.global.icu.text.UnicodeSet.SpanCondition; 16 import ohos.global.icu.util.OutputInt; 17 18 /** 19 * Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points. 20 * 21 * Latin-1: Look up bytes. 22 * 2-byte characters: Bits organized vertically. 23 * 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges. 24 * Supplementary characters: Binary search over 25 * the supplementary part of the parent set's inversion list. 26 * @hide exposed on OHOS 27 */ 28 public final class BMPSet { 29 public static int U16_SURROGATE_OFFSET = ((0xd800 << 10) + 0xdc00 - 0x10000); 30 31 /** 32 * One boolean ('true' or 'false') per Latin-1 character. 33 */ 34 private boolean[] latin1Contains; 35 36 /** 37 * One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points 38 * correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6} 39 * trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead) 40 * 41 * Bits for 0..FF are unused (0). 42 */ 43 private int[] table7FF; 44 45 /** 46 * One bit per 64 BMP code points. The bits are organized vertically; consecutive 64-code point blocks 47 * correspond to the same bit position in consecutive table words. With code point parts lead=c{15..12} 48 * t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit 49 * indicates if contains(c) for all code points in the 64-block. If the upper bit is 1, then the block is mixed 50 * and set.contains(c) must be called. 51 * 52 * Bits for 0..7FF are unused (0). 53 */ 54 private int[] bmpBlockBits; 55 56 /** 57 * Inversion list indexes for restricted binary searches in findCodePoint(), from findCodePoint(U+0800, U+1000, 58 * U+2000, .., U+F000, U+10000). U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are 59 * always looked up in the bit tables. The last pair of indexes is for finding supplementary code points. 60 */ 61 private int[] list4kStarts; 62 63 /** 64 * The inversion list of the parent set, for the slower contains() implementation for mixed BMP blocks and for 65 * supplementary code points. The list is terminated with list[listLength-1]=0x110000. 66 */ 67 private final int[] list; 68 private final int listLength; // length used; list may be longer to minimize reallocs 69 BMPSet(final int[] parentList, int parentListLength)70 public BMPSet(final int[] parentList, int parentListLength) { 71 list = parentList; 72 listLength = parentListLength; 73 latin1Contains = new boolean[0x100]; 74 table7FF = new int[64]; 75 bmpBlockBits = new int[64]; 76 list4kStarts = new int[18]; 77 78 /* 79 * Set the list indexes for binary searches for U+0800, U+1000, U+2000, .., U+F000, U+10000. U+0800 is the 80 * first 3-byte-UTF-8 code point. Lower code points are looked up in the bit tables. The last pair of 81 * indexes is for finding supplementary code points. 82 */ 83 list4kStarts[0] = findCodePoint(0x800, 0, listLength - 1); 84 int i; 85 for (i = 1; i <= 0x10; ++i) { 86 list4kStarts[i] = findCodePoint(i << 12, list4kStarts[i - 1], listLength - 1); 87 } 88 list4kStarts[0x11] = listLength - 1; 89 90 initBits(); 91 } 92 BMPSet(final BMPSet otherBMPSet, final int[] newParentList, int newParentListLength)93 public BMPSet(final BMPSet otherBMPSet, final int[] newParentList, int newParentListLength) { 94 list = newParentList; 95 listLength = newParentListLength; 96 latin1Contains = otherBMPSet.latin1Contains.clone(); 97 table7FF = otherBMPSet.table7FF.clone(); 98 bmpBlockBits = otherBMPSet.bmpBlockBits.clone(); 99 list4kStarts = otherBMPSet.list4kStarts.clone(); 100 } 101 contains(int c)102 public boolean contains(int c) { 103 if (c <= 0xff) { 104 return (latin1Contains[c]); 105 } else if (c <= 0x7ff) { 106 return ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0); 107 } else if (c < 0xd800 || (c >= 0xe000 && c <= 0xffff)) { 108 int lead = c >> 12; 109 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; 110 if (twoBits <= 1) { 111 // All 64 code points with the same bits 15..6 112 // are either in the set or not. 113 return (0 != twoBits); 114 } else { 115 // Look up the code point in its 4k block of code points. 116 return containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1]); 117 } 118 } else if (c <= 0x10ffff) { 119 // surrogate or supplementary code point 120 return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]); 121 } else { 122 // Out-of-range code points get false, consistent with long-standing 123 // behavior of UnicodeSet.contains(c). 124 return false; 125 } 126 } 127 128 /** 129 * Span the initial substring for which each character c has spanCondition==contains(c). It must be 130 * spanCondition==0 or 1. 131 * 132 * @param start The start index 133 * @param outCount If not null: Receives the number of code points in the span. 134 * @return the limit (exclusive end) of the span 135 * 136 * NOTE: to reduce the overhead of function call to contains(c), it is manually inlined here. Check for 137 * sufficient length for trail unit for each surrogate pair. Handle single surrogates as surrogate code points 138 * as usual in ICU. 139 */ span(CharSequence s, int start, SpanCondition spanCondition, OutputInt outCount)140 public final int span(CharSequence s, int start, SpanCondition spanCondition, 141 OutputInt outCount) { 142 char c, c2; 143 int i = start; 144 int limit = s.length(); 145 int numSupplementary = 0; 146 if (SpanCondition.NOT_CONTAINED != spanCondition) { 147 // span 148 while (i < limit) { 149 c = s.charAt(i); 150 if (c <= 0xff) { 151 if (!latin1Contains[c]) { 152 break; 153 } 154 } else if (c <= 0x7ff) { 155 if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) { 156 break; 157 } 158 } else if (c < 0xd800 || 159 c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) { 160 int lead = c >> 12; 161 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; 162 if (twoBits <= 1) { 163 // All 64 code points with the same bits 15..6 164 // are either in the set or not. 165 if (twoBits == 0) { 166 break; 167 } 168 } else { 169 // Look up the code point in its 4k block of code points. 170 if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { 171 break; 172 } 173 } 174 } else { 175 // surrogate pair 176 int supplementary = Character.toCodePoint(c, c2); 177 if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { 178 break; 179 } 180 ++numSupplementary; 181 ++i; 182 } 183 ++i; 184 } 185 } else { 186 // span not 187 while (i < limit) { 188 c = s.charAt(i); 189 if (c <= 0xff) { 190 if (latin1Contains[c]) { 191 break; 192 } 193 } else if (c <= 0x7ff) { 194 if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) { 195 break; 196 } 197 } else if (c < 0xd800 || 198 c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) { 199 int lead = c >> 12; 200 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; 201 if (twoBits <= 1) { 202 // All 64 code points with the same bits 15..6 203 // are either in the set or not. 204 if (twoBits != 0) { 205 break; 206 } 207 } else { 208 // Look up the code point in its 4k block of code points. 209 if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { 210 break; 211 } 212 } 213 } else { 214 // surrogate pair 215 int supplementary = Character.toCodePoint(c, c2); 216 if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { 217 break; 218 } 219 ++numSupplementary; 220 ++i; 221 } 222 ++i; 223 } 224 } 225 if (outCount != null) { 226 int spanLength = i - start; 227 outCount.value = spanLength - numSupplementary; // number of code points 228 } 229 return i; 230 } 231 232 /** 233 * Symmetrical with span(). 234 * Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >= 235 * limit and spanCondition==0 or 1. 236 * 237 * @return The string index which starts the span (i.e. inclusive). 238 */ spanBack(CharSequence s, int limit, SpanCondition spanCondition)239 public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) { 240 char c, c2; 241 242 if (SpanCondition.NOT_CONTAINED != spanCondition) { 243 // span 244 for (;;) { 245 c = s.charAt(--limit); 246 if (c <= 0xff) { 247 if (!latin1Contains[c]) { 248 break; 249 } 250 } else if (c <= 0x7ff) { 251 if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) { 252 break; 253 } 254 } else if (c < 0xd800 || 255 c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) { 256 int lead = c >> 12; 257 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; 258 if (twoBits <= 1) { 259 // All 64 code points with the same bits 15..6 260 // are either in the set or not. 261 if (twoBits == 0) { 262 break; 263 } 264 } else { 265 // Look up the code point in its 4k block of code points. 266 if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { 267 break; 268 } 269 } 270 } else { 271 // surrogate pair 272 int supplementary = Character.toCodePoint(c2, c); 273 if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { 274 break; 275 } 276 --limit; 277 } 278 if (0 == limit) { 279 return 0; 280 } 281 } 282 } else { 283 // span not 284 for (;;) { 285 c = s.charAt(--limit); 286 if (c <= 0xff) { 287 if (latin1Contains[c]) { 288 break; 289 } 290 } else if (c <= 0x7ff) { 291 if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) { 292 break; 293 } 294 } else if (c < 0xd800 || 295 c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) { 296 int lead = c >> 12; 297 int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001; 298 if (twoBits <= 1) { 299 // All 64 code points with the same bits 15..6 300 // are either in the set or not. 301 if (twoBits != 0) { 302 break; 303 } 304 } else { 305 // Look up the code point in its 4k block of code points. 306 if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) { 307 break; 308 } 309 } 310 } else { 311 // surrogate pair 312 int supplementary = Character.toCodePoint(c2, c); 313 if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) { 314 break; 315 } 316 --limit; 317 } 318 if (0 == limit) { 319 return 0; 320 } 321 } 322 } 323 return limit + 1; 324 } 325 326 /** 327 * Set bits in a bit rectangle in "vertical" bit organization. start<limit<=0x800 328 */ set32x64Bits(int[] table, int start, int limit)329 private static void set32x64Bits(int[] table, int start, int limit) { 330 assert (64 == table.length); 331 int lead = start >> 6; // Named for UTF-8 2-byte lead byte with upper 5 bits. 332 int trail = start & 0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits. 333 334 // Set one bit indicating an all-one block. 335 int bits = 1 << lead; 336 if ((start + 1) == limit) { // Single-character shortcut. 337 table[trail] |= bits; 338 return; 339 } 340 341 int limitLead = limit >> 6; 342 int limitTrail = limit & 0x3f; 343 344 if (lead == limitLead) { 345 // Partial vertical bit column. 346 while (trail < limitTrail) { 347 table[trail++] |= bits; 348 } 349 } else { 350 // Partial vertical bit column, 351 // followed by a bit rectangle, 352 // followed by another partial vertical bit column. 353 if (trail > 0) { 354 do { 355 table[trail++] |= bits; 356 } while (trail < 64); 357 ++lead; 358 } 359 if (lead < limitLead) { 360 bits = ~((1 << lead) - 1); 361 if (limitLead < 0x20) { 362 bits &= (1 << limitLead) - 1; 363 } 364 for (trail = 0; trail < 64; ++trail) { 365 table[trail] |= bits; 366 } 367 } 368 // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0. 369 // In that case, bits=1<<limitLead == 1<<0 == 1 370 // (because Java << uses only the lower 5 bits of the shift operand) 371 // but the bits value is not used because trail<limitTrail is already false. 372 bits = 1 << limitLead; 373 for (trail = 0; trail < limitTrail; ++trail) { 374 table[trail] |= bits; 375 } 376 } 377 } 378 initBits()379 private void initBits() { 380 int start, limit; 381 int listIndex = 0; 382 383 // Set latin1Contains[]. 384 do { 385 start = list[listIndex++]; 386 if (listIndex < listLength) { 387 limit = list[listIndex++]; 388 } else { 389 limit = 0x110000; 390 } 391 if (start >= 0x100) { 392 break; 393 } 394 do { 395 latin1Contains[start++] = true; 396 } while (start < limit && start < 0x100); 397 } while (limit <= 0x100); 398 399 // Set table7FF[]. 400 while (start < 0x800) { 401 set32x64Bits(table7FF, start, limit <= 0x800 ? limit : 0x800); 402 if (limit > 0x800) { 403 start = 0x800; 404 break; 405 } 406 407 start = list[listIndex++]; 408 if (listIndex < listLength) { 409 limit = list[listIndex++]; 410 } else { 411 limit = 0x110000; 412 } 413 } 414 415 // Set bmpBlockBits[]. 416 int minStart = 0x800; 417 while (start < 0x10000) { 418 if (limit > 0x10000) { 419 limit = 0x10000; 420 } 421 422 if (start < minStart) { 423 start = minStart; 424 } 425 if (start < limit) { // Else: Another range entirely in a known mixed-value block. 426 if (0 != (start & 0x3f)) { 427 // Mixed-value block of 64 code points. 428 start >>= 6; 429 bmpBlockBits[start & 0x3f] |= 0x10001 << (start >> 6); 430 start = (start + 1) << 6; // Round up to the next block boundary. 431 minStart = start; // Ignore further ranges in this block. 432 } 433 if (start < limit) { 434 if (start < (limit & ~0x3f)) { 435 // Multiple all-ones blocks of 64 code points each. 436 set32x64Bits(bmpBlockBits, start >> 6, limit >> 6); 437 } 438 439 if (0 != (limit & 0x3f)) { 440 // Mixed-value block of 64 code points. 441 limit >>= 6; 442 bmpBlockBits[limit & 0x3f] |= 0x10001 << (limit >> 6); 443 limit = (limit + 1) << 6; // Round up to the next block boundary. 444 minStart = limit; // Ignore further ranges in this block. 445 } 446 } 447 } 448 449 if (limit == 0x10000) { 450 break; 451 } 452 453 start = list[listIndex++]; 454 if (listIndex < listLength) { 455 limit = list[listIndex++]; 456 } else { 457 limit = 0x110000; 458 } 459 } 460 } 461 462 463 /** 464 * Same as UnicodeSet.findCodePoint(int c) except that the binary search is restricted for finding code 465 * points in a certain range. 466 * 467 * For restricting the search for finding in the range start..end, pass in lo=findCodePoint(start) and 468 * hi=findCodePoint(end) with 0<=lo<=hi<len. findCodePoint(c) defaults to lo=0 and hi=len-1. 469 * 470 * @param c 471 * a character in a subrange of MIN_VALUE..MAX_VALUE 472 * @param lo 473 * The lowest index to be returned. 474 * @param hi 475 * The highest index to be returned. 476 * @return the smallest integer i in the range lo..hi, inclusive, such that c < list[i] 477 */ findCodePoint(int c, int lo, int hi)478 private int findCodePoint(int c, int lo, int hi) { 479 /* Examples: 480 findCodePoint(c) 481 set list[] c=0 1 3 4 7 8 482 === ============== =========== 483 [] [110000] 0 0 0 0 0 0 484 [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 485 [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 486 [:Any:] [0, 110000] 1 1 1 1 1 1 487 */ 488 489 // Return the smallest i such that c < list[i]. Assume 490 // list[len - 1] == HIGH and that c is legal (0..HIGH-1). 491 if (c < list[lo]) 492 return lo; 493 // High runner test. c is often after the last range, so an 494 // initial check for this condition pays off. 495 if (lo >= hi || c >= list[hi - 1]) 496 return hi; 497 // invariant: c >= list[lo] 498 // invariant: c < list[hi] 499 for (;;) { 500 int i = (lo + hi) >>> 1; 501 if (i == lo) { 502 break; // Found! 503 } else if (c < list[i]) { 504 hi = i; 505 } else { 506 lo = i; 507 } 508 } 509 return hi; 510 } 511 containsSlow(int c, int lo, int hi)512 private final boolean containsSlow(int c, int lo, int hi) { 513 return (0 != (findCodePoint(c, lo, hi) & 1)); 514 } 515 } 516