1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2010-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * Collation.java, ported from collation.h/.cpp 9 * 10 * C++ version created on: 2010oct27 11 * created by: Markus W. Scherer 12 */ 13 14 package com.ibm.icu.impl.coll; 15 16 /** 17 * Collation v2 basic definitions and static helper functions. 18 * 19 * Data structures except for expansion tables store 32-bit CEs which are 20 * either specials (see tags below) or are compact forms of 64-bit CEs. 21 */ 22 public final class Collation { 23 /** UChar32 U_SENTINEL. 24 * TODO: Create a common, public constant? 25 */ 26 public static final int SENTINEL_CP = -1; 27 28 // ICU4C compare() API returns enum UCollationResult values (with UCOL_ prefix). 29 // ICU4J just returns int. We use these constants for ease of porting. 30 public static final int LESS = -1; 31 public static final int EQUAL = 0; 32 public static final int GREATER = 1; 33 34 // Special sort key bytes for all levels. 35 public static final int TERMINATOR_BYTE = 0; 36 public static final int LEVEL_SEPARATOR_BYTE = 1; 37 38 /** The secondary/tertiary lower limit for tailoring before any root elements. */ 39 static final int BEFORE_WEIGHT16 = 0x100; 40 41 /** 42 * Merge-sort-key separator. 43 * Same as the unique primary and identical-level weights of U+FFFE. 44 * Must not be used as primary compression low terminator. 45 * Otherwise usable. 46 */ 47 public static final int MERGE_SEPARATOR_BYTE = 2; 48 public static final long MERGE_SEPARATOR_PRIMARY = 0x02000000; // U+FFFE 49 static final int MERGE_SEPARATOR_CE32 = 0x02000505; // U+FFFE 50 51 /** 52 * Primary compression low terminator, must be greater than MERGE_SEPARATOR_BYTE. 53 * Reserved value in primary second byte if the lead byte is compressible. 54 * Otherwise usable in all CE weight bytes. 55 */ 56 public static final int PRIMARY_COMPRESSION_LOW_BYTE = 3; 57 /** 58 * Primary compression high terminator. 59 * Reserved value in primary second byte if the lead byte is compressible. 60 * Otherwise usable in all CE weight bytes. 61 */ 62 public static final int PRIMARY_COMPRESSION_HIGH_BYTE = 0xff; 63 64 /** Default secondary/tertiary weight lead byte. */ 65 static final int COMMON_BYTE = 5; 66 public static final int COMMON_WEIGHT16 = 0x0500; 67 /** Middle 16 bits of a CE with a common secondary weight. */ 68 static final int COMMON_SECONDARY_CE = 0x05000000; 69 /** Lower 16 bits of a CE with a common tertiary weight. */ 70 static final int COMMON_TERTIARY_CE = 0x0500; 71 /** Lower 32 bits of a CE with common secondary and tertiary weights. */ 72 public static final int COMMON_SEC_AND_TER_CE = 0x05000500; 73 74 static final int SECONDARY_MASK = 0xffff0000; 75 public static final int CASE_MASK = 0xc000; 76 static final int SECONDARY_AND_CASE_MASK = SECONDARY_MASK | CASE_MASK; 77 /** Only the 2*6 bits for the pure tertiary weight. */ 78 public static final int ONLY_TERTIARY_MASK = 0x3f3f; 79 /** Only the secondary & tertiary bits; no case, no quaternary. */ 80 static final int ONLY_SEC_TER_MASK = SECONDARY_MASK | ONLY_TERTIARY_MASK; 81 /** Case bits and tertiary bits. */ 82 static final int CASE_AND_TERTIARY_MASK = CASE_MASK | ONLY_TERTIARY_MASK; 83 public static final int QUATERNARY_MASK = 0xc0; 84 /** Case bits and quaternary bits. */ 85 public static final int CASE_AND_QUATERNARY_MASK = CASE_MASK | QUATERNARY_MASK; 86 87 static final int UNASSIGNED_IMPLICIT_BYTE = 0xfe; // compressible 88 /** 89 * First unassigned: AlphabeticIndex overflow boundary. 90 * We want a 3-byte primary so that it fits into the root elements table. 91 * 92 * This 3-byte primary will not collide with 93 * any unassigned-implicit 4-byte primaries because 94 * the first few hundred Unicode code points all have real mappings. 95 */ 96 static final long FIRST_UNASSIGNED_PRIMARY = 0xfe040200L; 97 98 static final int TRAIL_WEIGHT_BYTE = 0xff; // not compressible 99 static final long FIRST_TRAILING_PRIMARY = 0xff020200L; // [first trailing] 100 public static final long MAX_PRIMARY = 0xffff0000L; // U+FFFF 101 static final int MAX_REGULAR_CE32 = 0xffff0505; // U+FFFF 102 103 // CE32 value for U+FFFD as well as illegal UTF-8 byte sequences (which behave like U+FFFD). 104 // We use the third-highest primary weight for U+FFFD (as in UCA 6.3+). 105 public static final long FFFD_PRIMARY = MAX_PRIMARY - 0x20000; 106 static final int FFFD_CE32 = MAX_REGULAR_CE32 - 0x20000; 107 108 /** 109 * A CE32 is special if its low byte is this or greater. 110 * Impossible case bits 11 mark special CE32s. 111 * This value itself is used to indicate a fallback to the base collator. 112 */ 113 static final int SPECIAL_CE32_LOW_BYTE = 0xc0; 114 static final int FALLBACK_CE32 = SPECIAL_CE32_LOW_BYTE; 115 /** 116 * Low byte of a long-primary special CE32. 117 */ 118 static final int LONG_PRIMARY_CE32_LOW_BYTE = 0xc1; // SPECIAL_CE32_LOW_BYTE | LONG_PRIMARY_TAG 119 120 static final int UNASSIGNED_CE32 = 0xffffffff; // Compute an unassigned-implicit CE. 121 122 static final int NO_CE32 = 1; 123 124 /** No CE: End of input. Only used in runtime code, not stored in data. */ 125 static final long NO_CE_PRIMARY = 1; // not a left-adjusted weight 126 static final int NO_CE_WEIGHT16 = 0x0100; // weight of LEVEL_SEPARATOR_BYTE 127 public static final long NO_CE = 0x101000100L; // NO_CE_PRIMARY, NO_CE_WEIGHT16, NO_CE_WEIGHT16 128 129 /** Sort key levels. */ 130 131 /** Unspecified level. */ 132 public static final int NO_LEVEL = 0; 133 public static final int PRIMARY_LEVEL = 1; 134 public static final int SECONDARY_LEVEL = 2; 135 public static final int CASE_LEVEL = 3; 136 public static final int TERTIARY_LEVEL = 4; 137 public static final int QUATERNARY_LEVEL = 5; 138 public static final int IDENTICAL_LEVEL = 6; 139 /** Beyond sort key bytes. */ 140 public static final int ZERO_LEVEL = 7; 141 142 /** 143 * Sort key level flags: xx_FLAG = 1 << xx_LEVEL. 144 * In Java, use enum Level with flag() getters, or use EnumSet rather than hand-made bit sets. 145 */ 146 static final int NO_LEVEL_FLAG = 1; 147 static final int PRIMARY_LEVEL_FLAG = 2; 148 static final int SECONDARY_LEVEL_FLAG = 4; 149 static final int CASE_LEVEL_FLAG = 8; 150 static final int TERTIARY_LEVEL_FLAG = 0x10; 151 static final int QUATERNARY_LEVEL_FLAG = 0x20; 152 static final int IDENTICAL_LEVEL_FLAG = 0x40; 153 static final int ZERO_LEVEL_FLAG = 0x80; 154 155 /** 156 * Special-CE32 tags, from bits 3..0 of a special 32-bit CE. 157 * Bits 31..8 are available for tag-specific data. 158 * Bits 5..4: Reserved. May be used in the future to indicate lccc!=0 and tccc!=0. 159 */ 160 161 /** 162 * Fall back to the base collator. 163 * This is the tag value in SPECIAL_CE32_LOW_BYTE and FALLBACK_CE32. 164 * Bits 31..8: Unused, 0. 165 */ 166 static final int FALLBACK_TAG = 0; 167 /** 168 * Long-primary CE with COMMON_SEC_AND_TER_CE. 169 * Bits 31..8: Three-byte primary. 170 */ 171 static final int LONG_PRIMARY_TAG = 1; 172 /** 173 * Long-secondary CE with zero primary. 174 * Bits 31..16: Secondary weight. 175 * Bits 15.. 8: Tertiary weight. 176 */ 177 static final int LONG_SECONDARY_TAG = 2; 178 /** 179 * Unused. 180 * May be used in the future for single-byte secondary CEs (SHORT_SECONDARY_TAG), 181 * storing the secondary in bits 31..24, the ccc in bits 23..16, 182 * and the tertiary in bits 15..8. 183 */ 184 static final int RESERVED_TAG_3 = 3; 185 /** 186 * Latin mini expansions of two simple CEs [pp, 05, tt] [00, ss, 05]. 187 * Bits 31..24: Single-byte primary weight pp of the first CE. 188 * Bits 23..16: Tertiary weight tt of the first CE. 189 * Bits 15.. 8: Secondary weight ss of the second CE. 190 */ 191 static final int LATIN_EXPANSION_TAG = 4; 192 /** 193 * Points to one or more simple/long-primary/long-secondary 32-bit CE32s. 194 * Bits 31..13: Index into int table. 195 * Bits 12.. 8: Length=1..31. 196 */ 197 static final int EXPANSION32_TAG = 5; 198 /** 199 * Points to one or more 64-bit CEs. 200 * Bits 31..13: Index into CE table. 201 * Bits 12.. 8: Length=1..31. 202 */ 203 static final int EXPANSION_TAG = 6; 204 /** 205 * Builder data, used only in the CollationDataBuilder, not in runtime data. 206 * 207 * If bit 8 is 0: Builder context, points to a list of context-sensitive mappings. 208 * Bits 31..13: Index to the builder's list of ConditionalCE32 for this character. 209 * Bits 12.. 9: Unused, 0. 210 * 211 * If bit 8 is 1 (IS_BUILDER_JAMO_CE32): Builder-only jamoCE32 value. 212 * The builder fetches the Jamo CE32 from the trie. 213 * Bits 31..13: Jamo code point. 214 * Bits 12.. 9: Unused, 0. 215 */ 216 static final int BUILDER_DATA_TAG = 7; 217 /** 218 * Points to prefix trie. 219 * Bits 31..13: Index into prefix/contraction data. 220 * Bits 12.. 8: Unused, 0. 221 */ 222 static final int PREFIX_TAG = 8; 223 /** 224 * Points to contraction data. 225 * Bits 31..13: Index into prefix/contraction data. 226 * Bits 12..11: Unused, 0. 227 * Bit 10: CONTRACT_TRAILING_CCC flag. 228 * Bit 9: CONTRACT_NEXT_CCC flag. 229 * Bit 8: CONTRACT_SINGLE_CP_NO_MATCH flag. 230 */ 231 static final int CONTRACTION_TAG = 9; 232 /** 233 * Decimal digit. 234 * Bits 31..13: Index into int table for non-numeric-collation CE32. 235 * Bit 12: Unused, 0. 236 * Bits 11.. 8: Digit value 0..9. 237 */ 238 static final int DIGIT_TAG = 10; 239 /** 240 * Tag for U+0000, for moving the NUL-termination handling 241 * from the regular fastpath into specials-handling code. 242 * Bits 31..8: Unused, 0. 243 */ 244 static final int U0000_TAG = 11; 245 /** 246 * Tag for a Hangul syllable. 247 * Bits 31..9: Unused, 0. 248 * Bit 8: HANGUL_NO_SPECIAL_JAMO flag. 249 */ 250 static final int HANGUL_TAG = 12; 251 /** 252 * Tag for a lead surrogate code unit. 253 * Optional optimization for UTF-16 string processing. 254 * Bits 31..10: Unused, 0. 255 * 9.. 8: =0: All associated supplementary code points are unassigned-implict. 256 * =1: All associated supplementary code points fall back to the base data. 257 * else: (Normally 2) Look up the data for the supplementary code point. 258 */ 259 static final int LEAD_SURROGATE_TAG = 13; 260 /** 261 * Tag for CEs with primary weights in code point order. 262 * Bits 31..13: Index into CE table, for one data "CE". 263 * Bits 12.. 8: Unused, 0. 264 * 265 * This data "CE" has the following bit fields: 266 * Bits 63..32: Three-byte primary pppppp00. 267 * 31.. 8: Start/base code point of the in-order range. 268 * 7: Flag isCompressible primary. 269 * 6.. 0: Per-code point primary-weight increment. 270 */ 271 static final int OFFSET_TAG = 14; 272 /** 273 * Implicit CE tag. Compute an unassigned-implicit CE. 274 * All bits are set (UNASSIGNED_CE32=0xffffffff). 275 */ 276 static final int IMPLICIT_TAG = 15; 277 isAssignedCE32(int ce32)278 static boolean isAssignedCE32(int ce32) { 279 return ce32 != FALLBACK_CE32 && ce32 != UNASSIGNED_CE32; 280 } 281 282 /** 283 * We limit the number of CEs in an expansion 284 * so that we can use a small number of length bits in the data structure, 285 * and so that an implementation can copy CEs at runtime without growing a destination buffer. 286 */ 287 static final int MAX_EXPANSION_LENGTH = 31; 288 static final int MAX_INDEX = 0x7ffff; 289 290 /** 291 * Set if there is no match for the single (no-suffix) character itself. 292 * This is only possible if there is a prefix. 293 * In this case, discontiguous contraction matching cannot add combining marks 294 * starting from an empty suffix. 295 * The default CE32 is used anyway if there is no suffix match. 296 */ 297 static final int CONTRACT_SINGLE_CP_NO_MATCH = 0x100; 298 /** Set if the first character of every contraction suffix has lccc!=0. */ 299 static final int CONTRACT_NEXT_CCC = 0x200; 300 /** Set if any contraction suffix ends with lccc!=0. */ 301 static final int CONTRACT_TRAILING_CCC = 0x400; 302 303 /** For HANGUL_TAG: None of its Jamo CE32s isSpecialCE32(). */ 304 static final int HANGUL_NO_SPECIAL_JAMO = 0x100; 305 306 static final int LEAD_ALL_UNASSIGNED = 0; 307 static final int LEAD_ALL_FALLBACK = 0x100; 308 static final int LEAD_MIXED = 0x200; 309 static final int LEAD_TYPE_MASK = 0x300; 310 makeLongPrimaryCE32(long p)311 static int makeLongPrimaryCE32(long p) { return (int)(p | LONG_PRIMARY_CE32_LOW_BYTE); } 312 313 /** Turns the long-primary CE32 into a primary weight pppppp00. */ primaryFromLongPrimaryCE32(int ce32)314 static long primaryFromLongPrimaryCE32(int ce32) { 315 return (long)ce32 & 0xffffff00L; 316 } ceFromLongPrimaryCE32(int ce32)317 static long ceFromLongPrimaryCE32(int ce32) { 318 return ((long)(ce32 & 0xffffff00) << 32) | COMMON_SEC_AND_TER_CE; 319 } 320 makeLongSecondaryCE32(int lower32)321 static int makeLongSecondaryCE32(int lower32) { 322 return lower32 | SPECIAL_CE32_LOW_BYTE | LONG_SECONDARY_TAG; 323 } ceFromLongSecondaryCE32(int ce32)324 static long ceFromLongSecondaryCE32(int ce32) { 325 return (long)ce32 & 0xffffff00L; 326 } 327 328 /** Makes a special CE32 with tag, index and length. */ makeCE32FromTagIndexAndLength(int tag, int index, int length)329 static int makeCE32FromTagIndexAndLength(int tag, int index, int length) { 330 return (index << 13) | (length << 8) | SPECIAL_CE32_LOW_BYTE | tag; 331 } 332 /** Makes a special CE32 with only tag and index. */ makeCE32FromTagAndIndex(int tag, int index)333 static int makeCE32FromTagAndIndex(int tag, int index) { 334 return (index << 13) | SPECIAL_CE32_LOW_BYTE | tag; 335 } 336 isSpecialCE32(int ce32)337 static boolean isSpecialCE32(int ce32) { 338 return (ce32 & 0xff) >= SPECIAL_CE32_LOW_BYTE; 339 } 340 tagFromCE32(int ce32)341 static int tagFromCE32(int ce32) { 342 return ce32 & 0xf; 343 } 344 hasCE32Tag(int ce32, int tag)345 static boolean hasCE32Tag(int ce32, int tag) { 346 return isSpecialCE32(ce32) && tagFromCE32(ce32) == tag; 347 } 348 isLongPrimaryCE32(int ce32)349 static boolean isLongPrimaryCE32(int ce32) { 350 return hasCE32Tag(ce32, LONG_PRIMARY_TAG); 351 } 352 isSimpleOrLongCE32(int ce32)353 static boolean isSimpleOrLongCE32(int ce32) { 354 return !isSpecialCE32(ce32) || 355 tagFromCE32(ce32) == LONG_PRIMARY_TAG || 356 tagFromCE32(ce32) == LONG_SECONDARY_TAG; 357 } 358 359 /** 360 * @return true if the ce32 yields one or more CEs without further data lookups 361 */ isSelfContainedCE32(int ce32)362 static boolean isSelfContainedCE32(int ce32) { 363 return !isSpecialCE32(ce32) || 364 tagFromCE32(ce32) == LONG_PRIMARY_TAG || 365 tagFromCE32(ce32) == LONG_SECONDARY_TAG || 366 tagFromCE32(ce32) == LATIN_EXPANSION_TAG; 367 } 368 isPrefixCE32(int ce32)369 static boolean isPrefixCE32(int ce32) { 370 return hasCE32Tag(ce32, PREFIX_TAG); 371 } 372 isContractionCE32(int ce32)373 static boolean isContractionCE32(int ce32) { 374 return hasCE32Tag(ce32, CONTRACTION_TAG); 375 } 376 ce32HasContext(int ce32)377 static boolean ce32HasContext(int ce32) { 378 return isSpecialCE32(ce32) && 379 (tagFromCE32(ce32) == PREFIX_TAG || 380 tagFromCE32(ce32) == CONTRACTION_TAG); 381 } 382 383 /** 384 * Get the first of the two Latin-expansion CEs encoded in ce32. 385 * @see LATIN_EXPANSION_TAG 386 */ latinCE0FromCE32(int ce32)387 static long latinCE0FromCE32(int ce32) { 388 return ((long)(ce32 & 0xff000000) << 32) | COMMON_SECONDARY_CE | ((ce32 & 0xff0000) >> 8); 389 } 390 391 /** 392 * Get the second of the two Latin-expansion CEs encoded in ce32. 393 * @see LATIN_EXPANSION_TAG 394 */ latinCE1FromCE32(int ce32)395 static long latinCE1FromCE32(int ce32) { 396 return (((long)ce32 & 0xff00) << 16) | COMMON_TERTIARY_CE; 397 } 398 399 /** 400 * Returns the data index from a special CE32. 401 */ indexFromCE32(int ce32)402 static int indexFromCE32(int ce32) { 403 return ce32 >>> 13; 404 } 405 406 /** 407 * Returns the data length from a ce32. 408 */ lengthFromCE32(int ce32)409 static int lengthFromCE32(int ce32) { 410 return (ce32 >> 8) & 31; 411 } 412 413 /** 414 * Returns the digit value from a DIGIT_TAG ce32. 415 */ digitFromCE32(int ce32)416 static char digitFromCE32(int ce32) { 417 return (char)((ce32 >> 8) & 0xf); 418 } 419 420 /** Returns a 64-bit CE from a simple CE32 (not special). */ ceFromSimpleCE32(int ce32)421 static long ceFromSimpleCE32(int ce32) { 422 // normal form ppppsstt -> pppp0000ss00tt00 423 assert (ce32 & 0xff) < SPECIAL_CE32_LOW_BYTE; 424 return ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | ((ce32 & 0xff) << 8); 425 } 426 427 /** Returns a 64-bit CE from a simple/long-primary/long-secondary CE32. */ 428 static long ceFromCE32(int ce32) { 429 int tertiary = ce32 & 0xff; 430 if(tertiary < SPECIAL_CE32_LOW_BYTE) { 431 // normal form ppppsstt -> pppp0000ss00tt00 432 return ((long)(ce32 & 0xffff0000) << 32) | ((long)(ce32 & 0xff00) << 16) | (tertiary << 8); 433 } else { 434 ce32 -= tertiary; 435 if((tertiary & 0xf) == LONG_PRIMARY_TAG) { 436 // long-primary form ppppppC1 -> pppppp00050000500 437 return ((long)ce32 << 32) | COMMON_SEC_AND_TER_CE; 438 } else { 439 // long-secondary form ssssttC2 -> 00000000sssstt00 440 assert (tertiary & 0xf) == LONG_SECONDARY_TAG; 441 return ce32 & 0xffffffffL; 442 } 443 } 444 } 445 446 /** Creates a CE from a primary weight. */ 447 public static long makeCE(long p) { 448 return (p << 32) | COMMON_SEC_AND_TER_CE; 449 } 450 /** 451 * Creates a CE from a primary weight, 452 * 16-bit secondary/tertiary weights, and a 2-bit quaternary. 453 */ 454 static long makeCE(long p, int s, int t, int q) { 455 return (p << 32) | ((long)s << 16) | t | (q << 6); 456 } 457 458 /** 459 * Increments a 2-byte primary by a code point offset. 460 */ 461 public static long incTwoBytePrimaryByOffset(long basePrimary, boolean isCompressible, 462 int offset) { 463 // Extract the second byte, minus the minimum byte value, 464 // plus the offset, modulo the number of usable byte values, plus the minimum. 465 // Reserve the PRIMARY_COMPRESSION_LOW_BYTE and high byte if necessary. 466 long primary; 467 if(isCompressible) { 468 offset += ((int)(basePrimary >> 16) & 0xff) - 4; 469 primary = ((offset % 251) + 4) << 16; 470 offset /= 251; 471 } else { 472 offset += ((int)(basePrimary >> 16) & 0xff) - 2; 473 primary = ((offset % 254) + 2) << 16; 474 offset /= 254; 475 } 476 // First byte, assume no further overflow. 477 return primary | ((basePrimary & 0xff000000L) + ((long)offset << 24)); 478 } 479 480 /** 481 * Increments a 3-byte primary by a code point offset. 482 */ 483 public static long incThreeBytePrimaryByOffset(long basePrimary, boolean isCompressible, 484 int offset) { 485 // Extract the third byte, minus the minimum byte value, 486 // plus the offset, modulo the number of usable byte values, plus the minimum. 487 offset += ((int)(basePrimary >> 8) & 0xff) - 2; 488 long primary = ((offset % 254) + 2) << 8; 489 offset /= 254; 490 // Same with the second byte, 491 // but reserve the PRIMARY_COMPRESSION_LOW_BYTE and high byte if necessary. 492 if(isCompressible) { 493 offset += ((int)(basePrimary >> 16) & 0xff) - 4; 494 primary |= ((offset % 251) + 4) << 16; 495 offset /= 251; 496 } else { 497 offset += ((int)(basePrimary >> 16) & 0xff) - 2; 498 primary |= ((offset % 254) + 2) << 16; 499 offset /= 254; 500 } 501 // First byte, assume no further overflow. 502 return primary | ((basePrimary & 0xff000000L) + ((long)offset << 24)); 503 } 504 505 /** 506 * Decrements a 2-byte primary by one range step (1..0x7f). 507 */ 508 static long decTwoBytePrimaryByOneStep(long basePrimary, boolean isCompressible, int step) { 509 // Extract the second byte, minus the minimum byte value, 510 // minus the step, modulo the number of usable byte values, plus the minimum. 511 // Reserve the PRIMARY_COMPRESSION_LOW_BYTE and high byte if necessary. 512 // Assume no further underflow for the first byte. 513 assert(0 < step && step <= 0x7f); 514 int byte2 = ((int)(basePrimary >> 16) & 0xff) - step; 515 if(isCompressible) { 516 if(byte2 < 4) { 517 byte2 += 251; 518 basePrimary -= 0x1000000; 519 } 520 } else { 521 if(byte2 < 2) { 522 byte2 += 254; 523 basePrimary -= 0x1000000; 524 } 525 } 526 return (basePrimary & 0xff000000L) | (byte2 << 16); 527 } 528 529 /** 530 * Decrements a 3-byte primary by one range step (1..0x7f). 531 */ 532 static long decThreeBytePrimaryByOneStep(long basePrimary, boolean isCompressible, int step) { 533 // Extract the third byte, minus the minimum byte value, 534 // minus the step, modulo the number of usable byte values, plus the minimum. 535 assert(0 < step && step <= 0x7f); 536 int byte3 = ((int)(basePrimary >> 8) & 0xff) - step; 537 if(byte3 >= 2) { 538 return (basePrimary & 0xffff0000L) | (byte3 << 8); 539 } 540 byte3 += 254; 541 // Same with the second byte, 542 // but reserve the PRIMARY_COMPRESSION_LOW_BYTE and high byte if necessary. 543 int byte2 = ((int)(basePrimary >> 16) & 0xff) - 1; 544 if(isCompressible) { 545 if(byte2 < 4) { 546 byte2 = 0xfe; 547 basePrimary -= 0x1000000; 548 } 549 } else { 550 if(byte2 < 2) { 551 byte2 = 0xff; 552 basePrimary -= 0x1000000; 553 } 554 } 555 // First byte, assume no further underflow. 556 return (basePrimary & 0xff000000L) | (byte2 << 16) | (byte3 << 8); 557 } 558 559 /** 560 * Computes a 3-byte primary for c's OFFSET_TAG data "CE". 561 */ 562 static long getThreeBytePrimaryForOffsetData(int c, long dataCE) { 563 long p = dataCE >>> 32; // three-byte primary pppppp00 564 int lower32 = (int)dataCE; // base code point b & step s: bbbbbbss (bit 7: isCompressible) 565 int offset = (c - (lower32 >> 8)) * (lower32 & 0x7f); // delta * increment 566 boolean isCompressible = (lower32 & 0x80) != 0; 567 return Collation.incThreeBytePrimaryByOffset(p, isCompressible, offset); 568 } 569 570 /** 571 * Returns the unassigned-character implicit primary weight for any valid code point c. 572 */ 573 static long unassignedPrimaryFromCodePoint(int c) { 574 // Create a gap before U+0000. Use c=-1 for [first unassigned]. 575 ++c; 576 // Fourth byte: 18 values, every 14th byte value (gap of 13). 577 long primary = 2 + (c % 18) * 14; 578 c /= 18; 579 // Third byte: 254 values. 580 primary |= (2 + (c % 254)) << 8; 581 c /= 254; 582 // Second byte: 251 values 04..FE excluding the primary compression bytes. 583 primary |= (4 + (c % 251)) << 16; 584 // One lead byte covers all code points (c < 0x1182B4 = 1*251*254*18). 585 return primary | ((long)UNASSIGNED_IMPLICIT_BYTE << 24); 586 } 587 588 static long unassignedCEFromCodePoint(int c) { 589 return makeCE(unassignedPrimaryFromCodePoint(c)); 590 } 591 592 // private Collation() // No instantiation. 593 } 594