// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 *******************************************************************************
 * Copyright (C) 2013-2015, International Business Machines
 * Corporation and others. All Rights Reserved.
 *******************************************************************************
 * CollationSettings.java, ported from collationsettings.h/.cpp
 *
 * C++ version created on: 2013feb07
 * created by: Markus W. Scherer
 */

package com.ibm.icu.impl.coll;

import java.util.Arrays;

import com.ibm.icu.text.Collator;

/**
 * Collation settings/options/attributes.
 * These are the values that can be changed via API.
 *
 * <p>Most simple attributes are packed into the {@link #options} bit set;
 * the constants below name the individual bits and bit fields.
 */
public final class CollationSettings extends SharedObject {
    /**
     * Options bit 0: Perform the FCD check on the input text and deliver normalized text.
     */
    public static final int CHECK_FCD = 1 << 0;
    /**
     * Options bit 1: Numeric collation.
     * Also known as CODAN = COllate Digits As Numbers.
     *
     * <p>Treat digit sequences as numbers with CE sequences in numeric order,
     * rather than returning a normal CE for each digit.
     */
    public static final int NUMERIC = 1 << 1;
    /**
     * "Shifted" alternate handling, see {@link #ALTERNATE_MASK}.
     */
    static final int SHIFTED = 1 << 2;
    /**
     * Options bits 3..2: Alternate-handling mask. 0 for non-ignorable.
     * Reserve values 8 and 0xc for shift-trimmed and blanked.
     */
    static final int ALTERNATE_MASK = 3 << 2;
    /**
     * Options bits 6..4: The 3-bit maxVariable value bit field is shifted by this value.
     */
    static final int MAX_VARIABLE_SHIFT = 4;
    /** maxVariable options bit mask before shifting. */
    static final int MAX_VARIABLE_MASK = 7 << MAX_VARIABLE_SHIFT;
    // Options bit 7: Reserved/unused/0.
    /**
     * Options bit 8: Sort uppercase first if caseLevel or caseFirst is on.
     */
    static final int UPPER_FIRST = 1 << 8;
    /**
     * Options bit 9: Keep the case bits in the tertiary weight (they trump other tertiary values)
     * unless case level is on (when they are *moved* into the separate case level).
     * By default, the case bits are removed from the tertiary weight (ignored).
     *
     * <p>When CASE_FIRST is off, UPPER_FIRST must be off too, corresponding to
     * the tri-value UCOL_CASE_FIRST attribute: UCOL_OFF vs. UCOL_LOWER_FIRST vs. UCOL_UPPER_FIRST.
     */
    public static final int CASE_FIRST = 1 << 9;
    /**
     * Options bit mask for caseFirst and upperFirst, before shifting.
     * Same value as caseFirst==upperFirst.
     */
    public static final int CASE_FIRST_AND_UPPER_MASK = CASE_FIRST | UPPER_FIRST;
    /**
     * Options bit 10: Insert the case level between the secondary and tertiary levels.
     */
    public static final int CASE_LEVEL = 1 << 10;
    /**
     * Options bit 11: Compare secondary weights backwards. ("French secondary")
     */
    public static final int BACKWARD_SECONDARY = 1 << 11;
    /**
     * Options bits 15..12: The 4-bit strength value bit field is shifted by this value.
     * It is the top used bit field in the options. (No need to mask after shifting.)
     */
    static final int STRENGTH_SHIFT = 12;
    /** Strength options bit mask before shifting. */
    static final int STRENGTH_MASK = 0xf << STRENGTH_SHIFT;

    /** maxVariable values */
    static final int MAX_VAR_SPACE = 0;
    static final int MAX_VAR_PUNCT = 1;
    static final int MAX_VAR_SYMBOL = 2;
    static final int MAX_VAR_CURRENCY = 3;

    /** Creates settings with the field initializers declared at the end of this class. */
    CollationSettings() {}

    /**
     * Returns a copy of these settings with its own fastLatinPrimaries array.
     *
     * <p>The reorderTable, reorderRanges, and reorderCodes are shared with the copy:
     * in Java they only ever get replaced, never modified in place.
     */
    @Override
    public CollationSettings clone() {
        CollationSettings copy = (CollationSettings) super.clone();
        copy.fastLatinPrimaries = fastLatinPrimaries.clone();
        return copy;
    }

    /**
     * Compares the options, the variableTop (only when alternate handling is not
     * non-ignorable) and the reorder codes of two settings objects of the same class.
     */
    @Override
    public boolean equals(Object other) {
        if (other == null || !this.getClass().equals(other.getClass())) {
            return false;
        }
        CollationSettings that = (CollationSettings) other;
        if (options != that.options) {
            return false;
        }
        // variableTop only matters when some alternate-handling bit is set.
        boolean variableTopMatches =
                (options & ALTERNATE_MASK) == 0 || variableTop == that.variableTop;
        return variableTopMatches && Arrays.equals(reorderCodes, that.reorderCodes);
    }

    /**
     * Hashes the same pieces of state that {@link #equals(Object)} compares.
     */
    @Override
    public int hashCode() {
        int hash = options << 8;
        if ((options & ALTERNATE_MASK) != 0) {
            hash ^= variableTop;
        }
        int[] codes = reorderCodes;
        hash ^= codes.length;
        for (int index = 0; index < codes.length; ++index) {
            hash ^= (codes[index] << index);
        }
        return hash;
    }

    /**
     * Turns off reordering.
     * This sets a null permutation rather than a no-op permutation.
     */
    public void resetReordering() {
        reorderCodes = EMPTY_INT_ARRAY;
        reorderRanges = null;
        minHighNoReorder = 0;
        reorderTable = null;
    }

    /**
     * Adopts pre-built reordering data by reference if it looks usable,
     * otherwise regenerates it from the codes via {@link #setReordering}.
     *
     * @param data used only when the reordering data must be regenerated
     * @param codesAndRanges the reorder codes (first codesLength entries)
     *        followed by the (limit, offset) range pairs
     * @param codesLength number of reorder codes at the start of codesAndRanges
     * @param table lead byte permutation table, or null if it is missing
     */
    void aliasReordering(CollationData data, int[] codesAndRanges, int codesLength, byte[] table) {
        int[] codes = (codesLength == codesAndRanges.length)
                ? codesAndRanges
                : Arrays.copyOf(codesAndRanges, codesLength);
        int rangesStart = codesLength;
        int rangesLimit = codesAndRanges.length;
        int rangesLength = rangesLimit - rangesStart;

        boolean usable;
        if (table == null) {
            usable = false;
        } else if (rangesLength == 0) {
            // Without ranges, the table alone must be able to do the whole job.
            usable = !reorderTableHasSplitBytes(table);
        } else {
            // The first offset must be 0. The last offset must not be 0.
            usable = rangesLength >= 2
                    && (codesAndRanges[rangesStart] & 0xffff) == 0
                    && (codesAndRanges[rangesLimit - 1] & 0xffff) != 0;
        }
        if (!usable) {
            // Regenerate missing data.
            setReordering(data, codes);
            return;
        }

        reorderTable = table;
        reorderCodes = codes;
        // Drop ranges before the first split byte. They are reordered by the table.
        // This then speeds up reordering of the remaining ranges.
        int firstSplit = rangesStart;
        for (; firstSplit < rangesLimit; ++firstSplit) {
            if ((codesAndRanges[firstSplit] & 0xff0000) != 0) {
                // The second byte of the primary limit is not 0: a split lead byte.
                break;
            }
        }
        if (firstSplit != rangesLimit) {
            assert table[codesAndRanges[firstSplit] >>> 24] == 0;
            minHighNoReorder = codesAndRanges[rangesLimit - 1] & 0xffff0000L;
            setReorderRanges(codesAndRanges, firstSplit, rangesLimit - firstSplit);
        } else {
            assert !reorderTableHasSplitBytes(table);
            minHighNoReorder = 0;
            reorderRanges = null;
        }
    }

    /**
     * Builds and installs the reordering data for the given reorder codes.
     * Reordering is reset if codes is empty, consists only of
     * {@code Collator.ReorderCodes.NONE}, or yields no ranges.
     *
     * @param data supplies the reorder ranges via makeReorderRanges()
     * @param codes the reorder codes; the array reference is stored, not copied
     */
    public void setReordering(CollationData data, int[] codes) {
        boolean nothingToReorder = codes.length == 0
                || (codes.length == 1 && codes[0] == Collator.ReorderCodes.NONE);
        if (nothingToReorder) {
            resetReordering();
            return;
        }
        UVector32 rangesList = new UVector32();
        data.makeReorderRanges(codes, rangesList);
        int rangesLength = rangesList.size();
        if (rangesLength == 0) {
            resetReordering();
            return;
        }
        int[] ranges = rangesList.getBuffer();
        // ranges[] contains at least two (limit, offset) pairs.
        // The first offset must be 0. The last offset must not be 0.
        // Separators (at the low end) and trailing weights (at the high end)
        // are never reordered.
        assert rangesLength >= 2;
        assert (ranges[0] & 0xffff) == 0 && (ranges[rangesLength - 1] & 0xffff) != 0;
        minHighNoReorder = ranges[rangesLength - 1] & 0xffff0000L;

        // Write the lead byte permutation table.
        // Set a 0 for each lead byte that has a range boundary in the middle.
        byte[] table = new byte[256];
        int leadByte = 0;
        int firstSplit = -1;
        for (int rangeIndex = 0; rangeIndex < rangesLength; ++rangeIndex) {
            int pair = ranges[rangeIndex];
            int limitLead = pair >>> 24;
            for (; leadByte < limitLead; ++leadByte) {
                table[leadByte] = (byte) (leadByte + pair);
            }
            // Check the second byte of the limit.
            if ((pair & 0xff0000) != 0) {
                table[limitLead] = 0;
                leadByte = limitLead + 1;
                if (firstSplit < 0) {
                    firstSplit = rangeIndex;
                }
            }
        }
        // Lead bytes above the last range map to themselves.
        for (; leadByte <= 0xff; ++leadByte) {
            table[leadByte] = (byte) leadByte;
        }

        if (firstSplit < 0) {
            // The lead byte permutation table alone suffices for reordering.
            setReorderArrays(codes, ranges, 0, 0, table);
        } else {
            // Remove the ranges below the first split byte.
            setReorderArrays(codes, ranges, firstSplit, rangesLength - firstSplit, table);
        }
    }

    /**
     * Stores the table and codes by reference and converts the selected ranges.
     * Very different from C++. See the comments after the reorderCodes declaration.
     */
    private void setReorderArrays(int[] codes,
            int[] ranges, int rangesStart, int rangesLength, byte[] table) {
        int[] storedCodes = (codes != null) ? codes : EMPTY_INT_ARRAY;
        assert (storedCodes.length == 0) == (table == null);
        reorderTable = table;
        reorderCodes = storedCodes;
        setReorderRanges(ranges, rangesStart, rangesLength);
    }

    /**
     * Replaces reorderRanges with the rangesLength entries of ranges starting at
     * rangesStart, each widened to an unsigned 32-bit value; null if rangesLength is 0.
     */
    private void setReorderRanges(int[] ranges, int rangesStart, int rangesLength) {
        if (rangesLength == 0) {
            reorderRanges = null;
            return;
        }
        reorderRanges = new long[rangesLength];
        for (int offset = 0; offset < rangesLength; ++offset) {
            reorderRanges[offset] = ranges[rangesStart + offset] & 0xffffffffL;
        }
    }

    /**
     * Shares the reordering data of another settings object,
     * or resets reordering if the other object has none.
     */
    public void copyReorderingFrom(CollationSettings other) {
        if (other.hasReordering()) {
            minHighNoReorder = other.minHighNoReorder;
            reorderTable = other.reorderTable;
            reorderRanges = other.reorderRanges;
            reorderCodes = other.reorderCodes;
        } else {
            resetReordering();
        }
    }

    /** Returns true if a lead byte permutation table is set. */
    public boolean hasReordering() {
        return reorderTable != null;
    }

    /** Returns true if any table entry at index 1..255 is 0, which marks a split lead byte. */
    private static boolean reorderTableHasSplitBytes(byte[] table) {
        assert table[0] == 0;
        int index = 1;
        while (index < 256) {
            if (table[index] == 0) {
                return true;
            }
            ++index;
        }
        return false;
    }

    /**
     * Reorders a primary weight.
     * If the table entry for the lead byte is non-zero, or p is at most
     * {@code Collation.NO_CE_PRIMARY}, the lead byte is replaced with the table entry;
     * otherwise the offset is looked up in the reorderRanges.
     */
    public long reorder(long p) {
        byte mapped = reorderTable[(int)p >>> 24];
        if (mapped == 0 && p > Collation.NO_CE_PRIMARY) {
            return reorderEx(p);
        }
        return ((mapped & 0xffL) << 24) | (p & 0xffffff);
    }

    /**
     * Reorders a primary weight whose lead byte is split, using the reorderRanges.
     * Primaries at or above minHighNoReorder are returned unchanged.
     */
    private long reorderEx(long p) {
        assert minHighNoReorder > 0;
        if (p >= minHighNoReorder) {
            return p;
        }
        // Round up p so that its lower 16 bits are >= any offset bits.
        // Then compare q directly with (limit, offset) pairs.
        long q = p | 0xffff;
        int index = 0;
        long range = reorderRanges[index];
        while (q >= range) {
            range = reorderRanges[++index];
        }
        // The lower 16 bits are the signed lead byte offset.
        return p + ((long)(short)range << 24);
    }

    // In C++, we use enums for attributes and their values, with a special value for the default.
    // Combined getter/setter methods handle many attributes.
    // In Java, we have specific methods for getting, setting, and set-to-default,
    // except that this class uses bits in its own bit set for simple values.

    /**
     * Stores the strength in the options.
     *
     * @param value one of the Collator strength constants PRIMARY, SECONDARY,
     *        TERTIARY, QUATERNARY, IDENTICAL
     * @throws IllegalArgumentException for any other value
     */
    public void setStrength(int value) {
        switch (value) {
        case Collator.PRIMARY:
        case Collator.SECONDARY:
        case Collator.TERTIARY:
        case Collator.QUATERNARY:
        case Collator.IDENTICAL:
            break;
        default:
            throw new IllegalArgumentException("illegal strength value " + value);
        }
        options = (options & ~STRENGTH_MASK) | (value << STRENGTH_SHIFT);
    }

    /** Copies the strength bit field from defaultOptions into the options. */
    public void setStrengthDefault(int defaultOptions) {
        int defaultStrengthBits = defaultOptions & STRENGTH_MASK;
        options = (options & ~STRENGTH_MASK) | defaultStrengthBits;
    }

    /** Extracts the strength from an options value. */
    static int getStrength(int options) {
        return options >> STRENGTH_SHIFT;
    }

    /** Returns the strength stored in the options. */
    public int getStrength() {
        return getStrength(options);
    }

    /** Sets the options bit for an on/off attribute. */
    public void setFlag(int bit, boolean value) {
        options = value ? (options | bit) : (options & ~bit);
    }

    /** Copies the given options bit from defaultOptions into the options. */
    public void setFlagDefault(int bit, int defaultOptions) {
        int cleared = options & ~bit;
        options = cleared | (defaultOptions & bit);
    }

    /** Returns true if the given options bit is set. */
    public boolean getFlag(int bit) {
        return (options & bit) != 0;
    }

    /**
     * Replaces the caseFirst/upperFirst bits.
     *
     * @param value 0, CASE_FIRST, or CASE_FIRST_AND_UPPER_MASK (checked by assertion only)
     */
    public void setCaseFirst(int value) {
        assert value == 0 || value == CASE_FIRST || value == CASE_FIRST_AND_UPPER_MASK;
        options = (options & ~CASE_FIRST_AND_UPPER_MASK) | value;
    }

    /** Copies the caseFirst/upperFirst bits from defaultOptions into the options. */
    public void setCaseFirstDefault(int defaultOptions) {
        int defaultCaseBits = defaultOptions & CASE_FIRST_AND_UPPER_MASK;
        options = (options & ~CASE_FIRST_AND_UPPER_MASK) | defaultCaseBits;
    }

    /** Returns the caseFirst/upperFirst bits of the options, unshifted. */
    public int getCaseFirst() {
        return options & CASE_FIRST_AND_UPPER_MASK;
    }

    /** Sets alternate handling to shifted (true) or non-ignorable (false). */
    public void setAlternateHandlingShifted(boolean value) {
        int alternateBits = value ? SHIFTED : 0;
        options = (options & ~ALTERNATE_MASK) | alternateBits;
    }

    /** Copies the alternate-handling bits from defaultOptions into the options. */
    public void setAlternateHandlingDefault(int defaultOptions) {
        int defaultAlternateBits = defaultOptions & ALTERNATE_MASK;
        options = (options & ~ALTERNATE_MASK) | defaultAlternateBits;
    }

    /** Returns true if any alternate-handling bit is set, that is, not non-ignorable. */
    public boolean getAlternateHandling() {
        return (options & ALTERNATE_MASK) != 0;
    }

    /**
     * Stores the maxVariable value in the options.
     *
     * @param value one of the MAX_VAR_ constants, or -1 to take the value from defaultOptions
     * @param defaultOptions options to copy the maxVariable bit field from when value is -1
     * @throws IllegalArgumentException for any other value
     */
    public void setMaxVariable(int value, int defaultOptions) {
        int cleared = options & ~MAX_VARIABLE_MASK;
        if (value == -1) {
            options = cleared | (defaultOptions & MAX_VARIABLE_MASK);
            return;
        }
        // The MAX_VAR_ constants are the contiguous values MAX_VAR_SPACE..MAX_VAR_CURRENCY.
        if (value < MAX_VAR_SPACE || value > MAX_VAR_CURRENCY) {
            throw new IllegalArgumentException("illegal maxVariable value " + value);
        }
        options = cleared | (value << MAX_VARIABLE_SHIFT);
    }

    /** Returns the maxVariable value stored in the options. */
    public int getMaxVariable() {
        return (options & MAX_VARIABLE_MASK) >> MAX_VARIABLE_SHIFT;
    }

    /**
     * Include case bits in the tertiary level if caseLevel=off and caseFirst!=off.
     */
    static boolean isTertiaryWithCaseBits(int options) {
        int caseBits = options & (CASE_LEVEL | CASE_FIRST);
        return caseBits == CASE_FIRST;
    }

    /**
     * Returns the mask for tertiary weights:
     * the case bits are removed when caseLevel is on or caseFirst is off.
     */
    static int getTertiaryMask(int options) {
        if (isTertiaryWithCaseBits(options)) {
            return Collation.CASE_AND_TERTIARY_MASK;
        }
        return Collation.ONLY_TERTIARY_MASK;
    }

    /**
     * On tertiary level, consider case bits and sort uppercase first
     * if caseLevel is off and caseFirst==upperFirst.
     */
    static boolean sortsTertiaryUpperCaseFirst(int options) {
        int caseBits = options & (CASE_LEVEL | CASE_FIRST_AND_UPPER_MASK);
        return caseBits == CASE_FIRST_AND_UPPER_MASK;
    }

    /** Returns true if the CHECK_FCD bit is off. */
    public boolean dontCheckFCD() {
        return (options & CHECK_FCD) == 0;
    }

    /** Returns true if the BACKWARD_SECONDARY bit is on. */
    boolean hasBackwardSecondary() {
        return (options & BACKWARD_SECONDARY) != 0;
    }

    /** Returns true if the NUMERIC bit is on. */
    public boolean isNumeric() {
        return (options & NUMERIC) != 0;
    }

    /** CHECK_FCD etc. Initially tertiary strength with maxVariable=punct. */
    public int options =
            (Collator.TERTIARY << STRENGTH_SHIFT) |  // DEFAULT_STRENGTH
            (MAX_VAR_PUNCT << MAX_VARIABLE_SHIFT);
    /** Variable-top primary weight. */
    public long variableTop;
    /**
     * 256-byte table for reordering permutation of primary lead bytes; null if no reordering.
     * A 0 entry at a non-zero index means that the primary lead byte is "split"
     * (there are different offsets for primaries that share that lead byte)
     * and the reordering offset must be determined via the reorderRanges.
     */
    public byte[] reorderTable;
    /** Limit of last reordered range. 0 if no reordering or no split bytes. */
    long minHighNoReorder;
    /**
     * Primary-weight ranges for script reordering,
     * to be used by reorder(p) for split-reordered primary lead bytes.
     *
     * <p>Each entry is a (limit, offset) pair.
     * The upper 16 bits of the entry are the upper 16 bits of the
     * exclusive primary limit of a range.
     * Primaries between the previous limit and this one have their lead bytes
     * modified by the signed offset (-0xff..+0xff) stored in the lower 16 bits.
     *
     * <p>CollationData.makeReorderRanges() writes a full list where the first range
     * (at least for terminators and separators) has a 0 offset.
     * The last range has a non-zero offset.
     * minHighNoReorder is set to the limit of that last range.
     *
     * <p>In the settings object, the initial ranges before the first split lead byte
     * are omitted for efficiency; they are handled by reorder(p) via the reorderTable.
     * If there are no split-reordered lead bytes, then no ranges are needed.
     */
    long[] reorderRanges;
    /** Array of reorder codes; ignored if length == 0. */
    public int[] reorderCodes = EMPTY_INT_ARRAY;
    // Note: In C++, we keep a memory block around for the reorder codes,
    // the ranges, and the permutation table,
    // and modify them for new codes.
    // In Java, we simply copy references and then never modify the array contents.
    // The caller must abandon the arrays.
    // Reorder codes from the public setter API must be cloned.
    private static final int[] EMPTY_INT_ARRAY = new int[0];

    /** Options for CollationFastLatin. Negative if disabled. */
    public int fastLatinOptions = -1;
    // fastLatinPrimaries.length must be equal to CollationFastLatin.LATIN_LIMIT,
    // but we do not import CollationFastLatin to reduce circular dependencies.
    public char[] fastLatinPrimaries = new char[0x180];  // mutable contents
}