1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /** 4 ******************************************************************************* 5 * Copyright (C) 1996-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.text; 10 11 import com.ibm.icu.impl.coll.Collation; 12 13 /** 14 * A <code>CollationKey</code> represents a <code>String</code> 15 * under the rules of a specific <code>Collator</code> 16 * object. Comparing two <code>CollationKey</code>s returns the 17 * relative order of the <code>String</code>s they represent. 18 * 19 * <p>Since the rule set of <code>Collator</code>s can differ, the 20 * sort orders of the same string under two different 21 * <code>Collator</code>s might differ. Hence comparing 22 * <code>CollationKey</code>s generated from different 23 * <code>Collator</code>s can give incorrect results. 24 25 * <p>Both the method 26 * <code>CollationKey.compareTo(CollationKey)</code> and the method 27 * <code>Collator.compare(String, String)</code> compare two strings 28 * and returns their relative order. The performance characteristics 29 * of these two approaches can differ. 30 * Note that collation keys are often less efficient than simply doing comparison. 31 * For more details, see the ICU User Guide. 32 * 33 * <p>During the construction of a <code>CollationKey</code>, the 34 * entire source string is examined and processed into a series of 35 * bits terminated by a null, that are stored in the <code>CollationKey</code>. 36 * When <code>CollationKey.compareTo(CollationKey)</code> executes, it 37 * performs bitwise comparison on the bit sequences. This can incurs 38 * startup cost when creating the <code>CollationKey</code>, but once 39 * the key is created, binary comparisons are fast. This approach is 40 * recommended when the same strings are to be compared over and over 41 * again. 42 * 43 * <p>On the other hand, implementations of 44 * <code>Collator.compare(String, String)</code> can examine and 45 * process the strings only until the first characters differing in 46 * order. This approach is recommended if the strings are to be 47 * compared only once.</p> 48 * 49 * <p>More information about the composition of the bit sequence can 50 * be found in the 51 * <a href="http://www.icu-project.org/userguide/Collate_ServiceArchitecture.html"> 52 * user guide</a>.</p> 53 * 54 * <p>The following example shows how <code>CollationKey</code>s can be used 55 * to sort a list of <code>String</code>s.</p> 56 * <blockquote> 57 * <pre> 58 * // Create an array of CollationKeys for the Strings to be sorted. 59 * Collator myCollator = Collator.getInstance(); 60 * CollationKey[] keys = new CollationKey[3]; 61 * keys[0] = myCollator.getCollationKey("Tom"); 62 * keys[1] = myCollator.getCollationKey("Dick"); 63 * keys[2] = myCollator.getCollationKey("Harry"); 64 * sort( keys ); 65 * <br> 66 * //... 67 * <br> 68 * // Inside body of sort routine, compare keys this way 69 * if( keys[i].compareTo( keys[j] ) > 0 ) 70 * // swap keys[i] and keys[j] 71 * <br> 72 * //... 73 * <br> 74 * // Finally, when we've returned from sort. 75 * System.out.println( keys[0].getSourceString() ); 76 * System.out.println( keys[1].getSourceString() ); 77 * System.out.println( keys[2].getSourceString() ); 78 * </pre> 79 * </blockquote> 80 * <p> 81 * This class is not subclassable 82 * @see Collator 83 * @see RuleBasedCollator 84 * @author Syn Wee Quek 85 * @stable ICU 2.8 86 */ 87 public final class CollationKey implements Comparable<CollationKey> 88 { 89 // public inner classes ------------------------------------------------- 90 91 /** 92 * Options that used in the API CollationKey.getBound() for getting a 93 * CollationKey based on the bound mode requested. 94 * @stable ICU 2.6 95 */ 96 public static final class BoundMode 97 { 98 /* 99 * do not change the values assigned to the members of this enum. 100 * Underlying code depends on them having these numbers 101 */ 102 103 /** 104 * Lower bound 105 * @stable ICU 2.6 106 */ 107 public static final int LOWER = 0; 108 109 /** 110 * Upper bound that will match strings of exact size 111 * @stable ICU 2.6 112 */ 113 public static final int UPPER = 1; 114 115 /** 116 * Upper bound that will match all the strings that have the same 117 * initial substring as the given string 118 * @stable ICU 2.6 119 */ 120 public static final int UPPER_LONG = 2; 121 122 /** 123 * One more than the highest normal BoundMode value. 124 * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420. 125 */ 126 @Deprecated 127 public static final int COUNT = 3; 128 129 /** 130 * Private Constructor 131 */ 132 ///CLOVER:OFF BoundMode()133 private BoundMode(){} 134 ///CLOVER:ON 135 } 136 137 // public constructor --------------------------------------------------- 138 139 /** 140 * CollationKey constructor. 141 * This constructor is given public access, unlike the JDK version, to 142 * allow access to users extending the Collator class. See 143 * {@link Collator#getCollationKey(String)}. 144 * @param source string this CollationKey is to represent 145 * @param key array of bytes that represent the collation order of argument 146 * source terminated by a null 147 * @see Collator 148 * @stable ICU 2.8 149 */ CollationKey(String source, byte key[])150 public CollationKey(String source, byte key[]) 151 { 152 this(source, key, -1); 153 } 154 155 /** 156 * Private constructor, takes a length argument so it need not be lazy-evaluated. 157 * There must be a 00 byte at key[length] and none before. 158 */ CollationKey(String source, byte key[], int length)159 private CollationKey(String source, byte key[], int length) 160 { 161 m_source_ = source; 162 m_key_ = key; 163 m_hashCode_ = 0; 164 m_length_ = length; 165 } 166 167 /** 168 * CollationKey constructor that forces key to release its internal byte 169 * array for adoption. key will have a null byte array after this 170 * construction. 171 * @param source string this CollationKey is to represent 172 * @param key RawCollationKey object that represents the collation order of 173 * argument source. 174 * @see Collator 175 * @see RawCollationKey 176 * @stable ICU 2.8 177 */ CollationKey(String source, RawCollationKey key)178 public CollationKey(String source, RawCollationKey key) 179 { 180 m_source_ = source; 181 m_length_ = key.size - 1; 182 m_key_ = key.releaseBytes(); 183 assert m_key_[m_length_] == 0; 184 m_hashCode_ = 0; 185 } 186 187 // public getters ------------------------------------------------------- 188 189 /** 190 * Return the source string that this CollationKey represents. 191 * @return source string that this CollationKey represents 192 * @stable ICU 2.8 193 */ getSourceString()194 public String getSourceString() 195 { 196 return m_source_; 197 } 198 199 /** 200 * Duplicates and returns the value of this CollationKey as a sequence 201 * of big-endian bytes terminated by a null. 202 * 203 * <p>If two CollationKeys can be legitimately compared, then one can 204 * compare the byte arrays of each to obtain the same result, e.g. 205 * <pre> 206 * byte key1[] = collationkey1.toByteArray(); 207 * byte key2[] = collationkey2.toByteArray(); 208 * int key, targetkey; 209 * int i = 0; 210 * do { 211 * key = key1[i] & 0xFF; 212 * targetkey = key2[i] & 0xFF; 213 * if (key < targetkey) { 214 * System.out.println("String 1 is less than string 2"); 215 * return; 216 * } 217 * if (targetkey < key) { 218 * System.out.println("String 1 is more than string 2"); 219 * } 220 * i ++; 221 * } while (key != 0 && targetKey != 0); 222 * 223 * System.out.println("Strings are equal."); 224 * </pre> 225 * 226 * @return CollationKey value in a sequence of big-endian byte bytes 227 * terminated by a null. 228 * @stable ICU 2.8 229 */ toByteArray()230 public byte[] toByteArray() 231 { 232 int length = getLength() + 1; 233 byte result[] = new byte[length]; 234 System.arraycopy(m_key_, 0, result, 0, length); 235 return result; 236 } 237 238 // public other methods ------------------------------------------------- 239 240 /** 241 * Compare this CollationKey to another CollationKey. The 242 * collation rules of the Collator that created this key are 243 * applied. 244 * 245 * <p><strong>Note:</strong> Comparison between CollationKeys 246 * created by different Collators might return incorrect 247 * results. See class documentation. 248 * 249 * @param target target CollationKey 250 * @return an integer value. If the value is less than zero this CollationKey 251 * is less than than target, if the value is zero they are equal, and 252 * if the value is greater than zero this CollationKey is greater 253 * than target. 254 * @exception NullPointerException is thrown if argument is null. 255 * @see Collator#compare(String, String) 256 * @stable ICU 2.8 257 */ 258 @Override compareTo(CollationKey target)259 public int compareTo(CollationKey target) 260 { 261 for (int i = 0;; ++i) { 262 int l = m_key_[i]&0xff; 263 int r = target.m_key_[i]&0xff; 264 if (l < r) { 265 return -1; 266 } else if (l > r) { 267 return 1; 268 } else if (l == 0) { 269 return 0; 270 } 271 } 272 } 273 274 /** 275 * Compare this CollationKey and the specified Object for 276 * equality. The collation rules of the Collator that created 277 * this key are applied. 278 * 279 * <p>See note in compareTo(CollationKey) for warnings about 280 * possible incorrect results. 281 * 282 * @param target the object to compare to. 283 * @return true if the two keys compare as equal, false otherwise. 284 * @see #compareTo(CollationKey) 285 * @exception ClassCastException is thrown when the argument is not 286 * a CollationKey. NullPointerException is thrown when the argument 287 * is null. 288 * @stable ICU 2.8 289 */ 290 @Override equals(Object target)291 public boolean equals(Object target) 292 { 293 if (!(target instanceof CollationKey)) { 294 return false; 295 } 296 297 return equals((CollationKey)target); 298 } 299 300 /** 301 * Compare this CollationKey and the argument target CollationKey for 302 * equality. 303 * The collation 304 * rules of the Collator object which created these objects are applied. 305 * <p> 306 * See note in compareTo(CollationKey) for warnings of incorrect results 307 * 308 * @param target the CollationKey to compare to. 309 * @return true if two objects are equal, false otherwise. 310 * @exception NullPointerException is thrown when the argument is null. 311 * @stable ICU 2.8 312 */ equals(CollationKey target)313 public boolean equals(CollationKey target) 314 { 315 if (this == target) { 316 return true; 317 } 318 if (target == null) { 319 return false; 320 } 321 CollationKey other = target; 322 int i = 0; 323 while (true) { 324 if (m_key_[i] != other.m_key_[i]) { 325 return false; 326 } 327 if (m_key_[i] == 0) { 328 break; 329 } 330 i ++; 331 } 332 return true; 333 } 334 335 /** 336 * Returns a hash code for this CollationKey. The hash value is calculated 337 * on the key itself, not the String from which the key was created. Thus 338 * if x and y are CollationKeys, then x.hashCode(x) == y.hashCode() 339 * if x.equals(y) is true. This allows language-sensitive comparison in a 340 * hash table. 341 * 342 * @return the hash value. 343 * @stable ICU 2.8 344 */ 345 @Override hashCode()346 public int hashCode() 347 { 348 if (m_hashCode_ == 0) { 349 if (m_key_ == null) { 350 m_hashCode_ = 1; 351 } 352 else { 353 int size = m_key_.length >> 1; 354 StringBuilder key = new StringBuilder(size); 355 int i = 0; 356 while (m_key_[i] != 0 && m_key_[i + 1] != 0) { 357 key.append((char)((m_key_[i] << 8) | (0xff & m_key_[i + 1]))); 358 i += 2; 359 } 360 if (m_key_[i] != 0) { 361 key.append((char)(m_key_[i] << 8)); 362 } 363 m_hashCode_ = key.toString().hashCode(); 364 } 365 } 366 return m_hashCode_; 367 } 368 369 /** 370 * Produces a bound for the sort order of a given collation key and a 371 * strength level. This API does not attempt to find a bound for the 372 * CollationKey String representation, hence null will be returned in its 373 * place. 374 * <p> 375 * Resulting bounds can be used to produce a range of strings that are 376 * between upper and lower bounds. For example, if bounds are produced 377 * for a sortkey of string "smith", strings between upper and lower 378 * bounds with primary strength would include "Smith", "SMITH", "sMiTh". 379 * <p> 380 * There are two upper bounds that can be produced. If BoundMode.UPPER 381 * is produced, strings matched would be as above. However, if a bound 382 * is produced using BoundMode.UPPER_LONG is used, the above example will 383 * also match "Smithsonian" and similar. 384 * <p> 385 * For more on usage, see example in test procedure 386 * <a href="http://source.icu-project.org/repos/icu/icu4j/trunk/src/com/ibm/icu/dev/test/collator/CollationAPITest.java"> 387 * src/com/ibm/icu/dev/test/collator/CollationAPITest/TestBounds. 388 * </a> 389 * <p> 390 * Collation keys produced may be compared using the <TT>compare</TT> API. 391 * @param boundType Mode of bound required. It can be BoundMode.LOWER, which 392 * produces a lower inclusive bound, BoundMode.UPPER, that 393 * produces upper bound that matches strings of the same 394 * length or BoundMode.UPPER_LONG that matches strings that 395 * have the same starting substring as the source string. 396 * @param noOfLevels Strength levels required in the resulting bound 397 * (for most uses, the recommended value is PRIMARY). This 398 * strength should be less than the maximum strength of 399 * this CollationKey. 400 * See users guide for explanation on the strength levels a 401 * collation key can have. 402 * @return the result bounded CollationKey with a valid sort order but 403 * a null String representation. 404 * @exception IllegalArgumentException thrown when the strength level 405 * requested is higher than or equal to the strength in this 406 * CollationKey. 407 * In the case of an Exception, information 408 * about the maximum strength to use will be returned in the 409 * Exception. The user can then call getBound() again with the 410 * appropriate strength. 411 * @see CollationKey 412 * @see CollationKey.BoundMode 413 * @see Collator#PRIMARY 414 * @see Collator#SECONDARY 415 * @see Collator#TERTIARY 416 * @see Collator#QUATERNARY 417 * @see Collator#IDENTICAL 418 * @stable ICU 2.6 419 */ getBound(int boundType, int noOfLevels)420 public CollationKey getBound(int boundType, int noOfLevels) 421 { 422 // Scan the string until we skip enough of the key OR reach the end of 423 // the key 424 int offset = 0; 425 int keystrength = Collator.PRIMARY; 426 427 if (noOfLevels > Collator.PRIMARY) { 428 while (offset < m_key_.length && m_key_[offset] != 0) { 429 if (m_key_[offset ++] 430 == Collation.LEVEL_SEPARATOR_BYTE) { 431 keystrength ++; 432 noOfLevels --; 433 if (noOfLevels == Collator.PRIMARY 434 || offset == m_key_.length || m_key_[offset] == 0) { 435 offset --; 436 break; 437 } 438 } 439 } 440 } 441 442 if (noOfLevels > 0) { 443 throw new IllegalArgumentException( 444 "Source collation key has only " 445 + keystrength 446 + " strength level. Call getBound() again " 447 + " with noOfLevels < " + keystrength); 448 } 449 450 // READ ME: this code assumes that the values for BoundMode variables 451 // will not change. They are set so that the enum value corresponds to 452 // the number of extra bytes each bound type needs. 453 byte resultkey[] = new byte[offset + boundType + 1]; 454 System.arraycopy(m_key_, 0, resultkey, 0, offset); 455 switch (boundType) { 456 case BoundMode.LOWER: // = 0 457 // Lower bound just gets terminated. No extra bytes 458 break; 459 case BoundMode.UPPER: // = 1 460 // Upper bound needs one extra byte 461 resultkey[offset ++] = 2; 462 break; 463 case BoundMode.UPPER_LONG: // = 2 464 // Upper long bound needs two extra bytes 465 resultkey[offset ++] = (byte)0xFF; 466 resultkey[offset ++] = (byte)0xFF; 467 break; 468 default: 469 throw new IllegalArgumentException( 470 "Illegal boundType argument"); 471 } 472 resultkey[offset] = 0; 473 return new CollationKey(null, resultkey, offset); 474 } 475 476 477 /** 478 * Merges this CollationKey with another. 479 * The levels are merged with their corresponding counterparts 480 * (primaries with primaries, secondaries with secondaries etc.). 481 * Between the values from the same level a separator is inserted. 482 * 483 * <p>This is useful, for example, for combining sort keys from first and last names 484 * to sort such pairs. 485 * See http://www.unicode.org/reports/tr10/#Merging_Sort_Keys 486 * 487 * <p>The recommended way to achieve "merged" sorting is by 488 * concatenating strings with U+FFFE between them. 489 * The concatenation has the same sort order as the merged sort keys, 490 * but merge(getSortKey(str1), getSortKey(str2)) may differ from getSortKey(str1 + '\uFFFE' + str2). 491 * Using strings with U+FFFE may yield shorter sort keys. 492 * 493 * <p>For details about Sort Key Features see 494 * https://unicode-org.github.io/icu/userguide/collation/api#sort-key-features 495 * 496 * <p>It is possible to merge multiple sort keys by consecutively merging 497 * another one with the intermediate result. 498 * 499 * <p>Only the sort key bytes of the CollationKeys are merged. 500 * This API does not attempt to merge the 501 * String representations of the CollationKeys, hence null will be returned 502 * as the result's String representation. 503 * 504 * <p>Example (uncompressed): 505 * <pre>191B1D 01 050505 01 910505 00 506 * 1F2123 01 050505 01 910505 00</pre> 507 * will be merged as 508 * <pre>191B1D 02 1F2123 01 050505 02 050505 01 910505 02 910505 00</pre> 509 * 510 * @param source CollationKey to merge with 511 * @return a CollationKey that contains the valid merged sort keys 512 * with a null String representation, 513 * i.e. <tt>new CollationKey(null, merged_sort_keys)</tt> 514 * @exception IllegalArgumentException thrown if source CollationKey 515 * argument is null or of 0 length. 516 * @stable ICU 2.6 517 */ merge(CollationKey source)518 public CollationKey merge(CollationKey source) 519 { 520 // check arguments 521 if (source == null || source.getLength() == 0) { 522 throw new IllegalArgumentException( 523 "CollationKey argument can not be null or of 0 length"); 524 } 525 526 // 1 byte extra for the 02 separator at the end of the copy of this sort key, 527 // and 1 more for the terminating 00. 528 byte result[] = new byte[getLength() + source.getLength() + 2]; 529 530 // merge the sort keys with the same number of levels 531 int rindex = 0; 532 int index = 0; 533 int sourceindex = 0; 534 while (true) { 535 // copy level from src1 not including 00 or 01 536 // unsigned issues 537 while (m_key_[index] < 0 || m_key_[index] >= MERGE_SEPERATOR_) { 538 result[rindex++] = m_key_[index++]; 539 } 540 541 // add a 02 merge separator 542 result[rindex++] = MERGE_SEPERATOR_; 543 544 // copy level from src2 not including 00 or 01 545 while (source.m_key_[sourceindex] < 0 546 || source.m_key_[sourceindex] >= MERGE_SEPERATOR_) { 547 result[rindex++] = source.m_key_[sourceindex++]; 548 } 549 550 // if both sort keys have another level, then add a 01 level 551 // separator and continue 552 if (m_key_[index] == Collation.LEVEL_SEPARATOR_BYTE 553 && source.m_key_[sourceindex] 554 == Collation.LEVEL_SEPARATOR_BYTE) { 555 ++index; 556 ++sourceindex; 557 result[rindex++] = Collation.LEVEL_SEPARATOR_BYTE; 558 } 559 else { 560 break; 561 } 562 } 563 564 // here, at least one sort key is finished now, but the other one 565 // might have some contents left from containing more levels; 566 // that contents is just appended to the result 567 int remainingLength; 568 if ((remainingLength = m_length_ - index) > 0) { 569 System.arraycopy(m_key_, index, result, rindex, remainingLength); 570 rindex += remainingLength; 571 } 572 else if ((remainingLength = source.m_length_ - sourceindex) > 0) { 573 System.arraycopy(source.m_key_, sourceindex, result, rindex, remainingLength); 574 rindex += remainingLength; 575 } 576 result[rindex] = 0; 577 578 assert rindex == result.length - 1; 579 return new CollationKey(null, result, rindex); 580 } 581 582 // private data members ------------------------------------------------- 583 584 /** 585 * Sequence of bytes that represents the sort key 586 */ 587 private byte m_key_[]; 588 589 /** 590 * Source string this CollationKey represents 591 */ 592 private String m_source_; 593 594 /** 595 * Hash code for the key 596 */ 597 private int m_hashCode_; 598 /** 599 * Gets the length of this CollationKey 600 */ 601 private int m_length_; 602 /** 603 * Collation key merge seperator 604 */ 605 private static final int MERGE_SEPERATOR_ = 2; 606 607 // private methods ------------------------------------------------------ 608 609 /** 610 * Gets the length of the CollationKey 611 * @return length of the CollationKey 612 */ getLength()613 private int getLength() 614 { 615 if (m_length_ >= 0) { 616 return m_length_; 617 } 618 int length = m_key_.length; 619 for (int index = 0; index < length; index ++) { 620 if (m_key_[index] == 0) { 621 length = index; 622 break; 623 } 624 } 625 m_length_ = length; 626 return m_length_; 627 } 628 } 629