• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /**
4 *******************************************************************************
5 * Copyright (C) 1996-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 *******************************************************************************
8 */
9 package com.ibm.icu.text;
10 
11 import com.ibm.icu.impl.coll.Collation;
12 
13 /**
14  * A <code>CollationKey</code> represents a <code>String</code>
15  * under the rules of a specific <code>Collator</code>
16  * object. Comparing two <code>CollationKey</code>s returns the
17  * relative order of the <code>String</code>s they represent.
18  *
19  * <p>Since the rule set of <code>Collator</code>s can differ, the
20  * sort orders of the same string under two different
21  * <code>Collator</code>s might differ.  Hence comparing
22  * <code>CollationKey</code>s generated from different
23  * <code>Collator</code>s can give incorrect results.
24 
25  * <p>Both the method
26  * <code>CollationKey.compareTo(CollationKey)</code> and the method
27  * <code>Collator.compare(String, String)</code> compare two strings
28  * and returns their relative order.  The performance characteristics
29  * of these two approaches can differ.
30  * Note that collation keys are often less efficient than simply doing comparison.
31  * For more details, see the ICU User Guide.
32  *
33  * <p>During the construction of a <code>CollationKey</code>, the
34  * entire source string is examined and processed into a series of
35  * bits terminated by a null, that are stored in the <code>CollationKey</code>.
36  * When <code>CollationKey.compareTo(CollationKey)</code> executes, it
37  * performs bitwise comparison on the bit sequences.  This can incurs
38  * startup cost when creating the <code>CollationKey</code>, but once
39  * the key is created, binary comparisons are fast.  This approach is
40  * recommended when the same strings are to be compared over and over
41  * again.
42  *
43  * <p>On the other hand, implementations of
44  * <code>Collator.compare(String, String)</code> can examine and
45  * process the strings only until the first characters differing in
46  * order.  This approach is recommended if the strings are to be
47  * compared only once.</p>
48  *
49  * <p>More information about the composition of the bit sequence can
50  * be found in the
51  * <a href="http://www.icu-project.org/userguide/Collate_ServiceArchitecture.html">
52  * user guide</a>.</p>
53  *
54  * <p>The following example shows how <code>CollationKey</code>s can be used
55  * to sort a list of <code>String</code>s.</p>
56  * <blockquote>
57  * <pre>
58  * // Create an array of CollationKeys for the Strings to be sorted.
59  * Collator myCollator = Collator.getInstance();
60  * CollationKey[] keys = new CollationKey[3];
61  * keys[0] = myCollator.getCollationKey("Tom");
62  * keys[1] = myCollator.getCollationKey("Dick");
63  * keys[2] = myCollator.getCollationKey("Harry");
64  * sort( keys );
65  * <br>
66  * //...
67  * <br>
68  * // Inside body of sort routine, compare keys this way
69  * if( keys[i].compareTo( keys[j] ) &gt; 0 )
70  *    // swap keys[i] and keys[j]
71  * <br>
72  * //...
73  * <br>
74  * // Finally, when we've returned from sort.
75  * System.out.println( keys[0].getSourceString() );
76  * System.out.println( keys[1].getSourceString() );
77  * System.out.println( keys[2].getSourceString() );
78  * </pre>
79  * </blockquote>
80  * <p>
81  * This class is not subclassable
82  * @see Collator
83  * @see RuleBasedCollator
84  * @author Syn Wee Quek
85  * @stable ICU 2.8
86  */
87 public final class CollationKey implements Comparable<CollationKey>
88 {
89     // public inner classes -------------------------------------------------
90 
91     /**
92      * Options that used in the API CollationKey.getBound() for getting a
93      * CollationKey based on the bound mode requested.
94      * @stable ICU 2.6
95      */
96     public static final class BoundMode
97     {
98         /*
99          * do not change the values assigned to the members of this enum.
100          * Underlying code depends on them having these numbers
101          */
102 
103         /**
104          * Lower bound
105          * @stable ICU 2.6
106          */
107         public static final int LOWER = 0;
108 
109         /**
110          * Upper bound that will match strings of exact size
111          * @stable ICU 2.6
112          */
113         public static final int UPPER = 1;
114 
115         /**
116          * Upper bound that will match all the strings that have the same
117          * initial substring as the given string
118          * @stable ICU 2.6
119          */
120         public static final int UPPER_LONG = 2;
121 
122         /**
123          * One more than the highest normal BoundMode value.
124          * @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
125          */
126         @Deprecated
127         public static final int COUNT = 3;
128 
129         /**
130          * Private Constructor
131          */
132         ///CLOVER:OFF
BoundMode()133         private BoundMode(){}
134         ///CLOVER:ON
135     }
136 
137     // public constructor ---------------------------------------------------
138 
139     /**
140      * CollationKey constructor.
141      * This constructor is given public access, unlike the JDK version, to
142      * allow access to users extending the Collator class. See
143      * {@link Collator#getCollationKey(String)}.
144      * @param source string this CollationKey is to represent
145      * @param key array of bytes that represent the collation order of argument
146      *            source terminated by a null
147      * @see Collator
148      * @stable ICU 2.8
149      */
CollationKey(String source, byte key[])150     public CollationKey(String source, byte key[])
151     {
152         this(source, key, -1);
153     }
154 
155     /**
156      * Private constructor, takes a length argument so it need not be lazy-evaluated.
157      * There must be a 00 byte at key[length] and none before.
158      */
CollationKey(String source, byte key[], int length)159     private CollationKey(String source, byte key[], int length)
160     {
161         m_source_ = source;
162         m_key_ = key;
163         m_hashCode_ = 0;
164         m_length_ = length;
165     }
166 
167     /**
168      * CollationKey constructor that forces key to release its internal byte
169      * array for adoption. key will have a null byte array after this
170      * construction.
171      * @param source string this CollationKey is to represent
172      * @param key RawCollationKey object that represents the collation order of
173      *            argument source.
174      * @see Collator
175      * @see RawCollationKey
176      * @stable ICU 2.8
177      */
CollationKey(String source, RawCollationKey key)178     public CollationKey(String source, RawCollationKey key)
179     {
180         m_source_ = source;
181         m_length_ = key.size - 1;
182         m_key_ = key.releaseBytes();
183         assert m_key_[m_length_] == 0;
184         m_hashCode_ = 0;
185     }
186 
187     // public getters -------------------------------------------------------
188 
189     /**
190      * Return the source string that this CollationKey represents.
191      * @return source string that this CollationKey represents
192      * @stable ICU 2.8
193      */
getSourceString()194     public String getSourceString()
195     {
196         return m_source_;
197     }
198 
199     /**
200      * Duplicates and returns the value of this CollationKey as a sequence
201      * of big-endian bytes terminated by a null.
202      *
203      * <p>If two CollationKeys can be legitimately compared, then one can
204      * compare the byte arrays of each to obtain the same result, e.g.
205      * <pre>
206      * byte key1[] = collationkey1.toByteArray();
207      * byte key2[] = collationkey2.toByteArray();
208      * int key, targetkey;
209      * int i = 0;
210      * do {
211      *       key = key1[i] &amp; 0xFF;
212      *     targetkey = key2[i] &amp; 0xFF;
213      *     if (key &lt; targetkey) {
214      *         System.out.println("String 1 is less than string 2");
215      *         return;
216      *     }
217      *     if (targetkey &lt; key) {
218      *         System.out.println("String 1 is more than string 2");
219      *     }
220      *     i ++;
221      * } while (key != 0 &amp;&amp; targetKey != 0);
222      *
223      * System.out.println("Strings are equal.");
224      * </pre>
225      *
226      * @return CollationKey value in a sequence of big-endian byte bytes
227      *         terminated by a null.
228      * @stable ICU 2.8
229      */
toByteArray()230     public byte[] toByteArray()
231     {
232         int length = getLength() + 1;
233         byte result[] = new byte[length];
234         System.arraycopy(m_key_, 0, result, 0, length);
235         return result;
236     }
237 
238     // public other methods -------------------------------------------------
239 
240     /**
241      * Compare this CollationKey to another CollationKey.  The
242      * collation rules of the Collator that created this key are
243      * applied.
244      *
245      * <p><strong>Note:</strong> Comparison between CollationKeys
246      * created by different Collators might return incorrect
247      * results.  See class documentation.
248      *
249      * @param target target CollationKey
250      * @return an integer value.  If the value is less than zero this CollationKey
251      *         is less than than target, if the value is zero they are equal, and
252      *         if the value is greater than zero this CollationKey is greater
253      *         than target.
254      * @exception NullPointerException is thrown if argument is null.
255      * @see Collator#compare(String, String)
256      * @stable ICU 2.8
257      */
258     @Override
compareTo(CollationKey target)259     public int compareTo(CollationKey target)
260     {
261         for (int i = 0;; ++i) {
262             int l = m_key_[i]&0xff;
263             int r = target.m_key_[i]&0xff;
264             if (l < r) {
265             return -1;
266             } else if (l > r) {
267             return 1;
268             } else if (l == 0) {
269             return 0;
270             }
271         }
272     }
273 
274     /**
275      * Compare this CollationKey and the specified Object for
276      * equality.  The collation rules of the Collator that created
277      * this key are applied.
278      *
279      * <p>See note in compareTo(CollationKey) for warnings about
280      * possible incorrect results.
281      *
282      * @param target the object to compare to.
283      * @return true if the two keys compare as equal, false otherwise.
284      * @see #compareTo(CollationKey)
285      * @exception ClassCastException is thrown when the argument is not
286      *            a CollationKey.  NullPointerException is thrown when the argument
287      *            is null.
288      * @stable ICU 2.8
289      */
290     @Override
equals(Object target)291     public boolean equals(Object target)
292     {
293         if (!(target instanceof CollationKey)) {
294             return false;
295         }
296 
297         return equals((CollationKey)target);
298     }
299 
300     /**
301      * Compare this CollationKey and the argument target CollationKey for
302      * equality.
303      * The collation
304      * rules of the Collator object which created these objects are applied.
305      * <p>
306      * See note in compareTo(CollationKey) for warnings of incorrect results
307      *
308      * @param target the CollationKey to compare to.
309      * @return true if two objects are equal, false otherwise.
310      * @exception NullPointerException is thrown when the argument is null.
311      * @stable ICU 2.8
312      */
equals(CollationKey target)313     public boolean equals(CollationKey target)
314     {
315         if (this == target) {
316             return true;
317         }
318         if (target == null) {
319             return false;
320         }
321         CollationKey other = target;
322         int i = 0;
323         while (true) {
324             if (m_key_[i] != other.m_key_[i]) {
325                 return false;
326             }
327             if (m_key_[i] == 0) {
328               break;
329             }
330             i ++;
331         }
332         return true;
333     }
334 
335     /**
336      * Returns a hash code for this CollationKey. The hash value is calculated
337      * on the key itself, not the String from which the key was created. Thus
338      * if x and y are CollationKeys, then x.hashCode(x) == y.hashCode()
339      * if x.equals(y) is true. This allows language-sensitive comparison in a
340      * hash table.
341      *
342      * @return the hash value.
343      * @stable ICU 2.8
344      */
345     @Override
hashCode()346     public int hashCode()
347     {
348         if (m_hashCode_ == 0) {
349             if (m_key_ == null) {
350                 m_hashCode_ = 1;
351             }
352             else {
353                 int size = m_key_.length >> 1;
354                 StringBuilder key = new StringBuilder(size);
355                 int i = 0;
356                 while (m_key_[i] != 0 && m_key_[i + 1] != 0) {
357                     key.append((char)((m_key_[i] << 8) | (0xff & m_key_[i + 1])));
358                     i += 2;
359                 }
360                 if (m_key_[i] != 0) {
361                     key.append((char)(m_key_[i] << 8));
362                 }
363                 m_hashCode_ = key.toString().hashCode();
364             }
365         }
366         return m_hashCode_;
367     }
368 
369     /**
370      * Produces a bound for the sort order of a given collation key and a
371      * strength level. This API does not attempt to find a bound for the
372      * CollationKey String representation, hence null will be returned in its
373      * place.
374      * <p>
375      * Resulting bounds can be used to produce a range of strings that are
376      * between upper and lower bounds. For example, if bounds are produced
377      * for a sortkey of string "smith", strings between upper and lower
378      * bounds with primary strength would include "Smith", "SMITH", "sMiTh".
379      * <p>
380      * There are two upper bounds that can be produced. If BoundMode.UPPER
381      * is produced, strings matched would be as above. However, if a bound
382      * is produced using BoundMode.UPPER_LONG is used, the above example will
383      * also match "Smithsonian" and similar.
384      * <p>
385      * For more on usage, see example in test procedure
386      * <a href="http://source.icu-project.org/repos/icu/icu4j/trunk/src/com/ibm/icu/dev/test/collator/CollationAPITest.java">
387      * src/com/ibm/icu/dev/test/collator/CollationAPITest/TestBounds.
388      * </a>
389      * <p>
390      * Collation keys produced may be compared using the <TT>compare</TT> API.
391      * @param boundType Mode of bound required. It can be BoundMode.LOWER, which
392      *              produces a lower inclusive bound, BoundMode.UPPER, that
393      *              produces upper bound that matches strings of the same
394      *              length or BoundMode.UPPER_LONG that matches strings that
395      *              have the same starting substring as the source string.
396      * @param noOfLevels Strength levels required in the resulting bound
397      *                 (for most uses, the recommended value is PRIMARY). This
398      *                 strength should be less than the maximum strength of
399      *                 this CollationKey.
400      *                 See users guide for explanation on the strength levels a
401      *                 collation key can have.
402      * @return the result bounded CollationKey with a valid sort order but
403      *         a null String representation.
404      * @exception IllegalArgumentException thrown when the strength level
405      *            requested is higher than or equal to the strength in this
406      *            CollationKey.
407      *            In the case of an Exception, information
408      *            about the maximum strength to use will be returned in the
409      *            Exception. The user can then call getBound() again with the
410      *            appropriate strength.
411      * @see CollationKey
412      * @see CollationKey.BoundMode
413      * @see Collator#PRIMARY
414      * @see Collator#SECONDARY
415      * @see Collator#TERTIARY
416      * @see Collator#QUATERNARY
417      * @see Collator#IDENTICAL
418      * @stable ICU 2.6
419      */
getBound(int boundType, int noOfLevels)420     public CollationKey getBound(int boundType, int noOfLevels)
421     {
422         // Scan the string until we skip enough of the key OR reach the end of
423         // the key
424         int offset = 0;
425         int keystrength = Collator.PRIMARY;
426 
427         if (noOfLevels > Collator.PRIMARY) {
428             while (offset < m_key_.length && m_key_[offset] != 0) {
429                 if (m_key_[offset ++]
430                         == Collation.LEVEL_SEPARATOR_BYTE) {
431                     keystrength ++;
432                     noOfLevels --;
433                     if (noOfLevels == Collator.PRIMARY
434                         || offset == m_key_.length || m_key_[offset] == 0) {
435                         offset --;
436                         break;
437                     }
438                 }
439             }
440         }
441 
442         if (noOfLevels > 0) {
443             throw new IllegalArgumentException(
444                                   "Source collation key has only "
445                                   + keystrength
446                                   + " strength level. Call getBound() again "
447                                   + " with noOfLevels < " + keystrength);
448         }
449 
450         // READ ME: this code assumes that the values for BoundMode variables
451         // will not change. They are set so that the enum value corresponds to
452         // the number of extra bytes each bound type needs.
453         byte resultkey[] = new byte[offset + boundType + 1];
454         System.arraycopy(m_key_, 0, resultkey, 0, offset);
455         switch (boundType) {
456             case BoundMode.LOWER: // = 0
457                 // Lower bound just gets terminated. No extra bytes
458                 break;
459             case BoundMode.UPPER: // = 1
460                 // Upper bound needs one extra byte
461                 resultkey[offset ++] = 2;
462                 break;
463             case BoundMode.UPPER_LONG: // = 2
464                 // Upper long bound needs two extra bytes
465                 resultkey[offset ++] = (byte)0xFF;
466                 resultkey[offset ++] = (byte)0xFF;
467                 break;
468             default:
469                 throw new IllegalArgumentException(
470                                                 "Illegal boundType argument");
471         }
472         resultkey[offset] = 0;
473         return new CollationKey(null, resultkey, offset);
474     }
475 
476 
477     /**
478      * Merges this CollationKey with another.
479      * The levels are merged with their corresponding counterparts
480      * (primaries with primaries, secondaries with secondaries etc.).
481      * Between the values from the same level a separator is inserted.
482      *
483      * <p>This is useful, for example, for combining sort keys from first and last names
484      * to sort such pairs.
485      * See http://www.unicode.org/reports/tr10/#Merging_Sort_Keys
486      *
487      * <p>The recommended way to achieve "merged" sorting is by
488      * concatenating strings with U+FFFE between them.
489      * The concatenation has the same sort order as the merged sort keys,
490      * but merge(getSortKey(str1), getSortKey(str2)) may differ from getSortKey(str1 + '\uFFFE' + str2).
491      * Using strings with U+FFFE may yield shorter sort keys.
492      *
493      * <p>For details about Sort Key Features see
494      * https://unicode-org.github.io/icu/userguide/collation/api#sort-key-features
495      *
496      * <p>It is possible to merge multiple sort keys by consecutively merging
497      * another one with the intermediate result.
498      *
499      * <p>Only the sort key bytes of the CollationKeys are merged.
500      * This API does not attempt to merge the
501      * String representations of the CollationKeys, hence null will be returned
502      * as the result's String representation.
503      *
504      * <p>Example (uncompressed):
505      * <pre>191B1D 01 050505 01 910505 00
506      * 1F2123 01 050505 01 910505 00</pre>
507      * will be merged as
508      * <pre>191B1D 02 1F2123 01 050505 02 050505 01 910505 02 910505 00</pre>
509      *
510      * @param source CollationKey to merge with
511      * @return a CollationKey that contains the valid merged sort keys
512      *         with a null String representation,
513      *         i.e. <tt>new CollationKey(null, merged_sort_keys)</tt>
514      * @exception IllegalArgumentException thrown if source CollationKey
515      *            argument is null or of 0 length.
516      * @stable ICU 2.6
517      */
merge(CollationKey source)518     public CollationKey merge(CollationKey source)
519     {
520         // check arguments
521         if (source == null || source.getLength() == 0) {
522             throw new IllegalArgumentException(
523                       "CollationKey argument can not be null or of 0 length");
524         }
525 
526         // 1 byte extra for the 02 separator at the end of the copy of this sort key,
527         // and 1 more for the terminating 00.
528         byte result[] = new byte[getLength() + source.getLength() + 2];
529 
530         // merge the sort keys with the same number of levels
531         int rindex = 0;
532         int index = 0;
533         int sourceindex = 0;
534         while (true) {
535             // copy level from src1 not including 00 or 01
536             // unsigned issues
537             while (m_key_[index] < 0 || m_key_[index] >= MERGE_SEPERATOR_) {
538                 result[rindex++] = m_key_[index++];
539             }
540 
541             // add a 02 merge separator
542             result[rindex++] = MERGE_SEPERATOR_;
543 
544             // copy level from src2 not including 00 or 01
545             while (source.m_key_[sourceindex] < 0
546                    || source.m_key_[sourceindex] >= MERGE_SEPERATOR_) {
547                 result[rindex++] = source.m_key_[sourceindex++];
548             }
549 
550             // if both sort keys have another level, then add a 01 level
551             // separator and continue
552             if (m_key_[index] == Collation.LEVEL_SEPARATOR_BYTE
553                 && source.m_key_[sourceindex]
554                         == Collation.LEVEL_SEPARATOR_BYTE) {
555                 ++index;
556                 ++sourceindex;
557                 result[rindex++] = Collation.LEVEL_SEPARATOR_BYTE;
558             }
559             else {
560                 break;
561             }
562         }
563 
564         // here, at least one sort key is finished now, but the other one
565         // might have some contents left from containing more levels;
566         // that contents is just appended to the result
567         int remainingLength;
568         if ((remainingLength = m_length_ - index) > 0) {
569             System.arraycopy(m_key_, index, result, rindex, remainingLength);
570             rindex += remainingLength;
571         }
572         else if ((remainingLength = source.m_length_ - sourceindex) > 0) {
573             System.arraycopy(source.m_key_, sourceindex, result, rindex, remainingLength);
574             rindex += remainingLength;
575         }
576         result[rindex] = 0;
577 
578         assert rindex == result.length - 1;
579         return new CollationKey(null, result, rindex);
580     }
581 
582     // private data members -------------------------------------------------
583 
584     /**
585      * Sequence of bytes that represents the sort key
586      */
587     private byte m_key_[];
588 
589     /**
590      * Source string this CollationKey represents
591      */
592     private String m_source_;
593 
594     /**
595      * Hash code for the key
596      */
597     private int m_hashCode_;
598     /**
599      * Gets the length of this CollationKey
600      */
601     private int m_length_;
602     /**
603      * Collation key merge seperator
604      */
605     private static final int MERGE_SEPERATOR_ = 2;
606 
607     // private methods ------------------------------------------------------
608 
609     /**
610      * Gets the length of the CollationKey
611      * @return length of the CollationKey
612      */
getLength()613     private int getLength()
614     {
615         if (m_length_ >= 0) {
616             return m_length_;
617         }
618         int length = m_key_.length;
619         for (int index = 0; index < length; index ++) {
620             if (m_key_[index] == 0) {
621                 length = index;
622                 break;
623             }
624         }
625         m_length_ = length;
626         return m_length_;
627     }
628 }
629