// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
*******************************************************************************
* Copyright (C) 2013-2015, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* CollationSettings.java, ported from collationsettings.h/.cpp
*
* C++ version created on: 2013feb07
* created by: Markus W. Scherer
*/
13 
14 package com.ibm.icu.impl.coll;
15 
16 import java.util.Arrays;
17 
18 import com.ibm.icu.text.Collator;
19 
20 /**
21  * Collation settings/options/attributes.
22  * These are the values that can be changed via API.
23  */
24 public final class CollationSettings extends SharedObject {
25     /**
26      * Options bit 0: Perform the FCD check on the input text and deliver normalized text.
27      */
28     public static final int CHECK_FCD = 1;
29     /**
30      * Options bit 1: Numeric collation.
31      * Also known as CODAN = COllate Digits As Numbers.
32      *
33      * Treat digit sequences as numbers with CE sequences in numeric order,
34      * rather than returning a normal CE for each digit.
35      */
36     public static final int NUMERIC = 2;
37     /**
38      * "Shifted" alternate handling, see ALTERNATE_MASK.
39      */
40     static final int SHIFTED = 4;
41     /**
42      * Options bits 3..2: Alternate-handling mask. 0 for non-ignorable.
43      * Reserve values 8 and 0xc for shift-trimmed and blanked.
44      */
45     static final int ALTERNATE_MASK = 0xc;
46     /**
47      * Options bits 6..4: The 3-bit maxVariable value bit field is shifted by this value.
48      */
49     static final int MAX_VARIABLE_SHIFT = 4;
50     /** maxVariable options bit mask before shifting. */
51     static final int MAX_VARIABLE_MASK = 0x70;
52     /** Options bit 7: Reserved/unused/0. */
53     /**
54      * Options bit 8: Sort uppercase first if caseLevel or caseFirst is on.
55      */
56     static final int UPPER_FIRST = 0x100;
57     /**
58      * Options bit 9: Keep the case bits in the tertiary weight (they trump other tertiary values)
59      * unless case level is on (when they are *moved* into the separate case level).
60      * By default, the case bits are removed from the tertiary weight (ignored).
61      *
62      * When CASE_FIRST is off, UPPER_FIRST must be off too, corresponding to
63      * the tri-value UCOL_CASE_FIRST attribute: UCOL_OFF vs. UCOL_LOWER_FIRST vs. UCOL_UPPER_FIRST.
64      */
65     public static final int CASE_FIRST = 0x200;
66     /**
67      * Options bit mask for caseFirst and upperFirst, before shifting.
68      * Same value as caseFirst==upperFirst.
69      */
70     public static final int CASE_FIRST_AND_UPPER_MASK = CASE_FIRST | UPPER_FIRST;
71     /**
72      * Options bit 10: Insert the case level between the secondary and tertiary levels.
73      */
74     public static final int CASE_LEVEL = 0x400;
75     /**
76      * Options bit 11: Compare secondary weights backwards. ("French secondary")
77      */
78     public static final int BACKWARD_SECONDARY = 0x800;
79     /**
80      * Options bits 15..12: The 4-bit strength value bit field is shifted by this value.
81      * It is the top used bit field in the options. (No need to mask after shifting.)
82      */
83     static final int STRENGTH_SHIFT = 12;
84     /** Strength options bit mask before shifting. */
85     static final int STRENGTH_MASK = 0xf000;
86 
87     /** maxVariable values */
88     static final int MAX_VAR_SPACE = 0;
89     static final int MAX_VAR_PUNCT = 1;
90     static final int MAX_VAR_SYMBOL = 2;
91     static final int MAX_VAR_CURRENCY = 3;
92 
CollationSettings()93     CollationSettings() {}
94 
95     @Override
clone()96     public CollationSettings clone() {
97         CollationSettings newSettings = (CollationSettings)super.clone();
98         // Note: The reorderTable, reorderRanges, and reorderCodes need not be cloned
99         // because, in Java, they only get replaced but not modified.
100         newSettings.fastLatinPrimaries = fastLatinPrimaries.clone();
101         return newSettings;
102     }
103 
104     @Override
equals(Object other)105     public boolean equals(Object other) {
106         if(other == null) { return false; }
107         if(!this.getClass().equals(other.getClass())) { return false; }
108         CollationSettings o = (CollationSettings)other;
109         if(options != o.options) { return false; }
110         if((options & ALTERNATE_MASK) != 0 && variableTop != o.variableTop) { return false; }
111         if(!Arrays.equals(reorderCodes, o.reorderCodes)) { return false; }
112         return true;
113     }
114 
115     @Override
hashCode()116     public int hashCode() {
117         int h = options << 8;
118         if((options & ALTERNATE_MASK) != 0) { h ^= variableTop; }
119         h ^= reorderCodes.length;
120         for(int i = 0; i < reorderCodes.length; ++i) {
121             h ^= (reorderCodes[i] << i);
122         }
123         return h;
124     }
125 
resetReordering()126     public void resetReordering() {
127         // When we turn off reordering, we want to set a null permutation
128         // rather than a no-op permutation.
129         reorderTable = null;
130         minHighNoReorder = 0;
131         reorderRanges = null;
132         reorderCodes = EMPTY_INT_ARRAY;
133     }
134 
aliasReordering(CollationData data, int[] codesAndRanges, int codesLength, byte[] table)135     void aliasReordering(CollationData data, int[] codesAndRanges, int codesLength, byte[] table) {
136         int[] codes;
137         if(codesLength == codesAndRanges.length) {
138             codes = codesAndRanges;
139         } else {
140             codes = Arrays.copyOf(codesAndRanges, codesLength);
141         }
142         int rangesStart = codesLength;
143         int rangesLimit = codesAndRanges.length;
144         int rangesLength = rangesLimit - rangesStart;
145         if(table != null &&
146                 (rangesLength == 0 ?
147                         !reorderTableHasSplitBytes(table) :
148                         rangesLength >= 2 &&
149                         // The first offset must be 0. The last offset must not be 0.
150                         (codesAndRanges[rangesStart] & 0xffff) == 0 &&
151                         (codesAndRanges[rangesLimit - 1] & 0xffff) != 0)) {
152             reorderTable = table;
153             reorderCodes = codes;
154             // Drop ranges before the first split byte. They are reordered by the table.
155             // This then speeds up reordering of the remaining ranges.
156             int firstSplitByteRangeIndex = rangesStart;
157             while(firstSplitByteRangeIndex < rangesLimit &&
158                     (codesAndRanges[firstSplitByteRangeIndex] & 0xff0000) == 0) {
159                 // The second byte of the primary limit is 0.
160                 ++firstSplitByteRangeIndex;
161             }
162             if(firstSplitByteRangeIndex == rangesLimit) {
163                 assert(!reorderTableHasSplitBytes(table));
164                 minHighNoReorder = 0;
165                 reorderRanges = null;
166             } else {
167                 assert(table[codesAndRanges[firstSplitByteRangeIndex] >>> 24] == 0);
168                 minHighNoReorder = codesAndRanges[rangesLimit - 1] & 0xffff0000L;
169                 setReorderRanges(codesAndRanges, firstSplitByteRangeIndex,
170                         rangesLimit - firstSplitByteRangeIndex);
171             }
172             return;
173         }
174         // Regenerate missing data.
175         setReordering(data, codes);
176     }
177 
setReordering(CollationData data, int[] codes)178     public void setReordering(CollationData data, int[] codes) {
179         if(codes.length == 0 || (codes.length == 1 && codes[0] == Collator.ReorderCodes.NONE)) {
180             resetReordering();
181             return;
182         }
183         UVector32 rangesList = new UVector32();
184         data.makeReorderRanges(codes, rangesList);
185         int rangesLength = rangesList.size();
186         if(rangesLength == 0) {
187             resetReordering();
188             return;
189         }
190         int[] ranges = rangesList.getBuffer();
191         // ranges[] contains at least two (limit, offset) pairs.
192         // The first offset must be 0. The last offset must not be 0.
193         // Separators (at the low end) and trailing weights (at the high end)
194         // are never reordered.
195         assert(rangesLength >= 2);
196         assert((ranges[0] & 0xffff) == 0 && (ranges[rangesLength - 1] & 0xffff) != 0);
197         minHighNoReorder = ranges[rangesLength - 1] & 0xffff0000L;
198 
199         // Write the lead byte permutation table.
200         // Set a 0 for each lead byte that has a range boundary in the middle.
201         byte[] table = new byte[256];
202         int b = 0;
203         int firstSplitByteRangeIndex = -1;
204         for(int i = 0; i < rangesLength; ++i) {
205             int pair = ranges[i];
206             int limit1 = pair >>> 24;
207             while(b < limit1) {
208                 table[b] = (byte)(b + pair);
209                 ++b;
210             }
211             // Check the second byte of the limit.
212             if((pair & 0xff0000) != 0) {
213                 table[limit1] = 0;
214                 b = limit1 + 1;
215                 if(firstSplitByteRangeIndex < 0) {
216                     firstSplitByteRangeIndex = i;
217                 }
218             }
219         }
220         while(b <= 0xff) {
221             table[b] = (byte)b;
222             ++b;
223         }
224         int rangesStart;
225         if(firstSplitByteRangeIndex < 0) {
226             // The lead byte permutation table alone suffices for reordering.
227             rangesStart = rangesLength = 0;
228         } else {
229             // Remove the ranges below the first split byte.
230             rangesStart = firstSplitByteRangeIndex;
231             rangesLength -= firstSplitByteRangeIndex;
232         }
233         setReorderArrays(codes, ranges, rangesStart, rangesLength, table);
234     }
235 
setReorderArrays(int[] codes, int[] ranges, int rangesStart, int rangesLength, byte[] table)236     private void setReorderArrays(int[] codes,
237             int[] ranges, int rangesStart, int rangesLength, byte[] table) {
238         // Very different from C++. See the comments after the reorderCodes declaration.
239         if(codes == null) {
240             codes = EMPTY_INT_ARRAY;
241         }
242         assert (codes.length == 0) == (table == null);
243         reorderTable = table;
244         reorderCodes = codes;
245         setReorderRanges(ranges, rangesStart, rangesLength);
246     }
247 
setReorderRanges(int[] ranges, int rangesStart, int rangesLength)248     private void setReorderRanges(int[] ranges, int rangesStart, int rangesLength) {
249         if(rangesLength == 0) {
250             reorderRanges = null;
251         } else {
252             reorderRanges = new long[rangesLength];
253             int i = 0;
254             do {
255                 reorderRanges[i++] = ranges[rangesStart++] & 0xffffffffL;
256             } while(i < rangesLength);
257         }
258     }
259 
copyReorderingFrom(CollationSettings other)260     public void copyReorderingFrom(CollationSettings other) {
261         if(!other.hasReordering()) {
262             resetReordering();
263             return;
264         }
265         minHighNoReorder = other.minHighNoReorder;
266         reorderTable = other.reorderTable;
267         reorderRanges = other.reorderRanges;
268         reorderCodes = other.reorderCodes;
269     }
270 
hasReordering()271     public boolean hasReordering() { return reorderTable != null; }
272 
reorderTableHasSplitBytes(byte[] table)273     private static boolean reorderTableHasSplitBytes(byte[] table) {
274         assert(table[0] == 0);
275         for(int i = 1; i < 256; ++i) {
276             if(table[i] == 0) {
277                 return true;
278             }
279         }
280         return false;
281     }
282 
reorder(long p)283     public long reorder(long p) {
284         byte b = reorderTable[(int)p >>> 24];
285         if(b != 0 || p <= Collation.NO_CE_PRIMARY) {
286             return ((b & 0xffL) << 24) | (p & 0xffffff);
287         } else {
288             return reorderEx(p);
289         }
290     }
291 
reorderEx(long p)292     private long reorderEx(long p) {
293         assert minHighNoReorder > 0;
294         if(p >= minHighNoReorder) { return p; }
295         // Round up p so that its lower 16 bits are >= any offset bits.
296         // Then compare q directly with (limit, offset) pairs.
297         long q = p | 0xffff;
298         long r;
299         int i = 0;
300         while(q >= (r = reorderRanges[i])) { ++i; }
301         return p + ((long)(short)r << 24);
302     }
303 
304     // In C++, we use enums for attributes and their values, with a special value for the default.
305     // Combined getter/setter methods handle many attributes.
306     // In Java, we have specific methods for getting, setting, and set-to-default,
307     // except that this class uses bits in its own bit set for simple values.
308 
setStrength(int value)309     public void setStrength(int value) {
310         int noStrength = options & ~STRENGTH_MASK;
311         switch(value) {
312         case Collator.PRIMARY:
313         case Collator.SECONDARY:
314         case Collator.TERTIARY:
315         case Collator.QUATERNARY:
316         case Collator.IDENTICAL:
317             options = noStrength | (value << STRENGTH_SHIFT);
318             break;
319         default:
320             throw new IllegalArgumentException("illegal strength value " + value);
321         }
322     }
323 
setStrengthDefault(int defaultOptions)324     public void setStrengthDefault(int defaultOptions) {
325         int noStrength = options & ~STRENGTH_MASK;
326         options = noStrength | (defaultOptions & STRENGTH_MASK);
327     }
328 
getStrength(int options)329     static int getStrength(int options) {
330         return options >> STRENGTH_SHIFT;
331     }
332 
getStrength()333     public int getStrength() {
334         return getStrength(options);
335     }
336 
337     /** Sets the options bit for an on/off attribute. */
setFlag(int bit, boolean value)338     public void setFlag(int bit, boolean value) {
339         if(value) {
340             options |= bit;
341         } else {
342             options &= ~bit;
343         }
344     }
345 
setFlagDefault(int bit, int defaultOptions)346     public void setFlagDefault(int bit, int defaultOptions) {
347         options = (options & ~bit) | (defaultOptions & bit);
348     }
349 
getFlag(int bit)350     public boolean getFlag(int bit) {
351         return (options & bit) != 0;
352     }
353 
setCaseFirst(int value)354     public void setCaseFirst(int value) {
355         assert value == 0 || value == CASE_FIRST || value == CASE_FIRST_AND_UPPER_MASK;
356         int noCaseFirst = options & ~CASE_FIRST_AND_UPPER_MASK;
357         options = noCaseFirst | value;
358     }
359 
setCaseFirstDefault(int defaultOptions)360     public void setCaseFirstDefault(int defaultOptions) {
361         int noCaseFirst = options & ~CASE_FIRST_AND_UPPER_MASK;
362         options = noCaseFirst | (defaultOptions & CASE_FIRST_AND_UPPER_MASK);
363     }
364 
getCaseFirst()365     public int getCaseFirst() {
366         return options & CASE_FIRST_AND_UPPER_MASK;
367     }
368 
setAlternateHandlingShifted(boolean value)369     public void setAlternateHandlingShifted(boolean value) {
370         int noAlternate = options & ~ALTERNATE_MASK;
371         if(value) {
372             options = noAlternate | SHIFTED;
373         } else {
374             options = noAlternate;
375         }
376     }
377 
setAlternateHandlingDefault(int defaultOptions)378     public void setAlternateHandlingDefault(int defaultOptions) {
379         int noAlternate = options & ~ALTERNATE_MASK;
380         options = noAlternate | (defaultOptions & ALTERNATE_MASK);
381     }
382 
getAlternateHandling()383     public boolean getAlternateHandling() {
384         return (options & ALTERNATE_MASK) != 0;
385     }
386 
setMaxVariable(int value, int defaultOptions)387     public void setMaxVariable(int value, int defaultOptions) {
388         int noMax = options & ~MAX_VARIABLE_MASK;
389         switch(value) {
390         case MAX_VAR_SPACE:
391         case MAX_VAR_PUNCT:
392         case MAX_VAR_SYMBOL:
393         case MAX_VAR_CURRENCY:
394             options = noMax | (value << MAX_VARIABLE_SHIFT);
395             break;
396         case -1:
397             options = noMax | (defaultOptions & MAX_VARIABLE_MASK);
398             break;
399         default:
400             throw new IllegalArgumentException("illegal maxVariable value " + value);
401         }
402     }
403 
getMaxVariable()404     public int getMaxVariable() {
405         return (options & MAX_VARIABLE_MASK) >> MAX_VARIABLE_SHIFT;
406     }
407 
408     /**
409      * Include case bits in the tertiary level if caseLevel=off and caseFirst!=off.
410      */
isTertiaryWithCaseBits(int options)411     static boolean isTertiaryWithCaseBits(int options) {
412         return (options & (CASE_LEVEL | CASE_FIRST)) == CASE_FIRST;
413     }
getTertiaryMask(int options)414     static int getTertiaryMask(int options) {
415         // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off.
416         return isTertiaryWithCaseBits(options) ?
417                 Collation.CASE_AND_TERTIARY_MASK : Collation.ONLY_TERTIARY_MASK;
418     }
419 
sortsTertiaryUpperCaseFirst(int options)420     static boolean sortsTertiaryUpperCaseFirst(int options) {
421         // On tertiary level, consider case bits and sort uppercase first
422         // if caseLevel is off and caseFirst==upperFirst.
423         return (options & (CASE_LEVEL | CASE_FIRST_AND_UPPER_MASK)) == CASE_FIRST_AND_UPPER_MASK;
424     }
425 
dontCheckFCD()426     public boolean dontCheckFCD() {
427         return (options & CHECK_FCD) == 0;
428     }
429 
hasBackwardSecondary()430     boolean hasBackwardSecondary() {
431         return (options & BACKWARD_SECONDARY) != 0;
432     }
433 
isNumeric()434     public boolean isNumeric() {
435         return (options & NUMERIC) != 0;
436     }
437 
438     /** CHECK_FCD etc. */
439     public int options = (Collator.TERTIARY << STRENGTH_SHIFT) |  // DEFAULT_STRENGTH
440             (MAX_VAR_PUNCT << MAX_VARIABLE_SHIFT);
441     /** Variable-top primary weight. */
442     public long variableTop;
443     /**
444      * 256-byte table for reordering permutation of primary lead bytes; null if no reordering.
445      * A 0 entry at a non-zero index means that the primary lead byte is "split"
446      * (there are different offsets for primaries that share that lead byte)
447      * and the reordering offset must be determined via the reorderRanges.
448      */
449     public byte[] reorderTable;
450     /** Limit of last reordered range. 0 if no reordering or no split bytes. */
451     long minHighNoReorder;
452     /**
453      * Primary-weight ranges for script reordering,
454      * to be used by reorder(p) for split-reordered primary lead bytes.
455      *
456      * <p>Each entry is a (limit, offset) pair.
457      * The upper 16 bits of the entry are the upper 16 bits of the
458      * exclusive primary limit of a range.
459      * Primaries between the previous limit and this one have their lead bytes
460      * modified by the signed offset (-0xff..+0xff) stored in the lower 16 bits.
461      *
462      * <p>CollationData.makeReorderRanges() writes a full list where the first range
463      * (at least for terminators and separators) has a 0 offset.
464      * The last range has a non-zero offset.
465      * minHighNoReorder is set to the limit of that last range.
466      *
467      * <p>In the settings object, the initial ranges before the first split lead byte
468      * are omitted for efficiency; they are handled by reorder(p) via the reorderTable.
469      * If there are no split-reordered lead bytes, then no ranges are needed.
470      */
471     long[] reorderRanges;
472     /** Array of reorder codes; ignored if length == 0. */
473     public int[] reorderCodes = EMPTY_INT_ARRAY;
474     // Note: In C++, we keep a memory block around for the reorder codes,
475     // the ranges, and the permutation table,
476     // and modify them for new codes.
477     // In Java, we simply copy references and then never modify the array contents.
478     // The caller must abandon the arrays.
479     // Reorder codes from the public setter API must be cloned.
480     private static final int[] EMPTY_INT_ARRAY = new int[0];
481 
482     /** Options for CollationFastLatin. Negative if disabled. */
483     public int fastLatinOptions = -1;
484     // fastLatinPrimaries.length must be equal to CollationFastLatin.LATIN_LIMIT,
485     // but we do not import CollationFastLatin to reduce circular dependencies.
486     public char[] fastLatinPrimaries = new char[0x180];  // mutable contents
487 }
488