• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5 *******************************************************************************
6 * Copyright (C) 2010-2015, International Business Machines
7 * Corporation and others.  All Rights Reserved.
8 *******************************************************************************
9 * CollationData.java, ported from collationdata.h/.cpp
10 *
11 * C++ version created on: 2010oct27
12 * created by: Markus W. Scherer
13 */
14 
15 package ohos.global.icu.impl.coll;
16 
17 import ohos.global.icu.impl.Normalizer2Impl;
18 import ohos.global.icu.impl.Trie2_32;
19 import ohos.global.icu.lang.UScript;
20 import ohos.global.icu.text.Collator;
21 import ohos.global.icu.text.UnicodeSet;
22 import ohos.global.icu.util.ICUException;
23 
24 /**
25  * Collation data container.
26  * Immutable data created by a CollationDataBuilder, or loaded from a file,
27  * or deserialized from API-provided binary data.
28  *
29  * Includes data for the collation base (root/default), aliased if this is not the base.
30  * @hide exposed on OHOS
31  */
32 public final class CollationData {
33     // Note: The ucadata.icu loader could discover the reserved ranges by setting an array
34     // parallel with the ranges, and resetting ranges that are indexed.
35     // The reordering builder code could clone the resulting template array.
36     static final int REORDER_RESERVED_BEFORE_LATIN = Collator.ReorderCodes.FIRST + 14;
37     static final int REORDER_RESERVED_AFTER_LATIN = Collator.ReorderCodes.FIRST + 15;
38 
39     static final int MAX_NUM_SPECIAL_REORDER_CODES = 8;
40 
CollationData(Normalizer2Impl nfc)41     CollationData(Normalizer2Impl nfc) {
42         nfcImpl = nfc;
43     }
44 
getCE32(int c)45     public int getCE32(int c) {
46         return trie.get(c);
47     }
48 
getCE32FromSupplementary(int c)49     int getCE32FromSupplementary(int c) {
50         return trie.get(c);  // TODO: port UTRIE2_GET32_FROM_SUPP(trie, c) to Java?
51     }
52 
isDigit(int c)53     boolean isDigit(int c) {
54         return c < 0x660 ? c <= 0x39 && 0x30 <= c :
55                 Collation.hasCE32Tag(getCE32(c), Collation.DIGIT_TAG);
56     }
57 
isUnsafeBackward(int c, boolean numeric)58     public boolean isUnsafeBackward(int c, boolean numeric) {
59         return unsafeBackwardSet.contains(c) || (numeric && isDigit(c));
60     }
61 
isCompressibleLeadByte(int b)62     public boolean isCompressibleLeadByte(int b) {
63         return compressibleBytes[b];
64     }
65 
isCompressiblePrimary(long p)66     public boolean isCompressiblePrimary(long p) {
67         return isCompressibleLeadByte((int)p >>> 24);
68     }
69 
70     /**
71      * Returns the CE32 from two contexts words.
72      * Access to the defaultCE32 for contraction and prefix matching.
73      */
getCE32FromContexts(int index)74     int getCE32FromContexts(int index) {
75         return ((int)contexts.charAt(index) << 16) | contexts.charAt(index + 1);
76     }
77 
78     /**
79      * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG).
80      * Requires that ce32 is special.
81      */
getIndirectCE32(int ce32)82     int getIndirectCE32(int ce32) {
83         assert(Collation.isSpecialCE32(ce32));
84         int tag = Collation.tagFromCE32(ce32);
85         if(tag == Collation.DIGIT_TAG) {
86             // Fetch the non-numeric-collation CE32.
87             ce32 = ce32s[Collation.indexFromCE32(ce32)];
88         } else if(tag == Collation.LEAD_SURROGATE_TAG) {
89             ce32 = Collation.UNASSIGNED_CE32;
90         } else if(tag == Collation.U0000_TAG) {
91             // Fetch the normal ce32 for U+0000.
92             ce32 = ce32s[0];
93         }
94         return ce32;
95     }
96 
97     /**
98      * Returns the CE32 for an indirect special CE32 (e.g., with DIGIT_TAG),
99      * if ce32 is special.
100      */
getFinalCE32(int ce32)101     int getFinalCE32(int ce32) {
102         if(Collation.isSpecialCE32(ce32)) {
103             ce32 = getIndirectCE32(ce32);
104         }
105         return ce32;
106     }
107 
108     /**
109      * Computes a CE from c's ce32 which has the OFFSET_TAG.
110      */
getCEFromOffsetCE32(int c, int ce32)111     long getCEFromOffsetCE32(int c, int ce32) {
112         long dataCE = ces[Collation.indexFromCE32(ce32)];
113         return Collation.makeCE(Collation.getThreeBytePrimaryForOffsetData(c, dataCE));
114     }
115 
116     /**
117      * Returns the single CE that c maps to.
118      * Throws UnsupportedOperationException if c does not map to a single CE.
119      */
getSingleCE(int c)120     long getSingleCE(int c) {
121         CollationData d;
122         int ce32 = getCE32(c);
123         if(ce32 == Collation.FALLBACK_CE32) {
124             d = base;
125             ce32 = base.getCE32(c);
126         } else {
127             d = this;
128         }
129         while(Collation.isSpecialCE32(ce32)) {
130             switch(Collation.tagFromCE32(ce32)) {
131             case Collation.LATIN_EXPANSION_TAG:
132             case Collation.BUILDER_DATA_TAG:
133             case Collation.PREFIX_TAG:
134             case Collation.CONTRACTION_TAG:
135             case Collation.HANGUL_TAG:
136             case Collation.LEAD_SURROGATE_TAG:
137                 throw new UnsupportedOperationException(String.format(
138                         "there is not exactly one collation element for U+%04X (CE32 0x%08x)",
139                         c, ce32));
140             case Collation.FALLBACK_TAG:
141             case Collation.RESERVED_TAG_3:
142                 throw new AssertionError(String.format(
143                         "unexpected CE32 tag for U+%04X (CE32 0x%08x)", c, ce32));
144             case Collation.LONG_PRIMARY_TAG:
145                 return Collation.ceFromLongPrimaryCE32(ce32);
146             case Collation.LONG_SECONDARY_TAG:
147                 return Collation.ceFromLongSecondaryCE32(ce32);
148             case Collation.EXPANSION32_TAG:
149                 if(Collation.lengthFromCE32(ce32) == 1) {
150                     ce32 = d.ce32s[Collation.indexFromCE32(ce32)];
151                     break;
152                 } else {
153                     throw new UnsupportedOperationException(String.format(
154                             "there is not exactly one collation element for U+%04X (CE32 0x%08x)",
155                             c, ce32));
156                 }
157             case Collation.EXPANSION_TAG: {
158                 if(Collation.lengthFromCE32(ce32) == 1) {
159                     return d.ces[Collation.indexFromCE32(ce32)];
160                 } else {
161                     throw new UnsupportedOperationException(String.format(
162                             "there is not exactly one collation element for U+%04X (CE32 0x%08x)",
163                             c, ce32));
164                 }
165             }
166             case Collation.DIGIT_TAG:
167                 // Fetch the non-numeric-collation CE32 and continue.
168                 ce32 = d.ce32s[Collation.indexFromCE32(ce32)];
169                 break;
170             case Collation.U0000_TAG:
171                 assert(c == 0);
172                 // Fetch the normal ce32 for U+0000 and continue.
173                 ce32 = d.ce32s[0];
174                 break;
175             case Collation.OFFSET_TAG:
176                 return d.getCEFromOffsetCE32(c, ce32);
177             case Collation.IMPLICIT_TAG:
178                 return Collation.unassignedCEFromCodePoint(c);
179             }
180         }
181         return Collation.ceFromSimpleCE32(ce32);
182     }
183 
184     /**
185      * Returns the FCD16 value for code point c. c must be >= 0.
186      */
getFCD16(int c)187     int getFCD16(int c) {
188         return nfcImpl.getFCD16(c);
189     }
190 
191     /**
192      * Returns the first primary for the script's reordering group.
193      * @return the primary with only the first primary lead byte of the group
194      *         (not necessarily an actual root collator primary weight),
195      *         or 0 if the script is unknown
196      */
getFirstPrimaryForGroup(int script)197     long getFirstPrimaryForGroup(int script) {
198         int index = getScriptIndex(script);
199         return index == 0 ? 0 : (long)scriptStarts[index] << 16;
200     }
201 
202     /**
203      * Returns the last primary for the script's reordering group.
204      * @return the last primary of the group
205      *         (not an actual root collator primary weight),
206      *         or 0 if the script is unknown
207      */
getLastPrimaryForGroup(int script)208     public long getLastPrimaryForGroup(int script) {
209         int index = getScriptIndex(script);
210         if(index == 0) {
211             return 0;
212         }
213         long limit = scriptStarts[index + 1];
214         return (limit << 16) - 1;
215     }
216 
217     /**
218      * Finds the reordering group which contains the primary weight.
219      * @return the first script of the group, or -1 if the weight is beyond the last group
220      */
getGroupForPrimary(long p)221     public int getGroupForPrimary(long p) {
222         p >>= 16;
223         if(p < scriptStarts[1] || scriptStarts[scriptStarts.length - 1] <= p) {
224             return -1;
225         }
226         int index = 1;
227         while(p >= scriptStarts[index + 1]) { ++index; }
228         for(int i = 0; i < numScripts; ++i) {
229             if(scriptsIndex[i] == index) {
230                 return i;
231             }
232         }
233         for(int i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
234             if(scriptsIndex[numScripts + i] == index) {
235                 return Collator.ReorderCodes.FIRST + i;
236             }
237         }
238         return -1;
239     }
240 
getScriptIndex(int script)241     private int getScriptIndex(int script) {
242         if(script < 0) {
243             return 0;
244         } else if(script < numScripts) {
245             return scriptsIndex[script];
246         } else if(script < Collator.ReorderCodes.FIRST) {
247             return 0;
248         } else {
249             script -= Collator.ReorderCodes.FIRST;
250             if(script < MAX_NUM_SPECIAL_REORDER_CODES) {
251                 return scriptsIndex[numScripts + script];
252             } else {
253                 return 0;
254             }
255         }
256     }
257 
getEquivalentScripts(int script)258     public int[] getEquivalentScripts(int script) {
259         int index = getScriptIndex(script);
260         if(index == 0) { return EMPTY_INT_ARRAY; }
261         if(script >= Collator.ReorderCodes.FIRST) {
262             // Special groups have no aliases.
263             return new int[] { script };
264         }
265 
266         int length = 0;
267         for(int i = 0; i < numScripts; ++i) {
268             if(scriptsIndex[i] == index) {
269                 ++length;
270             }
271         }
272         int[] dest = new int[length];
273         if(length == 1) {
274             dest[0] = script;
275             return dest;
276         }
277         length = 0;
278         for(int i = 0; i < numScripts; ++i) {
279             if(scriptsIndex[i] == index) {
280                 dest[length++] = i;
281             }
282         }
283         return dest;
284     }
285 
286     /**
287      * Writes the permutation of primary-weight ranges
288      * for the given reordering of scripts and groups.
289      * The caller checks for illegal arguments and
290      * takes care of [DEFAULT] and memory allocation.
291      *
292      * <p>Each list element will be a (limit, offset) pair as described
293      * for the CollationSettings.reorderRanges.
294      * The list will be empty if no ranges are reordered.
295      */
makeReorderRanges(int[] reorder, UVector32 ranges)296     void makeReorderRanges(int[] reorder, UVector32 ranges) {
297         makeReorderRanges(reorder, false, ranges);
298     }
299 
makeReorderRanges(int[] reorder, boolean latinMustMove, UVector32 ranges)300     private void makeReorderRanges(int[] reorder, boolean latinMustMove, UVector32 ranges) {
301         ranges.removeAllElements();
302         int length = reorder.length;
303         if(length == 0 || (length == 1 && reorder[0] == UScript.UNKNOWN)) {
304             return;
305         }
306 
307         // Maps each script-or-group range to a new lead byte.
308         short[] table = new short[scriptStarts.length - 1];  // C++: uint8_t[]
309 
310         {
311             // Set "don't care" values for reserved ranges.
312             int index = scriptsIndex[
313                     numScripts + REORDER_RESERVED_BEFORE_LATIN - Collator.ReorderCodes.FIRST];
314             if(index != 0) {
315                 table[index] = 0xff;
316             }
317             index = scriptsIndex[
318                     numScripts + REORDER_RESERVED_AFTER_LATIN - Collator.ReorderCodes.FIRST];
319             if(index != 0) {
320                 table[index] = 0xff;
321             }
322         }
323 
324         // Never reorder special low and high primary lead bytes.
325         assert(scriptStarts.length >= 2);
326         assert(scriptStarts[0] == 0);
327         int lowStart = scriptStarts[1];
328         assert(lowStart == ((Collation.MERGE_SEPARATOR_BYTE + 1) << 8));
329         int highLimit = scriptStarts[scriptStarts.length - 1];
330         assert(highLimit == (Collation.TRAIL_WEIGHT_BYTE << 8));
331 
332         // Get the set of special reorder codes in the input list.
333         // This supports a fixed number of special reorder codes;
334         // it works for data with codes beyond Collator.ReorderCodes.LIMIT.
335         int specials = 0;
336         for(int i = 0; i < length; ++i) {
337             int reorderCode = reorder[i] - Collator.ReorderCodes.FIRST;
338             if(0 <= reorderCode && reorderCode < MAX_NUM_SPECIAL_REORDER_CODES) {
339                 specials |= 1 << reorderCode;
340             }
341         }
342 
343         // Start the reordering with the special low reorder codes that do not occur in the input.
344         for(int i = 0; i < MAX_NUM_SPECIAL_REORDER_CODES; ++i) {
345             int index = scriptsIndex[numScripts + i];
346             if(index != 0 && (specials & (1 << i)) == 0) {
347                 lowStart = addLowScriptRange(table, index, lowStart);
348             }
349         }
350 
351         // Skip the reserved range before Latin if Latin is the first script,
352         // so that we do not move it unnecessarily.
353         int skippedReserved = 0;
354         if(specials == 0 && reorder[0] == UScript.LATIN && !latinMustMove) {
355             int index = scriptsIndex[UScript.LATIN];
356             assert(index != 0);
357             int start = scriptStarts[index];
358             assert(lowStart <= start);
359             skippedReserved = start - lowStart;
360             lowStart = start;
361         }
362 
363         // Reorder according to the input scripts, continuing from the bottom of the primary range.
364         boolean hasReorderToEnd = false;
365         for(int i = 0; i < length;) {
366             int script = reorder[i++];
367             if(script == UScript.UNKNOWN) {
368                 // Put the remaining scripts at the top.
369                 hasReorderToEnd = true;
370                 while(i < length) {
371                     script = reorder[--length];
372                     if(script == UScript.UNKNOWN) {  // Must occur at most once.
373                         throw new IllegalArgumentException(
374                                 "setReorderCodes(): duplicate UScript.UNKNOWN");
375                     }
376                     if(script == Collator.ReorderCodes.DEFAULT) {
377                         throw new IllegalArgumentException(
378                                 "setReorderCodes(): UScript.DEFAULT together with other scripts");
379                     }
380                     int index = getScriptIndex(script);
381                     if(index == 0) { continue; }
382                     if(table[index] != 0) {  // Duplicate or equivalent script.
383                         throw new IllegalArgumentException(
384                                 "setReorderCodes(): duplicate or equivalent script " +
385                                 scriptCodeString(script));
386                     }
387                     highLimit = addHighScriptRange(table, index, highLimit);
388                 }
389                 break;
390             }
391             if(script == Collator.ReorderCodes.DEFAULT) {
392                 // The default code must be the only one in the list, and that is handled by the caller.
393                 // Otherwise it must not be used.
394                 throw new IllegalArgumentException(
395                         "setReorderCodes(): UScript.DEFAULT together with other scripts");
396             }
397             int index = getScriptIndex(script);
398             if(index == 0) { continue; }
399             if(table[index] != 0) {  // Duplicate or equivalent script.
400                 throw new IllegalArgumentException(
401                         "setReorderCodes(): duplicate or equivalent script " +
402                         scriptCodeString(script));
403             }
404             lowStart = addLowScriptRange(table, index, lowStart);
405         }
406 
407         // Put all remaining scripts into the middle.
408         for(int i = 1; i < scriptStarts.length - 1; ++i) {
409             int leadByte = table[i];
410             if(leadByte != 0) { continue; }
411             int start = scriptStarts[i];
412             if(!hasReorderToEnd && start > lowStart) {
413                 // No need to move this script.
414                 lowStart = start;
415             }
416             lowStart = addLowScriptRange(table, i, lowStart);
417         }
418         if(lowStart > highLimit) {
419             if((lowStart - (skippedReserved & 0xff00)) <= highLimit) {
420                 // Try not skipping the before-Latin reserved range.
421                 makeReorderRanges(reorder, true, ranges);
422                 return;
423             }
424             // We need more primary lead bytes than available, despite the reserved ranges.
425             throw new ICUException(
426                     "setReorderCodes(): reordering too many partial-primary-lead-byte scripts");
427         }
428 
429         // Turn lead bytes into a list of (limit, offset) pairs.
430         // Encode each pair in one list element:
431         // Upper 16 bits = limit, lower 16 = signed lead byte offset.
432         int offset = 0;
433         for(int i = 1;; ++i) {
434             int nextOffset = offset;
435             while(i < scriptStarts.length - 1) {
436                 int newLeadByte = table[i];
437                 if(newLeadByte == 0xff) {
438                     // "Don't care" lead byte for reserved range, continue with current offset.
439                 } else {
440                     nextOffset = newLeadByte - (scriptStarts[i] >> 8);
441                     if(nextOffset != offset) { break; }
442                 }
443                 ++i;
444             }
445             if(offset != 0 || i < scriptStarts.length - 1) {
446                 ranges.addElement(((int)scriptStarts[i] << 16) | (offset & 0xffff));
447             }
448             if(i == scriptStarts.length - 1) { break; }
449             offset = nextOffset;
450         }
451     }
452 
addLowScriptRange(short[] table, int index, int lowStart)453     private int addLowScriptRange(short[] table, int index, int lowStart) {
454         int start = scriptStarts[index];
455         if((start & 0xff) < (lowStart & 0xff)) {
456             lowStart += 0x100;
457         }
458         table[index] = (short)(lowStart >> 8);
459         int limit = scriptStarts[index + 1];
460         lowStart = ((lowStart & 0xff00) + ((limit & 0xff00) - (start & 0xff00))) | (limit & 0xff);
461         return lowStart;
462     }
463 
addHighScriptRange(short[] table, int index, int highLimit)464     private int addHighScriptRange(short[] table, int index, int highLimit) {
465         int limit = scriptStarts[index + 1];
466         if((limit & 0xff) > (highLimit & 0xff)) {
467             highLimit -= 0x100;
468         }
469         int start = scriptStarts[index];
470         highLimit = ((highLimit & 0xff00) - ((limit & 0xff00) - (start & 0xff00))) | (start & 0xff);
471         table[index] = (short)(highLimit >> 8);
472         return highLimit;
473     }
474 
scriptCodeString(int script)475     private static String scriptCodeString(int script) {
476         // Do not use the script name here: We do not want to depend on that data.
477         return (script < Collator.ReorderCodes.FIRST) ?
478                 Integer.toString(script) : "0x" + Integer.toHexString(script);
479     }
480 
481     private static final int[] EMPTY_INT_ARRAY = new int[0];
482 
483     /** @see jamoCE32s */
484     static final int JAMO_CE32S_LENGTH = 19 + 21 + 27;
485 
486     /** Main lookup trie. */
487     Trie2_32 trie;
488     /**
489      * Array of CE32 values.
490      * At index 0 there must be CE32(U+0000)
491      * to support U+0000's special-tag for NUL-termination handling.
492      */
493     int[] ce32s;
494     /** Array of CE values for expansions and OFFSET_TAG. */
495     long[] ces;
496     /** Array of prefix and contraction-suffix matching data. */
497     String contexts;
498     /** Base collation data, or null if this data itself is a base. */
499     public CollationData base;
500     /**
501      * Simple array of JAMO_CE32S_LENGTH=19+21+27 CE32s, one per canonical Jamo L/V/T.
502      * They are normally simple CE32s, rarely expansions.
503      * For fast handling of HANGUL_TAG.
504      */
505     int[] jamoCE32s = new int[JAMO_CE32S_LENGTH];
506     public Normalizer2Impl nfcImpl;
507     /** The single-byte primary weight (xx000000) for numeric collation. */
508     long numericPrimary = 0x12000000;
509 
510     /** 256 flags for which primary-weight lead bytes are compressible. */
511     public boolean[] compressibleBytes;
512     /**
513      * Set of code points that are unsafe for starting string comparison after an identical prefix,
514      * or in backwards CE iteration.
515      */
516     UnicodeSet unsafeBackwardSet;
517 
518     /**
519      * Fast Latin table for common-Latin-text string comparisons.
520      * Data structure see class CollationFastLatin.
521      */
522     public char[] fastLatinTable;
523     /**
524      * Header portion of the fastLatinTable.
525      * In C++, these are one array, and the header is skipped for mapping characters.
526      * In Java, two arrays work better.
527      */
528     char[] fastLatinTableHeader;
529 
530     /**
531      * Data for scripts and reordering groups.
532      * Uses include building a reordering permutation table and
533      * providing script boundaries to AlphabeticIndex.
534      */
535     int numScripts;
536     /**
537      * The length of scriptsIndex is numScripts+16.
538      * It maps from a UScriptCode or a special reorder code to an entry in scriptStarts.
539      * 16 special reorder codes (not all used) are mapped starting at numScripts.
540      * Up to MAX_NUM_SPECIAL_REORDER_CODES are codes for special groups like space/punct/digit.
541      * There are special codes at the end for reorder-reserved primary ranges.
542      *
543      * <p>Multiple scripts may share a range and index, for example Hira & Kana.
544      */
545     char[] scriptsIndex;
546     /**
547      * Start primary weight (top 16 bits only) for a group/script/reserved range
548      * indexed by scriptsIndex.
549      * The first range (separators & terminators) and the last range (trailing weights)
550      * are not reorderable, and no scriptsIndex entry points to them.
551      */
552     char[] scriptStarts;
553 
554     /**
555      * Collation elements in the root collator.
556      * Used by the CollationRootElements class. The data structure is described there.
557      * null in a tailoring.
558      */
559     public long[] rootElements;
560 }
561