• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5 ******************************************************************************
6 * Copyright (C) 1996-2010, International Business Machines Corporation and   *
7 * others. All Rights Reserved.                                               *
8 ******************************************************************************
9 */
10 
11 package ohos.global.icu.impl;
12 
13 import java.util.Arrays;
14 
15 import ohos.global.icu.lang.UCharacter;
16 
17 /**
18  * Builder class to manipulate and generate a trie.
19  * This is useful for ICU data in primitive types.
20  * Provides a compact way to store information that is indexed by Unicode
21  * values, such as character properties, types, keyboard values, etc. This is
22  * very useful when you have a block of Unicode data that contains significant
23  * values while the rest of the Unicode data is unused in the application or
24  * when you have a lot of redundance, such as where all 21,000 Han ideographs
25  * have the same value.  However, lookup is much faster than a hash table.
26  * A trie of any primitive data type serves two purposes:
27  * <UL type = round>
28  *     <LI>Fast access of the indexed values.
29  *     <LI>Smaller memory footprint.
30  * </UL>
31  * This is a direct port from the ICU4C version
32  * @author             Syn Wee Quek
33  * @hide exposed on OHOS
34  */
35 public class TrieBuilder
36 {
37     // public data member ----------------------------------------------
38 
39     /**
40      * Number of data values in a stage 2 (data array) block. 2, 4, 8, ..,
41      * 0x200
42      */
43     public static final int DATA_BLOCK_LENGTH = 1 << Trie.INDEX_STAGE_1_SHIFT_;
44 
45     // public class declaration ----------------------------------------
46 
47     /**
48      * Character data in com.ibm.impl.Trie have different user-specified format
49      * for different purposes.
50      * This interface specifies methods to be implemented in order for
51      * com.ibm.impl.Trie, to surrogate offset information encapsulated within
52      * the data.
53      * @hide exposed on OHOS
54      */
55     public static interface DataManipulate
56     {
57         /**
58          * Build-time trie callback function, used with serialize().
59          * This function calculates a lead surrogate's value including a
60          * folding offset from the 1024 supplementary code points
61          * [start..start+1024[ .
62          * It is U+10000 <= start <= U+10fc00 and (start&0x3ff)==0.
63          * The folding offset is provided by the caller.
64          * It is offset=UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT
65          * with n=0..1023.
66          * Instead of the offset itself, n can be stored in 10 bits - or fewer
67          * if it can be assumed that few lead surrogates have associated data.
68          * The returned value must be
69          *  - not zero if and only if there is relevant data for the
70          *                        corresponding 1024 supplementary code points
71          *  - such that UTrie.getFoldingOffset(UNewTrieGetFoldedValue(...,
72          *                                                    offset))==offset
73          * @return a folded value, or 0 if there is no relevant data for the
74          *         lead surrogate.
75          */
getFoldedValue(int start, int offset)76         public int getFoldedValue(int start, int offset);
77     }
78 
79     // public methods ----------------------------------------------------
80 
81     /**
82      * Checks if the character belongs to a zero block in the trie
83      * @param ch codepoint which data is to be retrieved
84      * @return true if ch is in the zero block
85      */
isInZeroBlock(int ch)86     public boolean isInZeroBlock(int ch)
87     {
88         // valid, uncompacted trie and valid c?
89         if (m_isCompacted_ || ch > UCharacter.MAX_VALUE
90             || ch < UCharacter.MIN_VALUE) {
91             return true;
92         }
93 
94         return m_index_[ch >> SHIFT_] == 0;
95     }
96 
97     // package private method -----------------------------------------------
98 
99     // protected data member -----------------------------------------------
100 
101     /**
102      * Index values at build-time are 32 bits wide for easier processing.
103      * Bit 31 is set if the data block is used by multiple index values
104      * (from setRange()).
105      */
106     protected int m_index_[];
107     protected int m_indexLength_;
108     protected int m_dataCapacity_;
109     protected int m_dataLength_;
110     protected boolean m_isLatin1Linear_;
111     protected boolean m_isCompacted_;
112     /**
113      * Map of adjusted indexes, used in utrie_compact().
114      * Maps from original indexes to new ones.
115      */
116     protected int m_map_[];
117 
118     /**
119      * Shift size for shifting right the input index. 1..9
120      */
121     protected static final int SHIFT_ = Trie.INDEX_STAGE_1_SHIFT_;
122     /**
123      * Length of the index (stage 1) array before folding.
124      * Maximum number of Unicode code points (0x110000) shifted right by
125      * SHIFT.
126      */
127     protected static final int MAX_INDEX_LENGTH_ = (0x110000 >> SHIFT_);
128     /**
129      * Length of the BMP portion of the index (stage 1) array.
130      */
131     protected static final int BMP_INDEX_LENGTH_ = 0x10000 >> SHIFT_;
132     /**
133      * Number of index (stage 1) entries per lead surrogate.
134      * Same as number of indexe entries for 1024 trail surrogates,
135      * ==0x400>>UTRIE_SHIFT
136      * 10 - SHIFT == Number of bits of a trail surrogate that are used in
137      *               index table lookups.
138      */
139     protected static final int SURROGATE_BLOCK_COUNT_ = 1 << (10 - SHIFT_);
140     /**
141      * Mask for getting the lower bits from the input index.
142      * DATA_BLOCK_LENGTH - 1.
143      */
144     protected static final int MASK_ = Trie.INDEX_STAGE_3_MASK_;
145     /**
146      * Shift size for shifting left the index array values.
147      * Increases possible data size with 16-bit index values at the cost
148      * of compactability.
149      * This requires blocks of stage 2 data to be aligned by UTRIE_DATA_GRANULARITY.
150      * 0..UTRIE_SHIFT
151      */
152     protected static final int INDEX_SHIFT_ = Trie.INDEX_STAGE_2_SHIFT_;
153     /**
154      * Maximum length of the runtime data (stage 2) array.
155      * Limited by 16-bit index values that are left-shifted by INDEX_SHIFT_.
156      */
157     protected static final int MAX_DATA_LENGTH_ = (0x10000 << INDEX_SHIFT_);
158     /**
159      * Shifting to position the index value in options
160      */
161     protected static final int OPTIONS_INDEX_SHIFT_ = 4;
162     /**
163      * If set, then the data (stage 2) array is 32 bits wide.
164      */
165     protected static final int OPTIONS_DATA_IS_32_BIT_ = 0x100;
166     /**
167      * If set, then Latin-1 data (for U+0000..U+00ff) is stored in the data
168      * (stage 2) array as a simple, linear array at data + DATA_BLOCK_LENGTH.
169      */
170     protected static final int OPTIONS_LATIN1_IS_LINEAR_ = 0x200;
171     /**
172      * The alignment size of a stage 2 data block. Also the granularity for
173      * compaction.
174      */
175     protected static final int DATA_GRANULARITY_ = 1 << INDEX_SHIFT_;
176 
177     // protected constructor ----------------------------------------------
178 
TrieBuilder()179     protected TrieBuilder()
180     {
181         m_index_ = new int[MAX_INDEX_LENGTH_];
182         m_map_ = new int[MAX_BUILD_TIME_DATA_LENGTH_ >> SHIFT_];
183         m_isLatin1Linear_ = false;
184         m_isCompacted_ = false;
185         m_indexLength_ = MAX_INDEX_LENGTH_;
186     }
187 
TrieBuilder(TrieBuilder table)188     protected TrieBuilder(TrieBuilder table)
189     {
190         m_index_ = new int[MAX_INDEX_LENGTH_];
191         m_indexLength_ = table.m_indexLength_;
192         System.arraycopy(table.m_index_, 0, m_index_, 0, m_indexLength_);
193         m_dataCapacity_ = table.m_dataCapacity_;
194         m_dataLength_ = table.m_dataLength_;
195         m_map_ = new int[table.m_map_.length];
196         System.arraycopy(table.m_map_, 0, m_map_, 0, m_map_.length);
197         m_isLatin1Linear_ = table.m_isLatin1Linear_;
198         m_isCompacted_ = table.m_isCompacted_;
199     }
200 
201     // protected functions ------------------------------------------------
202 
203     /**
204      * Compare two sections of an array for equality.
205      */
equal_int(int[] array, int start1, int start2, int length)206     protected static final boolean equal_int(int[] array, int start1, int start2, int length) {
207         while(length>0 && array[start1]==array[start2]) {
208             ++start1;
209             ++start2;
210             --length;
211         }
212         return length==0;
213     }
214 
215     /**
216      * Set a value in the trie index map to indicate which data block
217      * is referenced and which one is not.
218      * utrie_compact() will remove data blocks that are not used at all.
219      * Set
220      * - 0 if it is used
221      * - -1 if it is not used
222      */
findUnusedBlocks()223     protected void findUnusedBlocks()
224     {
225         // fill the entire map with "not used"
226         Arrays.fill(m_map_, 0xff);
227 
228         // mark each block that _is_ used with 0
229         for (int i = 0; i < m_indexLength_; ++ i) {
230             m_map_[Math.abs(m_index_[i]) >> SHIFT_] = 0;
231         }
232 
233         // never move the all-initial-value block 0
234         m_map_[0] = 0;
235     }
236 
237     /**
238      * Finds the same index block as the otherBlock
239      * @param index array
240      * @param indexLength size of index
241      * @param otherBlock
242      * @return same index block
243      */
findSameIndexBlock(int index[], int indexLength, int otherBlock)244     protected static final int findSameIndexBlock(int index[], int indexLength,
245                                                   int otherBlock)
246     {
247         for (int block = BMP_INDEX_LENGTH_; block < indexLength;
248              block += SURROGATE_BLOCK_COUNT_) {
249             if(equal_int(index, block, otherBlock, SURROGATE_BLOCK_COUNT_)) {
250                 return block;
251             }
252         }
253         return indexLength;
254     }
255 
256     // private data member ------------------------------------------------
257 
258     /**
259      * Maximum length of the build-time data (stage 2) array.
260      * The maximum length is 0x110000 + DATA_BLOCK_LENGTH + 0x400.
261      * (Number of Unicode code points + one all-initial-value block +
262      *  possible duplicate entries for 1024 lead surrogates.)
263      */
264     private static final int MAX_BUILD_TIME_DATA_LENGTH_ =
265         0x110000 + DATA_BLOCK_LENGTH + 0x400;
266 }
267