• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GENERATED SOURCE. DO NOT MODIFY. */
2 // © 2016 and later: Unicode, Inc. and others.
3 // License & terms of use: http://www.unicode.org/copyright.html#License
4 /*
5  ******************************************************************************
6  * Copyright (C) 1996-2015, International Business Machines Corporation and
7  * others. All Rights Reserved.
8  ******************************************************************************
9  */
10 
11 package ohos.global.icu.impl;
12 
13 import java.nio.ByteBuffer;
14 import java.util.Arrays;
15 
16 import ohos.global.icu.lang.UCharacter;
17 import ohos.global.icu.text.UTF16;
18 
19 /**
20  * <p>A trie is a kind of compressed, serializable table of values
21  * associated with Unicode code points (0..0x10ffff).</p>
22  * <p>This class defines the basic structure of a trie and provides methods
23  * to <b>retrieve the offsets to the actual data</b>.</p>
24  * <p>Data will be the form of an array of basic types, char or int.</p>
25  * <p>The actual data format will have to be specified by the user in the
26  * inner static interface ohos.global.icu.impl.Trie.DataManipulate.</p>
27  * <p>This trie implementation is optimized for getting offset while walking
28  * forward through a UTF-16 string.
29  * Therefore, the simplest and fastest access macros are the
30  * fromLead() and fromOffsetTrail() methods.
31  * The fromBMP() method are a little more complicated; they get offsets even
32  * for lead surrogate codepoints, while the fromLead() method get special
33  * "folded" offsets for lead surrogate code units if there is relevant data
34  * associated with them.
35  * From such a folded offsets, an offset needs to be extracted to supply
36  * to the fromOffsetTrail() methods.
37  * To handle such supplementary codepoints, some offset information are kept
38  * in the data.</p>
39  * <p>Methods in ohos.global.icu.impl.Trie.DataManipulate are called to retrieve
40  * that offset from the folded value for the lead surrogate unit.</p>
41  * <p>For examples of use, see ohos.global.icu.impl.CharTrie or
42  * ohos.global.icu.impl.IntTrie.</p>
43  * @author synwee
44  * @see ohos.global.icu.impl.CharTrie
45  * @see ohos.global.icu.impl.IntTrie
46  * @hide exposed on OHOS
47  */
48 public abstract class Trie
49 {
50     // public class declaration ----------------------------------------
51 
52     /**
53     * Character data in com.ibm.impl.Trie have different user-specified format
54     * for different purposes.
55     * This interface specifies methods to be implemented in order for
56     * com.ibm.impl.Trie, to surrogate offset information encapsulated within
57     * the data.
58      * @hide exposed on OHOS
59     */
60     public static interface DataManipulate
61     {
62         /**
63         * Called by ohos.global.icu.impl.Trie to extract from a lead surrogate's
64         * data
65         * the index array offset of the indexes for that lead surrogate.
66         * @param value data value for a surrogate from the trie, including the
67         *        folding offset
68         * @return data offset or 0 if there is no data for the lead surrogate
69         */
getFoldingOffset(int value)70         public int getFoldingOffset(int value);
71     }
72 
73     // default implementation
74     private static class DefaultGetFoldingOffset implements DataManipulate {
75         @Override
getFoldingOffset(int value)76         public int getFoldingOffset(int value) {
77             return value;
78         }
79     }
80 
81     // public methods --------------------------------------------------
82 
83     /**
84      * Determines if this trie has a linear latin 1 array
85      * @return true if this trie has a linear latin 1 array, false otherwise
86      */
isLatin1Linear()87     public final boolean isLatin1Linear()
88     {
89         return m_isLatin1Linear_;
90     }
91 
92     /**
93      * Checks if the argument Trie has the same data as this Trie.
94      * Attributes are checked but not the index data.
95      * @param other Trie to check
96      * @return true if the argument Trie has the same data as this Trie, false
97      *         otherwise
98      */
99     ///CLOVER:OFF
100     @Override
equals(Object other)101     public boolean equals(Object other)
102     {
103         if (other == this) {
104             return true;
105         }
106         if (!(other instanceof Trie)) {
107             return false;
108         }
109         Trie othertrie = (Trie)other;
110         return m_isLatin1Linear_ == othertrie.m_isLatin1Linear_
111                && m_options_ == othertrie.m_options_
112                && m_dataLength_ == othertrie.m_dataLength_
113                && Arrays.equals(m_index_, othertrie.m_index_);
114     }
115 
116     @Override
hashCode()117     public int hashCode() {
118         assert false : "hashCode not designed";
119         return 42;
120     }
121     ///CLOVER:ON
122 
123     /**
124      * Gets the serialized data file size of the Trie. This is used during
125      * trie data reading for size checking purposes.
126      * @return size size of serialized trie data file in terms of the number
127      *              of bytes
128      */
getSerializedDataSize()129     public int getSerializedDataSize()
130     {
131         // includes signature, option, dataoffset and datalength output
132         int result = (4 << 2);
133         result += (m_dataOffset_ << 1);
134         if (isCharTrie()) {
135             result += (m_dataLength_ << 1);
136         }
137         else if (isIntTrie()) {
138             result += (m_dataLength_ << 2);
139         }
140         return result;
141     }
142 
143     // protected constructor -------------------------------------------
144 
145     /**
146      * Trie constructor for CharTrie use.
147      * @param bytes data of an ICU data file, containing the trie
148      * @param dataManipulate object containing the information to parse the
149      *                       trie data
150      */
Trie(ByteBuffer bytes, DataManipulate dataManipulate)151     protected Trie(ByteBuffer bytes, DataManipulate  dataManipulate)
152     {
153         // Magic number to authenticate the data.
154         int signature = bytes.getInt();
155         m_options_    = bytes.getInt();
156 
157         if (!checkHeader(signature)) {
158             throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");
159         }
160 
161         if(dataManipulate != null) {
162             m_dataManipulate_ = dataManipulate;
163         } else {
164             m_dataManipulate_ = new DefaultGetFoldingOffset();
165         }
166         m_isLatin1Linear_ = (m_options_ &
167                              HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
168         m_dataOffset_     = bytes.getInt();
169         m_dataLength_     = bytes.getInt();
170         unserialize(bytes);
171     }
172 
173     /**
174     * Trie constructor
175     * @param index array to be used for index
176     * @param options used by the trie
177     * @param dataManipulate object containing the information to parse the
178     *                       trie data
179     */
Trie(char index[], int options, DataManipulate dataManipulate)180     protected Trie(char index[], int options, DataManipulate dataManipulate)
181     {
182         m_options_ = options;
183         if(dataManipulate != null) {
184             m_dataManipulate_ = dataManipulate;
185         } else {
186             m_dataManipulate_ = new DefaultGetFoldingOffset();
187         }
188         m_isLatin1Linear_ = (m_options_ &
189                              HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
190         m_index_ = index;
191         m_dataOffset_ = m_index_.length;
192     }
193 
194 
195     // protected data members ------------------------------------------
196 
197     /**
198     * Lead surrogate code points' index displacement in the index array.
199     * 0x10000-0xd800=0x2800
200     * 0x2800 >> INDEX_STAGE_1_SHIFT_
201     */
202     protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;
203     /**
204     * Shift size for shifting right the input index. 1..9
205     */
206     protected static final int INDEX_STAGE_1_SHIFT_ = 5;
207     /**
208     * Shift size for shifting left the index array values.
209     * Increases possible data size with 16-bit index values at the cost
210     * of compactability.
211     * This requires blocks of stage 2 data to be aligned by
212     * DATA_GRANULARITY.
213     * 0..INDEX_STAGE_1_SHIFT
214     */
215     protected static final int INDEX_STAGE_2_SHIFT_ = 2;
216     /**
217      * Number of data values in a stage 2 (data array) block.
218      */
219     protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_;
220     /**
221     * Mask for getting the lower bits from the input index.
222     * DATA_BLOCK_LENGTH - 1.
223     */
224     protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1;
225     /** Number of bits of a trail surrogate that are used in index table lookups. */
226     protected static final int SURROGATE_BLOCK_BITS=10-INDEX_STAGE_1_SHIFT_;
227     /**
228      * Number of index (stage 1) entries per lead surrogate.
229      * Same as number of index entries for 1024 trail surrogates,
230      * ==0x400>>INDEX_STAGE_1_SHIFT_
231      */
232     protected static final int SURROGATE_BLOCK_COUNT=(1<<SURROGATE_BLOCK_BITS);
233     /** Length of the BMP portion of the index (stage 1) array. */
234     protected static final int BMP_INDEX_LENGTH=0x10000>>INDEX_STAGE_1_SHIFT_;
235     /**
236     * Surrogate mask to use when shifting offset to retrieve supplementary
237     * values
238     */
239     protected static final int SURROGATE_MASK_ = 0x3FF;
240     /**
241     * Index or UTF16 characters
242     */
243     protected char m_index_[];
244     /**
245     * Internal TrieValue which handles the parsing of the data value.
246     * This class is to be implemented by the user
247     */
248     protected DataManipulate m_dataManipulate_;
249     /**
250     * Start index of the data portion of the trie. CharTrie combines
251     * index and data into a char array, so this is used to indicate the
252     * initial offset to the data portion.
253     * Note this index always points to the initial value.
254     */
255     protected int m_dataOffset_;
256     /**
257     * Length of the data array
258     */
259     protected int m_dataLength_;
260 
261     // protected methods -----------------------------------------------
262 
263     /**
264     * Gets the offset to the data which the surrogate pair points to.
265     * @param lead lead surrogate
266     * @param trail trailing surrogate
267     * @return offset to data
268     */
getSurrogateOffset(char lead, char trail)269     protected abstract int getSurrogateOffset(char lead, char trail);
270 
271     /**
272     * Gets the value at the argument index
273     * @param index value at index will be retrieved
274     * @return 32 bit value
275     */
getValue(int index)276     protected abstract int getValue(int index);
277 
278     /**
279     * Gets the default initial value
280     * @return 32 bit value
281     */
getInitialValue()282     protected abstract int getInitialValue();
283 
284     /**
285     * Gets the offset to the data which the index ch after variable offset
286     * points to.
287     * Note for locating a non-supplementary character data offset, calling
288     * <p>
289     * getRawOffset(0, ch);
290     * </p>
291     * will do. Otherwise if it is a supplementary character formed by
292     * surrogates lead and trail. Then we would have to call getRawOffset()
293     * with getFoldingIndexOffset(). See getSurrogateOffset().
294     * @param offset index offset which ch is to start from
295     * @param ch index to be used after offset
296     * @return offset to the data
297     */
getRawOffset(int offset, char ch)298     protected final int getRawOffset(int offset, char ch)
299     {
300         return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)]
301                 << INDEX_STAGE_2_SHIFT_)
302                 + (ch & INDEX_STAGE_3_MASK_);
303     }
304 
305     /**
306     * Gets the offset to data which the BMP character points to
307     * Treats a lead surrogate as a normal code point.
308     * @param ch BMP character
309     * @return offset to data
310     */
getBMPOffset(char ch)311     protected final int getBMPOffset(char ch)
312     {
313         return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE
314                 && ch <= UTF16.LEAD_SURROGATE_MAX_VALUE)
315                 ? getRawOffset(LEAD_INDEX_OFFSET_, ch)
316                 : getRawOffset(0, ch);
317                 // using a getRawOffset(ch) makes no diff
318     }
319 
320     /**
321     * Gets the offset to the data which this lead surrogate character points
322     * to.
323     * Data at the returned offset may contain folding offset information for
324     * the next trailing surrogate character.
325     * @param ch lead surrogate character
326     * @return offset to data
327     */
getLeadOffset(char ch)328     protected final int getLeadOffset(char ch)
329     {
330        return getRawOffset(0, ch);
331     }
332 
333     /**
334     * Internal trie getter from a code point.
335     * Could be faster(?) but longer with
336     *   if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }
337     * Gets the offset to data which the codepoint points to
338     * @param ch codepoint
339     * @return offset to data
340     */
getCodePointOffset(int ch)341     protected final int getCodePointOffset(int ch)
342     {
343         // if ((ch >> 16) == 0) slower
344         if (ch < 0) {
345             return -1;
346         } else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
347             // fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works
348             return getRawOffset(0, (char)ch);
349         } else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
350             // BMP codepoint
351             return getBMPOffset((char)ch);
352         } else if (ch <= UCharacter.MAX_VALUE) {
353             // look at the construction of supplementary characters
354             // trail forms the ends of it.
355             return getSurrogateOffset(UTF16.getLeadSurrogate(ch),
356                                       (char)(ch & SURROGATE_MASK_));
357         } else {
358             // return -1 if there is an error, in this case we return
359             return -1;
360         }
361     }
362 
363     /**
364      * <p>Parses the byte buffer and creates the trie index with it.</p>
365      * <p>The position of the input ByteBuffer must be right after the trie header.</p>
366      * <p>This is overwritten by the child classes.
367      * @param bytes buffer containing trie data
368      */
unserialize(ByteBuffer bytes)369     protected void unserialize(ByteBuffer bytes)
370     {
371         m_index_ = ICUBinary.getChars(bytes, m_dataOffset_, 0);
372     }
373 
374     /**
375     * Determines if this is a 32 bit trie
376     * @return true if options specifies this is a 32 bit trie
377     */
isIntTrie()378     protected final boolean isIntTrie()
379     {
380         return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) != 0;
381     }
382 
383     /**
384     * Determines if this is a 16 bit trie
385     * @return true if this is a 16 bit trie
386     */
isCharTrie()387     protected final boolean isCharTrie()
388     {
389         return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0;
390     }
391 
392     // private data members --------------------------------------------
393 
394     //  struct UTrieHeader {
395     //      int32_t   signature;
396     //      int32_t   options  (a bit field)
397     //      int32_t   indexLength
398     //      int32_t   dataLength
399 
400     /**
401     * Size of Trie header in bytes
402     */
403     protected static final int HEADER_LENGTH_ = 4 * 4;
404     /**
405     * Latin 1 option mask
406     */
407     protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;
408     /**
409     * Constant number to authenticate the byte block
410     */
411     protected static final int HEADER_SIGNATURE_ = 0x54726965;
412     /**
413     * Header option formatting
414     */
415     private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF;
416     protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4;
417     protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100;
418 
419     /**
420     * Flag indicator for Latin quick access data block
421     */
422     private boolean m_isLatin1Linear_;
423 
424     /**
425     * <p>Trie options field.</p>
426     * <p>options bit field:<br>
427     * 9  1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br>
428     * 8  0 = 16-bit data, 1=32-bit data<br>
429     * 7..4  INDEX_STAGE_1_SHIFT   // 0..INDEX_STAGE_2_SHIFT<br>
430     * 3..0  INDEX_STAGE_2_SHIFT   // 1..9<br>
431     */
432     private int m_options_;
433 
434     // private methods ---------------------------------------------------
435 
436     /**
437     * Authenticates raw data header.
438     * Checking the header information, signature and options.
439     * @param signature This contains the options and type of a Trie
440     * @return true if the header is authenticated valid
441     */
checkHeader(int signature)442     private final boolean checkHeader(int signature)
443     {
444         // check the signature
445         // Trie in big-endian US-ASCII (0x54726965).
446         // Magic number to authenticate the data.
447         if (signature != HEADER_SIGNATURE_) {
448             return false;
449         }
450 
451         if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) !=
452                                                     INDEX_STAGE_1_SHIFT_ ||
453             ((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) &
454                                                 HEADER_OPTIONS_SHIFT_MASK_)
455                                                  != INDEX_STAGE_2_SHIFT_) {
456             return false;
457         }
458         return true;
459     }
460 }
461