• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  ******************************************************************************
5  * Copyright (C) 1996-2015, International Business Machines Corporation and
6  * others. All Rights Reserved.
7  ******************************************************************************
8  */
9 
10 package com.ibm.icu.impl;
11 
12 import java.nio.ByteBuffer;
13 import java.util.Arrays;
14 
15 import com.ibm.icu.lang.UCharacter;
16 import com.ibm.icu.text.UTF16;
17 
18 /**
19  * <p>A trie is a kind of compressed, serializable table of values
20  * associated with Unicode code points (0..0x10ffff).</p>
21  * <p>This class defines the basic structure of a trie and provides methods
22  * to <b>retrieve the offsets to the actual data</b>.</p>
23  * <p>Data will be the form of an array of basic types, char or int.</p>
24  * <p>The actual data format will have to be specified by the user in the
25  * inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p>
26  * <p>This trie implementation is optimized for getting offset while walking
27  * forward through a UTF-16 string.
28  * Therefore, the simplest and fastest access macros are the
29  * fromLead() and fromOffsetTrail() methods.
30  * The fromBMP() method are a little more complicated; they get offsets even
31  * for lead surrogate codepoints, while the fromLead() method get special
32  * "folded" offsets for lead surrogate code units if there is relevant data
33  * associated with them.
34  * From such a folded offsets, an offset needs to be extracted to supply
35  * to the fromOffsetTrail() methods.
36  * To handle such supplementary codepoints, some offset information are kept
37  * in the data.</p>
38  * <p>Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve
39  * that offset from the folded value for the lead surrogate unit.</p>
40  * <p>For examples of use, see com.ibm.icu.impl.CharTrie or
41  * com.ibm.icu.impl.IntTrie.</p>
42  * @author synwee
43  * @see com.ibm.icu.impl.CharTrie
44  * @see com.ibm.icu.impl.IntTrie
45  * @since release 2.1, Jan 01 2002
46  */
47 public abstract class Trie
48 {
49     // public class declaration ----------------------------------------
50 
51     /**
52     * Character data in com.ibm.impl.Trie have different user-specified format
53     * for different purposes.
54     * This interface specifies methods to be implemented in order for
55     * com.ibm.impl.Trie, to surrogate offset information encapsulated within
56     * the data.
57     */
58     public static interface DataManipulate
59     {
60         /**
61         * Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's
62         * data
63         * the index array offset of the indexes for that lead surrogate.
64         * @param value data value for a surrogate from the trie, including the
65         *        folding offset
66         * @return data offset or 0 if there is no data for the lead surrogate
67         */
getFoldingOffset(int value)68         public int getFoldingOffset(int value);
69     }
70 
71     // default implementation
72     private static class DefaultGetFoldingOffset implements DataManipulate {
73         @Override
getFoldingOffset(int value)74         public int getFoldingOffset(int value) {
75             return value;
76         }
77     }
78 
79     // public methods --------------------------------------------------
80 
81     /**
82      * Determines if this trie has a linear latin 1 array
83      * @return true if this trie has a linear latin 1 array, false otherwise
84      */
isLatin1Linear()85     public final boolean isLatin1Linear()
86     {
87         return m_isLatin1Linear_;
88     }
89 
90     /**
91      * Checks if the argument Trie has the same data as this Trie.
92      * Attributes are checked but not the index data.
93      * @param other Trie to check
94      * @return true if the argument Trie has the same data as this Trie, false
95      *         otherwise
96      */
97     ///CLOVER:OFF
98     @Override
equals(Object other)99     public boolean equals(Object other)
100     {
101         if (other == this) {
102             return true;
103         }
104         if (!(other instanceof Trie)) {
105             return false;
106         }
107         Trie othertrie = (Trie)other;
108         return m_isLatin1Linear_ == othertrie.m_isLatin1Linear_
109                && m_options_ == othertrie.m_options_
110                && m_dataLength_ == othertrie.m_dataLength_
111                && Arrays.equals(m_index_, othertrie.m_index_);
112     }
113 
114     @Override
hashCode()115     public int hashCode() {
116         assert false : "hashCode not designed";
117         return 42;
118     }
119     ///CLOVER:ON
120 
121     /**
122      * Gets the serialized data file size of the Trie. This is used during
123      * trie data reading for size checking purposes.
124      * @return size size of serialized trie data file in terms of the number
125      *              of bytes
126      */
getSerializedDataSize()127     public int getSerializedDataSize()
128     {
129         // includes signature, option, dataoffset and datalength output
130         int result = (4 << 2);
131         result += (m_dataOffset_ << 1);
132         if (isCharTrie()) {
133             result += (m_dataLength_ << 1);
134         }
135         else if (isIntTrie()) {
136             result += (m_dataLength_ << 2);
137         }
138         return result;
139     }
140 
141     // protected constructor -------------------------------------------
142 
143     /**
144      * Trie constructor for CharTrie use.
145      * @param bytes data of an ICU data file, containing the trie
146      * @param dataManipulate object containing the information to parse the
147      *                       trie data
148      */
Trie(ByteBuffer bytes, DataManipulate dataManipulate)149     protected Trie(ByteBuffer bytes, DataManipulate  dataManipulate)
150     {
151         // Magic number to authenticate the data.
152         int signature = bytes.getInt();
153         m_options_    = bytes.getInt();
154 
155         if (!checkHeader(signature)) {
156             throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");
157         }
158 
159         if(dataManipulate != null) {
160             m_dataManipulate_ = dataManipulate;
161         } else {
162             m_dataManipulate_ = new DefaultGetFoldingOffset();
163         }
164         m_isLatin1Linear_ = (m_options_ &
165                              HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
166         m_dataOffset_     = bytes.getInt();
167         m_dataLength_     = bytes.getInt();
168         unserialize(bytes);
169     }
170 
171     /**
172     * Trie constructor
173     * @param index array to be used for index
174     * @param options used by the trie
175     * @param dataManipulate object containing the information to parse the
176     *                       trie data
177     */
Trie(char index[], int options, DataManipulate dataManipulate)178     protected Trie(char index[], int options, DataManipulate dataManipulate)
179     {
180         m_options_ = options;
181         if(dataManipulate != null) {
182             m_dataManipulate_ = dataManipulate;
183         } else {
184             m_dataManipulate_ = new DefaultGetFoldingOffset();
185         }
186         m_isLatin1Linear_ = (m_options_ &
187                              HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
188         m_index_ = index;
189         m_dataOffset_ = m_index_.length;
190     }
191 
192 
193     // protected data members ------------------------------------------
194 
195     /**
196     * Lead surrogate code points' index displacement in the index array.
197     * 0x10000-0xd800=0x2800
198     * 0x2800 >> INDEX_STAGE_1_SHIFT_
199     */
200     protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;
201     /**
202     * Shift size for shifting right the input index. 1..9
203     */
204     protected static final int INDEX_STAGE_1_SHIFT_ = 5;
205     /**
206     * Shift size for shifting left the index array values.
207     * Increases possible data size with 16-bit index values at the cost
208     * of compactability.
209     * This requires blocks of stage 2 data to be aligned by
210     * DATA_GRANULARITY.
211     * 0..INDEX_STAGE_1_SHIFT
212     */
213     protected static final int INDEX_STAGE_2_SHIFT_ = 2;
214     /**
215      * Number of data values in a stage 2 (data array) block.
216      */
217     protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_;
218     /**
219     * Mask for getting the lower bits from the input index.
220     * DATA_BLOCK_LENGTH - 1.
221     */
222     protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1;
223     /** Number of bits of a trail surrogate that are used in index table lookups. */
224     protected static final int SURROGATE_BLOCK_BITS=10-INDEX_STAGE_1_SHIFT_;
225     /**
226      * Number of index (stage 1) entries per lead surrogate.
227      * Same as number of index entries for 1024 trail surrogates,
228      * ==0x400>>INDEX_STAGE_1_SHIFT_
229      */
230     protected static final int SURROGATE_BLOCK_COUNT=(1<<SURROGATE_BLOCK_BITS);
231     /** Length of the BMP portion of the index (stage 1) array. */
232     protected static final int BMP_INDEX_LENGTH=0x10000>>INDEX_STAGE_1_SHIFT_;
233     /**
234     * Surrogate mask to use when shifting offset to retrieve supplementary
235     * values
236     */
237     protected static final int SURROGATE_MASK_ = 0x3FF;
238     /**
239     * Index or UTF16 characters
240     */
241     protected char m_index_[];
242     /**
243     * Internal TrieValue which handles the parsing of the data value.
244     * This class is to be implemented by the user
245     */
246     protected DataManipulate m_dataManipulate_;
247     /**
248     * Start index of the data portion of the trie. CharTrie combines
249     * index and data into a char array, so this is used to indicate the
250     * initial offset to the data portion.
251     * Note this index always points to the initial value.
252     */
253     protected int m_dataOffset_;
254     /**
255     * Length of the data array
256     */
257     protected int m_dataLength_;
258 
259     // protected methods -----------------------------------------------
260 
261     /**
262     * Gets the offset to the data which the surrogate pair points to.
263     * @param lead lead surrogate
264     * @param trail trailing surrogate
265     * @return offset to data
266     */
getSurrogateOffset(char lead, char trail)267     protected abstract int getSurrogateOffset(char lead, char trail);
268 
269     /**
270     * Gets the value at the argument index
271     * @param index value at index will be retrieved
272     * @return 32 bit value
273     */
getValue(int index)274     protected abstract int getValue(int index);
275 
276     /**
277     * Gets the default initial value
278     * @return 32 bit value
279     */
getInitialValue()280     protected abstract int getInitialValue();
281 
282     /**
283     * Gets the offset to the data which the index ch after variable offset
284     * points to.
285     * Note for locating a non-supplementary character data offset, calling
286     * <p>
287     * getRawOffset(0, ch);
288     * </p>
289     * will do. Otherwise if it is a supplementary character formed by
290     * surrogates lead and trail. Then we would have to call getRawOffset()
291     * with getFoldingIndexOffset(). See getSurrogateOffset().
292     * @param offset index offset which ch is to start from
293     * @param ch index to be used after offset
294     * @return offset to the data
295     */
getRawOffset(int offset, char ch)296     protected final int getRawOffset(int offset, char ch)
297     {
298         return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)]
299                 << INDEX_STAGE_2_SHIFT_)
300                 + (ch & INDEX_STAGE_3_MASK_);
301     }
302 
303     /**
304     * Gets the offset to data which the BMP character points to
305     * Treats a lead surrogate as a normal code point.
306     * @param ch BMP character
307     * @return offset to data
308     */
getBMPOffset(char ch)309     protected final int getBMPOffset(char ch)
310     {
311         return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE
312                 && ch <= UTF16.LEAD_SURROGATE_MAX_VALUE)
313                 ? getRawOffset(LEAD_INDEX_OFFSET_, ch)
314                 : getRawOffset(0, ch);
315                 // using a getRawOffset(ch) makes no diff
316     }
317 
318     /**
319     * Gets the offset to the data which this lead surrogate character points
320     * to.
321     * Data at the returned offset may contain folding offset information for
322     * the next trailing surrogate character.
323     * @param ch lead surrogate character
324     * @return offset to data
325     */
getLeadOffset(char ch)326     protected final int getLeadOffset(char ch)
327     {
328        return getRawOffset(0, ch);
329     }
330 
331     /**
332     * Internal trie getter from a code point.
333     * Could be faster(?) but longer with
334     *   if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }
335     * Gets the offset to data which the codepoint points to
336     * @param ch codepoint
337     * @return offset to data
338     */
getCodePointOffset(int ch)339     protected final int getCodePointOffset(int ch)
340     {
341         // if ((ch >> 16) == 0) slower
342         if (ch < 0) {
343             return -1;
344         } else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
345             // fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works
346             return getRawOffset(0, (char)ch);
347         } else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
348             // BMP codepoint
349             return getBMPOffset((char)ch);
350         } else if (ch <= UCharacter.MAX_VALUE) {
351             // look at the construction of supplementary characters
352             // trail forms the ends of it.
353             return getSurrogateOffset(UTF16.getLeadSurrogate(ch),
354                                       (char)(ch & SURROGATE_MASK_));
355         } else {
356             // return -1 if there is an error, in this case we return
357             return -1;
358         }
359     }
360 
361     /**
362      * <p>Parses the byte buffer and creates the trie index with it.</p>
363      * <p>The position of the input ByteBuffer must be right after the trie header.</p>
364      * <p>This is overwritten by the child classes.
365      * @param bytes buffer containing trie data
366      */
unserialize(ByteBuffer bytes)367     protected void unserialize(ByteBuffer bytes)
368     {
369         m_index_ = ICUBinary.getChars(bytes, m_dataOffset_, 0);
370     }
371 
372     /**
373     * Determines if this is a 32 bit trie
374     * @return true if options specifies this is a 32 bit trie
375     */
isIntTrie()376     protected final boolean isIntTrie()
377     {
378         return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) != 0;
379     }
380 
381     /**
382     * Determines if this is a 16 bit trie
383     * @return true if this is a 16 bit trie
384     */
isCharTrie()385     protected final boolean isCharTrie()
386     {
387         return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0;
388     }
389 
390     // private data members --------------------------------------------
391 
392     //  struct UTrieHeader {
393     //      int32_t   signature;
394     //      int32_t   options  (a bit field)
395     //      int32_t   indexLength
396     //      int32_t   dataLength
397 
398     /**
399     * Size of Trie header in bytes
400     */
401     protected static final int HEADER_LENGTH_ = 4 * 4;
402     /**
403     * Latin 1 option mask
404     */
405     protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;
406     /**
407     * Constant number to authenticate the byte block
408     */
409     protected static final int HEADER_SIGNATURE_ = 0x54726965;
410     /**
411     * Header option formatting
412     */
413     private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF;
414     protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4;
415     protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100;
416 
417     /**
418     * Flag indicator for Latin quick access data block
419     */
420     private boolean m_isLatin1Linear_;
421 
422     /**
423     * <p>Trie options field.</p>
424     * <p>options bit field:<br>
425     * 9  1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br>
426     * 8  0 = 16-bit data, 1=32-bit data<br>
427     * 7..4  INDEX_STAGE_1_SHIFT   // 0..INDEX_STAGE_2_SHIFT<br>
428     * 3..0  INDEX_STAGE_2_SHIFT   // 1..9<br>
429     */
430     private int m_options_;
431 
432     // private methods ---------------------------------------------------
433 
434     /**
435     * Authenticates raw data header.
436     * Checking the header information, signature and options.
437     * @param signature This contains the options and type of a Trie
438     * @return true if the header is authenticated valid
439     */
checkHeader(int signature)440     private final boolean checkHeader(int signature)
441     {
442         // check the signature
443         // Trie in big-endian US-ASCII (0x54726965).
444         // Magic number to authenticate the data.
445         if (signature != HEADER_SIGNATURE_) {
446             return false;
447         }
448 
449         if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) !=
450                                                     INDEX_STAGE_1_SHIFT_ ||
451             ((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) &
452                                                 HEADER_OPTIONS_SHIFT_MASK_)
453                                                  != INDEX_STAGE_2_SHIFT_) {
454             return false;
455         }
456         return true;
457     }
458 }
459