1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ****************************************************************************** 6 * Copyright (C) 1996-2015, International Business Machines Corporation and 7 * others. All Rights Reserved. 8 ****************************************************************************** 9 */ 10 11 package ohos.global.icu.impl; 12 13 import java.nio.ByteBuffer; 14 import java.util.Arrays; 15 16 import ohos.global.icu.lang.UCharacter; 17 import ohos.global.icu.text.UTF16; 18 19 /** 20 * <p>A trie is a kind of compressed, serializable table of values 21 * associated with Unicode code points (0..0x10ffff).</p> 22 * <p>This class defines the basic structure of a trie and provides methods 23 * to <b>retrieve the offsets to the actual data</b>.</p> 24 * <p>Data will be the form of an array of basic types, char or int.</p> 25 * <p>The actual data format will have to be specified by the user in the 26 * inner static interface ohos.global.icu.impl.Trie.DataManipulate.</p> 27 * <p>This trie implementation is optimized for getting offset while walking 28 * forward through a UTF-16 string. 29 * Therefore, the simplest and fastest access macros are the 30 * fromLead() and fromOffsetTrail() methods. 31 * The fromBMP() method are a little more complicated; they get offsets even 32 * for lead surrogate codepoints, while the fromLead() method get special 33 * "folded" offsets for lead surrogate code units if there is relevant data 34 * associated with them. 35 * From such a folded offsets, an offset needs to be extracted to supply 36 * to the fromOffsetTrail() methods. 37 * To handle such supplementary codepoints, some offset information are kept 38 * in the data.</p> 39 * <p>Methods in ohos.global.icu.impl.Trie.DataManipulate are called to retrieve 40 * that offset from the folded value for the lead surrogate unit.</p> 41 * <p>For examples of use, see ohos.global.icu.impl.CharTrie or 42 * ohos.global.icu.impl.IntTrie.</p> 43 * @author synwee 44 * @see ohos.global.icu.impl.CharTrie 45 * @see ohos.global.icu.impl.IntTrie 46 * @hide exposed on OHOS 47 */ 48 public abstract class Trie 49 { 50 // public class declaration ---------------------------------------- 51 52 /** 53 * Character data in com.ibm.impl.Trie have different user-specified format 54 * for different purposes. 55 * This interface specifies methods to be implemented in order for 56 * com.ibm.impl.Trie, to surrogate offset information encapsulated within 57 * the data. 58 * @hide exposed on OHOS 59 */ 60 public static interface DataManipulate 61 { 62 /** 63 * Called by ohos.global.icu.impl.Trie to extract from a lead surrogate's 64 * data 65 * the index array offset of the indexes for that lead surrogate. 66 * @param value data value for a surrogate from the trie, including the 67 * folding offset 68 * @return data offset or 0 if there is no data for the lead surrogate 69 */ getFoldingOffset(int value)70 public int getFoldingOffset(int value); 71 } 72 73 // default implementation 74 private static class DefaultGetFoldingOffset implements DataManipulate { 75 @Override getFoldingOffset(int value)76 public int getFoldingOffset(int value) { 77 return value; 78 } 79 } 80 81 // public methods -------------------------------------------------- 82 83 /** 84 * Determines if this trie has a linear latin 1 array 85 * @return true if this trie has a linear latin 1 array, false otherwise 86 */ isLatin1Linear()87 public final boolean isLatin1Linear() 88 { 89 return m_isLatin1Linear_; 90 } 91 92 /** 93 * Checks if the argument Trie has the same data as this Trie. 94 * Attributes are checked but not the index data. 95 * @param other Trie to check 96 * @return true if the argument Trie has the same data as this Trie, false 97 * otherwise 98 */ 99 ///CLOVER:OFF 100 @Override equals(Object other)101 public boolean equals(Object other) 102 { 103 if (other == this) { 104 return true; 105 } 106 if (!(other instanceof Trie)) { 107 return false; 108 } 109 Trie othertrie = (Trie)other; 110 return m_isLatin1Linear_ == othertrie.m_isLatin1Linear_ 111 && m_options_ == othertrie.m_options_ 112 && m_dataLength_ == othertrie.m_dataLength_ 113 && Arrays.equals(m_index_, othertrie.m_index_); 114 } 115 116 @Override hashCode()117 public int hashCode() { 118 assert false : "hashCode not designed"; 119 return 42; 120 } 121 ///CLOVER:ON 122 123 /** 124 * Gets the serialized data file size of the Trie. This is used during 125 * trie data reading for size checking purposes. 126 * @return size size of serialized trie data file in terms of the number 127 * of bytes 128 */ getSerializedDataSize()129 public int getSerializedDataSize() 130 { 131 // includes signature, option, dataoffset and datalength output 132 int result = (4 << 2); 133 result += (m_dataOffset_ << 1); 134 if (isCharTrie()) { 135 result += (m_dataLength_ << 1); 136 } 137 else if (isIntTrie()) { 138 result += (m_dataLength_ << 2); 139 } 140 return result; 141 } 142 143 // protected constructor ------------------------------------------- 144 145 /** 146 * Trie constructor for CharTrie use. 147 * @param bytes data of an ICU data file, containing the trie 148 * @param dataManipulate object containing the information to parse the 149 * trie data 150 */ Trie(ByteBuffer bytes, DataManipulate dataManipulate)151 protected Trie(ByteBuffer bytes, DataManipulate dataManipulate) 152 { 153 // Magic number to authenticate the data. 154 int signature = bytes.getInt(); 155 m_options_ = bytes.getInt(); 156 157 if (!checkHeader(signature)) { 158 throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file"); 159 } 160 161 if(dataManipulate != null) { 162 m_dataManipulate_ = dataManipulate; 163 } else { 164 m_dataManipulate_ = new DefaultGetFoldingOffset(); 165 } 166 m_isLatin1Linear_ = (m_options_ & 167 HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0; 168 m_dataOffset_ = bytes.getInt(); 169 m_dataLength_ = bytes.getInt(); 170 unserialize(bytes); 171 } 172 173 /** 174 * Trie constructor 175 * @param index array to be used for index 176 * @param options used by the trie 177 * @param dataManipulate object containing the information to parse the 178 * trie data 179 */ Trie(char index[], int options, DataManipulate dataManipulate)180 protected Trie(char index[], int options, DataManipulate dataManipulate) 181 { 182 m_options_ = options; 183 if(dataManipulate != null) { 184 m_dataManipulate_ = dataManipulate; 185 } else { 186 m_dataManipulate_ = new DefaultGetFoldingOffset(); 187 } 188 m_isLatin1Linear_ = (m_options_ & 189 HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0; 190 m_index_ = index; 191 m_dataOffset_ = m_index_.length; 192 } 193 194 195 // protected data members ------------------------------------------ 196 197 /** 198 * Lead surrogate code points' index displacement in the index array. 199 * 0x10000-0xd800=0x2800 200 * 0x2800 >> INDEX_STAGE_1_SHIFT_ 201 */ 202 protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5; 203 /** 204 * Shift size for shifting right the input index. 1..9 205 */ 206 protected static final int INDEX_STAGE_1_SHIFT_ = 5; 207 /** 208 * Shift size for shifting left the index array values. 209 * Increases possible data size with 16-bit index values at the cost 210 * of compactability. 211 * This requires blocks of stage 2 data to be aligned by 212 * DATA_GRANULARITY. 213 * 0..INDEX_STAGE_1_SHIFT 214 */ 215 protected static final int INDEX_STAGE_2_SHIFT_ = 2; 216 /** 217 * Number of data values in a stage 2 (data array) block. 218 */ 219 protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_; 220 /** 221 * Mask for getting the lower bits from the input index. 222 * DATA_BLOCK_LENGTH - 1. 223 */ 224 protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1; 225 /** Number of bits of a trail surrogate that are used in index table lookups. */ 226 protected static final int SURROGATE_BLOCK_BITS=10-INDEX_STAGE_1_SHIFT_; 227 /** 228 * Number of index (stage 1) entries per lead surrogate. 229 * Same as number of index entries for 1024 trail surrogates, 230 * ==0x400>>INDEX_STAGE_1_SHIFT_ 231 */ 232 protected static final int SURROGATE_BLOCK_COUNT=(1<<SURROGATE_BLOCK_BITS); 233 /** Length of the BMP portion of the index (stage 1) array. */ 234 protected static final int BMP_INDEX_LENGTH=0x10000>>INDEX_STAGE_1_SHIFT_; 235 /** 236 * Surrogate mask to use when shifting offset to retrieve supplementary 237 * values 238 */ 239 protected static final int SURROGATE_MASK_ = 0x3FF; 240 /** 241 * Index or UTF16 characters 242 */ 243 protected char m_index_[]; 244 /** 245 * Internal TrieValue which handles the parsing of the data value. 246 * This class is to be implemented by the user 247 */ 248 protected DataManipulate m_dataManipulate_; 249 /** 250 * Start index of the data portion of the trie. CharTrie combines 251 * index and data into a char array, so this is used to indicate the 252 * initial offset to the data portion. 253 * Note this index always points to the initial value. 254 */ 255 protected int m_dataOffset_; 256 /** 257 * Length of the data array 258 */ 259 protected int m_dataLength_; 260 261 // protected methods ----------------------------------------------- 262 263 /** 264 * Gets the offset to the data which the surrogate pair points to. 265 * @param lead lead surrogate 266 * @param trail trailing surrogate 267 * @return offset to data 268 */ getSurrogateOffset(char lead, char trail)269 protected abstract int getSurrogateOffset(char lead, char trail); 270 271 /** 272 * Gets the value at the argument index 273 * @param index value at index will be retrieved 274 * @return 32 bit value 275 */ getValue(int index)276 protected abstract int getValue(int index); 277 278 /** 279 * Gets the default initial value 280 * @return 32 bit value 281 */ getInitialValue()282 protected abstract int getInitialValue(); 283 284 /** 285 * Gets the offset to the data which the index ch after variable offset 286 * points to. 287 * Note for locating a non-supplementary character data offset, calling 288 * <p> 289 * getRawOffset(0, ch); 290 * </p> 291 * will do. Otherwise if it is a supplementary character formed by 292 * surrogates lead and trail. Then we would have to call getRawOffset() 293 * with getFoldingIndexOffset(). See getSurrogateOffset(). 294 * @param offset index offset which ch is to start from 295 * @param ch index to be used after offset 296 * @return offset to the data 297 */ getRawOffset(int offset, char ch)298 protected final int getRawOffset(int offset, char ch) 299 { 300 return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)] 301 << INDEX_STAGE_2_SHIFT_) 302 + (ch & INDEX_STAGE_3_MASK_); 303 } 304 305 /** 306 * Gets the offset to data which the BMP character points to 307 * Treats a lead surrogate as a normal code point. 308 * @param ch BMP character 309 * @return offset to data 310 */ getBMPOffset(char ch)311 protected final int getBMPOffset(char ch) 312 { 313 return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE 314 && ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) 315 ? getRawOffset(LEAD_INDEX_OFFSET_, ch) 316 : getRawOffset(0, ch); 317 // using a getRawOffset(ch) makes no diff 318 } 319 320 /** 321 * Gets the offset to the data which this lead surrogate character points 322 * to. 323 * Data at the returned offset may contain folding offset information for 324 * the next trailing surrogate character. 325 * @param ch lead surrogate character 326 * @return offset to data 327 */ getLeadOffset(char ch)328 protected final int getLeadOffset(char ch) 329 { 330 return getRawOffset(0, ch); 331 } 332 333 /** 334 * Internal trie getter from a code point. 335 * Could be faster(?) but longer with 336 * if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); } 337 * Gets the offset to data which the codepoint points to 338 * @param ch codepoint 339 * @return offset to data 340 */ getCodePointOffset(int ch)341 protected final int getCodePointOffset(int ch) 342 { 343 // if ((ch >> 16) == 0) slower 344 if (ch < 0) { 345 return -1; 346 } else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) { 347 // fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works 348 return getRawOffset(0, (char)ch); 349 } else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) { 350 // BMP codepoint 351 return getBMPOffset((char)ch); 352 } else if (ch <= UCharacter.MAX_VALUE) { 353 // look at the construction of supplementary characters 354 // trail forms the ends of it. 355 return getSurrogateOffset(UTF16.getLeadSurrogate(ch), 356 (char)(ch & SURROGATE_MASK_)); 357 } else { 358 // return -1 if there is an error, in this case we return 359 return -1; 360 } 361 } 362 363 /** 364 * <p>Parses the byte buffer and creates the trie index with it.</p> 365 * <p>The position of the input ByteBuffer must be right after the trie header.</p> 366 * <p>This is overwritten by the child classes. 367 * @param bytes buffer containing trie data 368 */ unserialize(ByteBuffer bytes)369 protected void unserialize(ByteBuffer bytes) 370 { 371 m_index_ = ICUBinary.getChars(bytes, m_dataOffset_, 0); 372 } 373 374 /** 375 * Determines if this is a 32 bit trie 376 * @return true if options specifies this is a 32 bit trie 377 */ isIntTrie()378 protected final boolean isIntTrie() 379 { 380 return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) != 0; 381 } 382 383 /** 384 * Determines if this is a 16 bit trie 385 * @return true if this is a 16 bit trie 386 */ isCharTrie()387 protected final boolean isCharTrie() 388 { 389 return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0; 390 } 391 392 // private data members -------------------------------------------- 393 394 // struct UTrieHeader { 395 // int32_t signature; 396 // int32_t options (a bit field) 397 // int32_t indexLength 398 // int32_t dataLength 399 400 /** 401 * Size of Trie header in bytes 402 */ 403 protected static final int HEADER_LENGTH_ = 4 * 4; 404 /** 405 * Latin 1 option mask 406 */ 407 protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200; 408 /** 409 * Constant number to authenticate the byte block 410 */ 411 protected static final int HEADER_SIGNATURE_ = 0x54726965; 412 /** 413 * Header option formatting 414 */ 415 private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF; 416 protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4; 417 protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100; 418 419 /** 420 * Flag indicator for Latin quick access data block 421 */ 422 private boolean m_isLatin1Linear_; 423 424 /** 425 * <p>Trie options field.</p> 426 * <p>options bit field:<br> 427 * 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br> 428 * 8 0 = 16-bit data, 1=32-bit data<br> 429 * 7..4 INDEX_STAGE_1_SHIFT // 0..INDEX_STAGE_2_SHIFT<br> 430 * 3..0 INDEX_STAGE_2_SHIFT // 1..9<br> 431 */ 432 private int m_options_; 433 434 // private methods --------------------------------------------------- 435 436 /** 437 * Authenticates raw data header. 438 * Checking the header information, signature and options. 439 * @param signature This contains the options and type of a Trie 440 * @return true if the header is authenticated valid 441 */ checkHeader(int signature)442 private final boolean checkHeader(int signature) 443 { 444 // check the signature 445 // Trie in big-endian US-ASCII (0x54726965). 446 // Magic number to authenticate the data. 447 if (signature != HEADER_SIGNATURE_) { 448 return false; 449 } 450 451 if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) != 452 INDEX_STAGE_1_SHIFT_ || 453 ((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) & 454 HEADER_OPTIONS_SHIFT_MASK_) 455 != INDEX_STAGE_2_SHIFT_) { 456 return false; 457 } 458 return true; 459 } 460 } 461