1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ****************************************************************************** 6 * Copyright (C) 1996-2010, International Business Machines Corporation and * 7 * others. All Rights Reserved. * 8 ****************************************************************************** 9 */ 10 11 package ohos.global.icu.impl; 12 13 import java.util.Arrays; 14 15 import ohos.global.icu.lang.UCharacter; 16 17 /** 18 * Builder class to manipulate and generate a trie. 19 * This is useful for ICU data in primitive types. 20 * Provides a compact way to store information that is indexed by Unicode 21 * values, such as character properties, types, keyboard values, etc. This is 22 * very useful when you have a block of Unicode data that contains significant 23 * values while the rest of the Unicode data is unused in the application or 24 * when you have a lot of redundance, such as where all 21,000 Han ideographs 25 * have the same value. However, lookup is much faster than a hash table. 26 * A trie of any primitive data type serves two purposes: 27 * <UL type = round> 28 * <LI>Fast access of the indexed values. 29 * <LI>Smaller memory footprint. 30 * </UL> 31 * This is a direct port from the ICU4C version 32 * @author Syn Wee Quek 33 * @hide exposed on OHOS 34 */ 35 public class TrieBuilder 36 { 37 // public data member ---------------------------------------------- 38 39 /** 40 * Number of data values in a stage 2 (data array) block. 2, 4, 8, .., 41 * 0x200 42 */ 43 public static final int DATA_BLOCK_LENGTH = 1 << Trie.INDEX_STAGE_1_SHIFT_; 44 45 // public class declaration ---------------------------------------- 46 47 /** 48 * Character data in com.ibm.impl.Trie have different user-specified format 49 * for different purposes. 50 * This interface specifies methods to be implemented in order for 51 * com.ibm.impl.Trie, to surrogate offset information encapsulated within 52 * the data. 53 * @hide exposed on OHOS 54 */ 55 public static interface DataManipulate 56 { 57 /** 58 * Build-time trie callback function, used with serialize(). 59 * This function calculates a lead surrogate's value including a 60 * folding offset from the 1024 supplementary code points 61 * [start..start+1024[ . 62 * It is U+10000 <= start <= U+10fc00 and (start&0x3ff)==0. 63 * The folding offset is provided by the caller. 64 * It is offset=UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT 65 * with n=0..1023. 66 * Instead of the offset itself, n can be stored in 10 bits - or fewer 67 * if it can be assumed that few lead surrogates have associated data. 68 * The returned value must be 69 * - not zero if and only if there is relevant data for the 70 * corresponding 1024 supplementary code points 71 * - such that UTrie.getFoldingOffset(UNewTrieGetFoldedValue(..., 72 * offset))==offset 73 * @return a folded value, or 0 if there is no relevant data for the 74 * lead surrogate. 75 */ getFoldedValue(int start, int offset)76 public int getFoldedValue(int start, int offset); 77 } 78 79 // public methods ---------------------------------------------------- 80 81 /** 82 * Checks if the character belongs to a zero block in the trie 83 * @param ch codepoint which data is to be retrieved 84 * @return true if ch is in the zero block 85 */ isInZeroBlock(int ch)86 public boolean isInZeroBlock(int ch) 87 { 88 // valid, uncompacted trie and valid c? 89 if (m_isCompacted_ || ch > UCharacter.MAX_VALUE 90 || ch < UCharacter.MIN_VALUE) { 91 return true; 92 } 93 94 return m_index_[ch >> SHIFT_] == 0; 95 } 96 97 // package private method ----------------------------------------------- 98 99 // protected data member ----------------------------------------------- 100 101 /** 102 * Index values at build-time are 32 bits wide for easier processing. 103 * Bit 31 is set if the data block is used by multiple index values 104 * (from setRange()). 105 */ 106 protected int m_index_[]; 107 protected int m_indexLength_; 108 protected int m_dataCapacity_; 109 protected int m_dataLength_; 110 protected boolean m_isLatin1Linear_; 111 protected boolean m_isCompacted_; 112 /** 113 * Map of adjusted indexes, used in utrie_compact(). 114 * Maps from original indexes to new ones. 115 */ 116 protected int m_map_[]; 117 118 /** 119 * Shift size for shifting right the input index. 1..9 120 */ 121 protected static final int SHIFT_ = Trie.INDEX_STAGE_1_SHIFT_; 122 /** 123 * Length of the index (stage 1) array before folding. 124 * Maximum number of Unicode code points (0x110000) shifted right by 125 * SHIFT. 126 */ 127 protected static final int MAX_INDEX_LENGTH_ = (0x110000 >> SHIFT_); 128 /** 129 * Length of the BMP portion of the index (stage 1) array. 130 */ 131 protected static final int BMP_INDEX_LENGTH_ = 0x10000 >> SHIFT_; 132 /** 133 * Number of index (stage 1) entries per lead surrogate. 134 * Same as number of indexe entries for 1024 trail surrogates, 135 * ==0x400>>UTRIE_SHIFT 136 * 10 - SHIFT == Number of bits of a trail surrogate that are used in 137 * index table lookups. 138 */ 139 protected static final int SURROGATE_BLOCK_COUNT_ = 1 << (10 - SHIFT_); 140 /** 141 * Mask for getting the lower bits from the input index. 142 * DATA_BLOCK_LENGTH - 1. 143 */ 144 protected static final int MASK_ = Trie.INDEX_STAGE_3_MASK_; 145 /** 146 * Shift size for shifting left the index array values. 147 * Increases possible data size with 16-bit index values at the cost 148 * of compactability. 149 * This requires blocks of stage 2 data to be aligned by UTRIE_DATA_GRANULARITY. 150 * 0..UTRIE_SHIFT 151 */ 152 protected static final int INDEX_SHIFT_ = Trie.INDEX_STAGE_2_SHIFT_; 153 /** 154 * Maximum length of the runtime data (stage 2) array. 155 * Limited by 16-bit index values that are left-shifted by INDEX_SHIFT_. 156 */ 157 protected static final int MAX_DATA_LENGTH_ = (0x10000 << INDEX_SHIFT_); 158 /** 159 * Shifting to position the index value in options 160 */ 161 protected static final int OPTIONS_INDEX_SHIFT_ = 4; 162 /** 163 * If set, then the data (stage 2) array is 32 bits wide. 164 */ 165 protected static final int OPTIONS_DATA_IS_32_BIT_ = 0x100; 166 /** 167 * If set, then Latin-1 data (for U+0000..U+00ff) is stored in the data 168 * (stage 2) array as a simple, linear array at data + DATA_BLOCK_LENGTH. 169 */ 170 protected static final int OPTIONS_LATIN1_IS_LINEAR_ = 0x200; 171 /** 172 * The alignment size of a stage 2 data block. Also the granularity for 173 * compaction. 174 */ 175 protected static final int DATA_GRANULARITY_ = 1 << INDEX_SHIFT_; 176 177 // protected constructor ---------------------------------------------- 178 TrieBuilder()179 protected TrieBuilder() 180 { 181 m_index_ = new int[MAX_INDEX_LENGTH_]; 182 m_map_ = new int[MAX_BUILD_TIME_DATA_LENGTH_ >> SHIFT_]; 183 m_isLatin1Linear_ = false; 184 m_isCompacted_ = false; 185 m_indexLength_ = MAX_INDEX_LENGTH_; 186 } 187 TrieBuilder(TrieBuilder table)188 protected TrieBuilder(TrieBuilder table) 189 { 190 m_index_ = new int[MAX_INDEX_LENGTH_]; 191 m_indexLength_ = table.m_indexLength_; 192 System.arraycopy(table.m_index_, 0, m_index_, 0, m_indexLength_); 193 m_dataCapacity_ = table.m_dataCapacity_; 194 m_dataLength_ = table.m_dataLength_; 195 m_map_ = new int[table.m_map_.length]; 196 System.arraycopy(table.m_map_, 0, m_map_, 0, m_map_.length); 197 m_isLatin1Linear_ = table.m_isLatin1Linear_; 198 m_isCompacted_ = table.m_isCompacted_; 199 } 200 201 // protected functions ------------------------------------------------ 202 203 /** 204 * Compare two sections of an array for equality. 205 */ equal_int(int[] array, int start1, int start2, int length)206 protected static final boolean equal_int(int[] array, int start1, int start2, int length) { 207 while(length>0 && array[start1]==array[start2]) { 208 ++start1; 209 ++start2; 210 --length; 211 } 212 return length==0; 213 } 214 215 /** 216 * Set a value in the trie index map to indicate which data block 217 * is referenced and which one is not. 218 * utrie_compact() will remove data blocks that are not used at all. 219 * Set 220 * - 0 if it is used 221 * - -1 if it is not used 222 */ findUnusedBlocks()223 protected void findUnusedBlocks() 224 { 225 // fill the entire map with "not used" 226 Arrays.fill(m_map_, 0xff); 227 228 // mark each block that _is_ used with 0 229 for (int i = 0; i < m_indexLength_; ++ i) { 230 m_map_[Math.abs(m_index_[i]) >> SHIFT_] = 0; 231 } 232 233 // never move the all-initial-value block 0 234 m_map_[0] = 0; 235 } 236 237 /** 238 * Finds the same index block as the otherBlock 239 * @param index array 240 * @param indexLength size of index 241 * @param otherBlock 242 * @return same index block 243 */ findSameIndexBlock(int index[], int indexLength, int otherBlock)244 protected static final int findSameIndexBlock(int index[], int indexLength, 245 int otherBlock) 246 { 247 for (int block = BMP_INDEX_LENGTH_; block < indexLength; 248 block += SURROGATE_BLOCK_COUNT_) { 249 if(equal_int(index, block, otherBlock, SURROGATE_BLOCK_COUNT_)) { 250 return block; 251 } 252 } 253 return indexLength; 254 } 255 256 // private data member ------------------------------------------------ 257 258 /** 259 * Maximum length of the build-time data (stage 2) array. 260 * The maximum length is 0x110000 + DATA_BLOCK_LENGTH + 0x400. 261 * (Number of Unicode code points + one all-initial-value block + 262 * possible duplicate entries for 1024 lead surrogates.) 263 */ 264 private static final int MAX_BUILD_TIME_DATA_LENGTH_ = 265 0x110000 + DATA_BLOCK_LENGTH + 0x400; 266 } 267