1 // © 2017 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 // ucptrie_impl.h (modified from utrie2_impl.h) 5 // created: 2017dec29 Markus W. Scherer 6 7 #ifndef __UCPTRIE_IMPL_H__ 8 #define __UCPTRIE_IMPL_H__ 9 10 #include "unicode/ucptrie.h" 11 #ifdef UCPTRIE_DEBUG 12 #include "unicode/umutablecptrie.h" 13 #endif 14 15 // UCPTrie signature values, in platform endianness and opposite endianness. 16 // The UCPTrie signature ASCII byte values spell "Tri3". 17 #define UCPTRIE_SIG 0x54726933 18 #define UCPTRIE_OE_SIG 0x33697254 19 20 /** 21 * Header data for the binary, memory-mappable representation of a UCPTrie/CodePointTrie. 22 * @internal 23 */ 24 struct UCPTrieHeader { 25 /** "Tri3" in big-endian US-ASCII (0x54726933) */ 26 uint32_t signature; 27 28 /** 29 * Options bit field: 30 * Bits 15..12: Data length bits 19..16. 31 * Bits 11..8: Data null block offset bits 19..16. 32 * Bits 7..6: UCPTrieType 33 * Bits 5..3: Reserved (0). 34 * Bits 2..0: UCPTrieValueWidth 35 */ 36 uint16_t options; 37 38 /** Total length of the index tables. */ 39 uint16_t indexLength; 40 41 /** Data length bits 15..0. */ 42 uint16_t dataLength; 43 44 /** Index-3 null block offset, 0x7fff or 0xffff if none. */ 45 uint16_t index3NullOffset; 46 47 /** Data null block offset bits 15..0, 0xfffff if none. */ 48 uint16_t dataNullOffset; 49 50 /** 51 * First code point of the single-value range ending with U+10ffff, 52 * rounded up and then shifted right by UCPTRIE_SHIFT_2. 53 */ 54 uint16_t shiftedHighStart; 55 }; 56 57 /** 58 * Constants for use with UCPTrieHeader.options. 59 * @internal 60 */ 61 enum { 62 UCPTRIE_OPTIONS_DATA_LENGTH_MASK = 0xf000, 63 UCPTRIE_OPTIONS_DATA_NULL_OFFSET_MASK = 0xf00, 64 UCPTRIE_OPTIONS_RESERVED_MASK = 0x38, 65 UCPTRIE_OPTIONS_VALUE_BITS_MASK = 7, 66 /** 67 * Value for index3NullOffset which indicates that there is no index-3 null block. 68 * Bit 15 is unused for this value because this bit is used if the index-3 contains 69 * 18-bit indexes. 70 */ 71 UCPTRIE_NO_INDEX3_NULL_OFFSET = 0x7fff, 72 UCPTRIE_NO_DATA_NULL_OFFSET = 0xfffff 73 }; 74 75 // Internal constants. 76 enum { 77 /** The length of the BMP index table. 1024=0x400 */ 78 UCPTRIE_BMP_INDEX_LENGTH = 0x10000 >> UCPTRIE_FAST_SHIFT, 79 80 UCPTRIE_SMALL_LIMIT = 0x1000, 81 UCPTRIE_SMALL_INDEX_LENGTH = UCPTRIE_SMALL_LIMIT >> UCPTRIE_FAST_SHIFT, 82 83 /** Shift size for getting the index-3 table offset. */ 84 UCPTRIE_SHIFT_3 = 4, 85 86 /** Shift size for getting the index-2 table offset. */ 87 UCPTRIE_SHIFT_2 = 5 + UCPTRIE_SHIFT_3, 88 89 /** Shift size for getting the index-1 table offset. */ 90 UCPTRIE_SHIFT_1 = 5 + UCPTRIE_SHIFT_2, 91 92 /** 93 * Difference between two shift sizes, 94 * for getting an index-2 offset from an index-3 offset. 5=9-4 95 */ 96 UCPTRIE_SHIFT_2_3 = UCPTRIE_SHIFT_2 - UCPTRIE_SHIFT_3, 97 98 /** 99 * Difference between two shift sizes, 100 * for getting an index-1 offset from an index-2 offset. 5=14-9 101 */ 102 UCPTRIE_SHIFT_1_2 = UCPTRIE_SHIFT_1 - UCPTRIE_SHIFT_2, 103 104 /** 105 * Number of index-1 entries for the BMP. (4) 106 * This part of the index-1 table is omitted from the serialized form. 107 */ 108 UCPTRIE_OMITTED_BMP_INDEX_1_LENGTH = 0x10000 >> UCPTRIE_SHIFT_1, 109 110 /** Number of entries in an index-2 block. 32=0x20 */ 111 UCPTRIE_INDEX_2_BLOCK_LENGTH = 1 << UCPTRIE_SHIFT_1_2, 112 113 /** Mask for getting the lower bits for the in-index-2-block offset. */ 114 UCPTRIE_INDEX_2_MASK = UCPTRIE_INDEX_2_BLOCK_LENGTH - 1, 115 116 /** Number of code points per index-2 table entry. 512=0x200 */ 117 UCPTRIE_CP_PER_INDEX_2_ENTRY = 1 << UCPTRIE_SHIFT_2, 118 119 /** Number of entries in an index-3 block. 32=0x20 */ 120 UCPTRIE_INDEX_3_BLOCK_LENGTH = 1 << UCPTRIE_SHIFT_2_3, 121 122 /** Mask for getting the lower bits for the in-index-3-block offset. */ 123 UCPTRIE_INDEX_3_MASK = UCPTRIE_INDEX_3_BLOCK_LENGTH - 1, 124 125 /** Number of entries in a small data block. 16=0x10 */ 126 UCPTRIE_SMALL_DATA_BLOCK_LENGTH = 1 << UCPTRIE_SHIFT_3, 127 128 /** Mask for getting the lower bits for the in-small-data-block offset. */ 129 UCPTRIE_SMALL_DATA_MASK = UCPTRIE_SMALL_DATA_BLOCK_LENGTH - 1 130 }; 131 132 typedef UChar32 133 UCPTrieGetRange(const void *trie, UChar32 start, 134 UCPMapValueFilter *filter, const void *context, uint32_t *pValue); 135 136 U_CFUNC UChar32 137 ucptrie_internalGetRange(UCPTrieGetRange *getRange, 138 const void *trie, UChar32 start, 139 UCPMapRangeOption option, uint32_t surrogateValue, 140 UCPMapValueFilter *filter, const void *context, uint32_t *pValue); 141 142 #ifdef UCPTRIE_DEBUG 143 U_CFUNC void 144 ucptrie_printLengths(const UCPTrie *trie, const char *which); 145 146 U_CFUNC void umutablecptrie_setName(UMutableCPTrie *builder, const char *name); 147 #endif 148 149 /* 150 * Format of the binary, memory-mappable representation of a UCPTrie/CodePointTrie. 151 * For overview information see https://icu.unicode.org/design/struct/utrie 152 * 153 * The binary trie data should be 32-bit-aligned. 154 * The overall layout is: 155 * 156 * UCPTrieHeader header; -- 16 bytes, see struct definition above 157 * uint16_t index[header.indexLength]; 158 * uintXY_t data[header.dataLength]; 159 * 160 * The trie data array is an array of uint16_t, uint32_t, or uint8_t, 161 * specified via the UCPTrieValueWidth when building the trie. 162 * The data array is 32-bit-aligned for uint32_t, otherwise 16-bit-aligned. 163 * The overall length of the trie data is a multiple of 4 bytes. 164 * (Padding is added at the end of the index array and/or near the end of the data array as needed.) 165 * 166 * The length of the data array (dataLength) is stored as an integer split across two fields 167 * of the header struct (high bits in header.options). 168 * 169 * The trie type can be "fast" or "small" which determines the index structure, 170 * specified via the UCPTrieType when building the trie. 171 * 172 * The type and valueWidth are stored in the header.options. 173 * There are reserved type and valueWidth values, and reserved header.options bits. 174 * They could be used in future format extensions. 175 * Code reading the trie structure must fail with an error when unknown values or options are set. 176 * 177 * Values for ASCII character (U+0000..U+007F) can always be found at the start of the data array. 178 * 179 * Values for code points below a type-specific fast-indexing limit are found via two-stage lookup. 180 * For a "fast" trie, the limit is the BMP/supplementary boundary at U+10000. 181 * For a "small" trie, the limit is UCPTRIE_SMALL_MAX+1=U+1000. 182 * 183 * All code points in the range highStart..U+10FFFF map to a single highValue 184 * which is stored at the second-to-last position of the data array. 185 * (See UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET.) 186 * The highStart value is header.shiftedHighStart<<UCPTRIE_SHIFT_2. 187 * (UCPTRIE_SHIFT_2=9) 188 * 189 * Values for code points fast_limit..highStart-1 are found via four-stage lookup. 190 * The data block size is smaller for this range than for the fast range. 191 * This together with more index stages with small blocks makes this range 192 * more easily compactable. 193 * 194 * There is also a trie error value stored at the last position of the data array. 195 * (See UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET.) 196 * It is intended to be returned for inputs that are not Unicode code points 197 * (outside U+0000..U+10FFFF), or in string processing for ill-formed input 198 * (unpaired surrogate in UTF-16, ill-formed UTF-8 subsequence). 199 * 200 * For a "fast" trie: 201 * 202 * The index array starts with the BMP index table for BMP code point lookup. 203 * Its length is 1024=0x400. 204 * 205 * The supplementary index-1 table follows the BMP index table. 206 * Variable length, for code points up to highStart-1. 207 * Maximum length 64=0x40=0x100000>>UCPTRIE_SHIFT_1. 208 * (For 0x100000 supplementary code points U+10000..U+10ffff.) 209 * 210 * After this index-1 table follow the variable-length index-3 and index-2 tables. 211 * 212 * The supplementary index tables are omitted completely 213 * if there is only BMP data (highStart<=U+10000). 214 * 215 * For a "small" trie: 216 * 217 * The index array starts with a fast-index table for lookup of code points U+0000..U+0FFF. 218 * 219 * The "supplementary" index tables are always stored. 220 * The index-1 table starts from U+0000, its maximum length is 68=0x44=0x110000>>UCPTRIE_SHIFT_1. 221 * 222 * For both trie types: 223 * 224 * The last index-2 block may be a partial block, storing indexes only for code points 225 * below highStart. 226 * 227 * Lookup for ASCII code point c: 228 * 229 * Linear access from the start of the data array. 230 * 231 * value = data[c]; 232 * 233 * Lookup for fast-range code point c: 234 * 235 * Shift the code point right by UCPTRIE_FAST_SHIFT=6 bits, 236 * fetch the index array value at that offset, 237 * add the lower code point bits, index into the data array. 238 * 239 * value = data[index[c>>6] + (c&0x3f)]; 240 * 241 * (This works for ASCII as well.) 242 * 243 * Lookup for small-range code point c below highStart: 244 * 245 * Split the code point into four bit fields using several sets of shifts & masks 246 * to read consecutive values from the index-1, index-2, index-3 and data tables. 247 * 248 * If all of the data block offsets in an index-3 block fit within 16 bits (up to 0xffff), 249 * then the data block offsets are stored directly as uint16_t. 250 * 251 * Otherwise (this is very unusual but possible), the index-2 entry for the index-3 block 252 * has bit 15 (0x8000) set, and each set of 8 index-3 entries is preceded by 253 * an additional uint16_t word. Data block offsets are 18 bits wide, with the top 2 bits stored 254 * in the additional word. 255 * 256 * See ucptrie_internalSmallIndex() for details. 257 * 258 * (In a "small" trie, this works for ASCII and below-fast_limit code points as well.) 259 * 260 * Compaction: 261 * 262 * Multiple code point ranges ("blocks") that are aligned on certain boundaries 263 * (determined by the shifting/bit fields of code points) and 264 * map to the same data values normally share a single subsequence of the data array. 265 * Data blocks can also overlap partially. 266 * (Depending on the builder code finding duplicate and overlapping blocks.) 267 * 268 * Iteration over same-value ranges: 269 * 270 * Range iteration (ucptrie_getRange()) walks the structure from a start code point 271 * until some code point is found that maps to a different value; 272 * the end of the returned range is just before that. 273 * 274 * The header.dataNullOffset (split across two header fields, high bits in header.options) 275 * is the offset of a widely shared data block filled with one single value. 276 * It helps quickly skip over large ranges of data with that value. 277 * The builder must ensure that if the start of any data block (fast or small) 278 * matches the dataNullOffset, then the whole block must be filled with the null value. 279 * Special care must be taken if there is no fast null data block 280 * but a small one, which is shorter, and it matches the *start* of some fast data block. 281 * 282 * Similarly, the header.index3NullOffset is the index-array offset of an index-3 block 283 * where all index entries point to the dataNullOffset. 284 * If there is no such data or index-3 block, then these offsets are set to 285 * values that cannot be reached (data offset out of range/reserved index offset), 286 * normally UCPTRIE_NO_DATA_NULL_OFFSET or UCPTRIE_NO_INDEX3_NULL_OFFSET respectively. 287 */ 288 289 #endif 290