1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2010-2012, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * file name: bytestrie.h 9 * encoding: UTF-8 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2010sep25 14 * created by: Markus W. Scherer 15 */ 16 17 #ifndef __BYTESTRIE_H__ 18 #define __BYTESTRIE_H__ 19 20 /** 21 * \file 22 * \brief C++ API: Trie for mapping byte sequences to integer values. 23 */ 24 25 #include "unicode/utypes.h" 26 #include "unicode/stringpiece.h" 27 #include "unicode/uobject.h" 28 #include "unicode/ustringtrie.h" 29 30 U_NAMESPACE_BEGIN 31 32 class ByteSink; 33 class BytesTrieBuilder; 34 class CharString; 35 class UVector32; 36 37 /** 38 * Light-weight, non-const reader class for a BytesTrie. 39 * Traverses a byte-serialized data structure with minimal state, 40 * for mapping byte sequences to non-negative integer values. 41 * 42 * This class owns the serialized trie data only if it was constructed by 43 * the builder's build() method. 44 * The public constructor and the copy constructor only alias the data (only copy the pointer). 45 * There is no assignment operator. 46 * 47 * This class is not intended for public subclassing. 48 * @stable ICU 4.8 49 */ 50 class U_COMMON_API BytesTrie : public UMemory { 51 public: 52 /** 53 * Constructs a BytesTrie reader instance. 54 * 55 * The trieBytes must contain a copy of a byte sequence from the BytesTrieBuilder, 56 * starting with the first byte of that sequence. 57 * The BytesTrie object will not read more bytes than 58 * the BytesTrieBuilder generated in the corresponding build() call. 59 * 60 * The array is not copied/cloned and must not be modified while 61 * the BytesTrie object is in use. 62 * 63 * @param trieBytes The byte array that contains the serialized trie. 64 * @stable ICU 4.8 65 */ BytesTrie(const void * trieBytes)66 BytesTrie(const void *trieBytes) 67 : ownedArray_(NULL), bytes_(static_cast<const uint8_t *>(trieBytes)), 68 pos_(bytes_), remainingMatchLength_(-1) {} 69 70 /** 71 * Destructor. 72 * @stable ICU 4.8 73 */ 74 ~BytesTrie(); 75 76 /** 77 * Copy constructor, copies the other trie reader object and its state, 78 * but not the byte array which will be shared. (Shallow copy.) 79 * @param other Another BytesTrie object. 80 * @stable ICU 4.8 81 */ BytesTrie(const BytesTrie & other)82 BytesTrie(const BytesTrie &other) 83 : ownedArray_(NULL), bytes_(other.bytes_), 84 pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {} 85 86 /** 87 * Resets this trie to its initial state. 88 * @return *this 89 * @stable ICU 4.8 90 */ reset()91 BytesTrie &reset() { 92 pos_=bytes_; 93 remainingMatchLength_=-1; 94 return *this; 95 } 96 97 /** 98 * BytesTrie state object, for saving a trie's current state 99 * and resetting the trie back to this state later. 100 * @stable ICU 4.8 101 */ 102 class State : public UMemory { 103 public: 104 /** 105 * Constructs an empty State. 106 * @stable ICU 4.8 107 */ State()108 State() { bytes=NULL; } 109 private: 110 friend class BytesTrie; 111 112 const uint8_t *bytes; 113 const uint8_t *pos; 114 int32_t remainingMatchLength; 115 }; 116 117 /** 118 * Saves the state of this trie. 119 * @param state The State object to hold the trie's state. 120 * @return *this 121 * @see resetToState 122 * @stable ICU 4.8 123 */ saveState(State & state)124 const BytesTrie &saveState(State &state) const { 125 state.bytes=bytes_; 126 state.pos=pos_; 127 state.remainingMatchLength=remainingMatchLength_; 128 return *this; 129 } 130 131 /** 132 * Resets this trie to the saved state. 133 * If the state object contains no state, or the state of a different trie, 134 * then this trie remains unchanged. 135 * @param state The State object which holds a saved trie state. 136 * @return *this 137 * @see saveState 138 * @see reset 139 * @stable ICU 4.8 140 */ resetToState(const State & state)141 BytesTrie &resetToState(const State &state) { 142 if(bytes_==state.bytes && bytes_!=NULL) { 143 pos_=state.pos; 144 remainingMatchLength_=state.remainingMatchLength; 145 } 146 return *this; 147 } 148 149 /** 150 * Determines whether the byte sequence so far matches, whether it has a value, 151 * and whether another input byte can continue a matching byte sequence. 152 * @return The match/value Result. 153 * @stable ICU 4.8 154 */ 155 UStringTrieResult current() const; 156 157 /** 158 * Traverses the trie from the initial state for this input byte. 159 * Equivalent to reset().next(inByte). 160 * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff. 161 * Values below -0x100 and above 0xff will never match. 162 * @return The match/value Result. 163 * @stable ICU 4.8 164 */ first(int32_t inByte)165 inline UStringTrieResult first(int32_t inByte) { 166 remainingMatchLength_=-1; 167 if(inByte<0) { 168 inByte+=0x100; 169 } 170 return nextImpl(bytes_, inByte); 171 } 172 173 /** 174 * Traverses the trie from the current state for this input byte. 175 * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff. 176 * Values below -0x100 and above 0xff will never match. 177 * @return The match/value Result. 178 * @stable ICU 4.8 179 */ 180 UStringTrieResult next(int32_t inByte); 181 182 /** 183 * Traverses the trie from the current state for this byte sequence. 184 * Equivalent to 185 * \code 186 * Result result=current(); 187 * for(each c in s) 188 * if(!USTRINGTRIE_HAS_NEXT(result)) return USTRINGTRIE_NO_MATCH; 189 * result=next(c); 190 * return result; 191 * \endcode 192 * @param s A string or byte sequence. Can be NULL if length is 0. 193 * @param length The length of the byte sequence. Can be -1 if NUL-terminated. 194 * @return The match/value Result. 195 * @stable ICU 4.8 196 */ 197 UStringTrieResult next(const char *s, int32_t length); 198 199 /** 200 * Returns a matching byte sequence's value if called immediately after 201 * current()/first()/next() returned USTRINGTRIE_INTERMEDIATE_VALUE or USTRINGTRIE_FINAL_VALUE. 202 * getValue() can be called multiple times. 203 * 204 * Do not call getValue() after USTRINGTRIE_NO_MATCH or USTRINGTRIE_NO_VALUE! 205 * @return The value for the byte sequence so far. 206 * @stable ICU 4.8 207 */ getValue()208 inline int32_t getValue() const { 209 const uint8_t *pos=pos_; 210 int32_t leadByte=*pos++; 211 // U_ASSERT(leadByte>=kMinValueLead); 212 return readValue(pos, leadByte>>1); 213 } 214 215 /** 216 * Determines whether all byte sequences reachable from the current state 217 * map to the same value. 218 * @param uniqueValue Receives the unique value, if this function returns TRUE. 219 * (output-only) 220 * @return TRUE if all byte sequences reachable from the current state 221 * map to the same value. 222 * @stable ICU 4.8 223 */ hasUniqueValue(int32_t & uniqueValue)224 inline UBool hasUniqueValue(int32_t &uniqueValue) const { 225 const uint8_t *pos=pos_; 226 // Skip the rest of a pending linear-match node. 227 return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue); 228 } 229 230 /** 231 * Finds each byte which continues the byte sequence from the current state. 232 * That is, each byte b for which it would be next(b)!=USTRINGTRIE_NO_MATCH now. 233 * @param out Each next byte is appended to this object. 234 * (Only uses the out.Append(s, length) method.) 235 * @return the number of bytes which continue the byte sequence from here 236 * @stable ICU 4.8 237 */ 238 int32_t getNextBytes(ByteSink &out) const; 239 240 /** 241 * Iterator for all of the (byte sequence, value) pairs in a BytesTrie. 242 * @stable ICU 4.8 243 */ 244 class U_COMMON_API Iterator : public UMemory { 245 public: 246 /** 247 * Iterates from the root of a byte-serialized BytesTrie. 248 * @param trieBytes The trie bytes. 249 * @param maxStringLength If 0, the iterator returns full strings/byte sequences. 250 * Otherwise, the iterator returns strings with this maximum length. 251 * @param errorCode Standard ICU error code. Its input value must 252 * pass the U_SUCCESS() test, or else the function returns 253 * immediately. Check for U_FAILURE() on output or use with 254 * function chaining. (See User Guide for details.) 255 * @stable ICU 4.8 256 */ 257 Iterator(const void *trieBytes, int32_t maxStringLength, UErrorCode &errorCode); 258 259 /** 260 * Iterates from the current state of the specified BytesTrie. 261 * @param trie The trie whose state will be copied for iteration. 262 * @param maxStringLength If 0, the iterator returns full strings/byte sequences. 263 * Otherwise, the iterator returns strings with this maximum length. 264 * @param errorCode Standard ICU error code. Its input value must 265 * pass the U_SUCCESS() test, or else the function returns 266 * immediately. Check for U_FAILURE() on output or use with 267 * function chaining. (See User Guide for details.) 268 * @stable ICU 4.8 269 */ 270 Iterator(const BytesTrie &trie, int32_t maxStringLength, UErrorCode &errorCode); 271 272 /** 273 * Destructor. 274 * @stable ICU 4.8 275 */ 276 ~Iterator(); 277 278 /** 279 * Resets this iterator to its initial state. 280 * @return *this 281 * @stable ICU 4.8 282 */ 283 Iterator &reset(); 284 285 /** 286 * @return TRUE if there are more elements. 287 * @stable ICU 4.8 288 */ 289 UBool hasNext() const; 290 291 /** 292 * Finds the next (byte sequence, value) pair if there is one. 293 * 294 * If the byte sequence is truncated to the maximum length and does not 295 * have a real value, then the value is set to -1. 296 * In this case, this "not a real value" is indistinguishable from 297 * a real value of -1. 298 * @param errorCode Standard ICU error code. Its input value must 299 * pass the U_SUCCESS() test, or else the function returns 300 * immediately. Check for U_FAILURE() on output or use with 301 * function chaining. (See User Guide for details.) 302 * @return TRUE if there is another element. 303 * @stable ICU 4.8 304 */ 305 UBool next(UErrorCode &errorCode); 306 307 /** 308 * @return The NUL-terminated byte sequence for the last successful next(). 309 * @stable ICU 4.8 310 */ 311 StringPiece getString() const; 312 /** 313 * @return The value for the last successful next(). 314 * @stable ICU 4.8 315 */ getValue()316 int32_t getValue() const { return value_; } 317 318 private: 319 UBool truncateAndStop(); 320 321 const uint8_t *branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode); 322 323 const uint8_t *bytes_; 324 const uint8_t *pos_; 325 const uint8_t *initialPos_; 326 int32_t remainingMatchLength_; 327 int32_t initialRemainingMatchLength_; 328 329 CharString *str_; 330 int32_t maxLength_; 331 int32_t value_; 332 333 // The stack stores pairs of integers for backtracking to another 334 // outbound edge of a branch node. 335 // The first integer is an offset from bytes_. 336 // The second integer has the str_->length() from before the node in bits 15..0, 337 // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.) 338 // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24, 339 // but the code looks more confusing that way.) 340 UVector32 *stack_; 341 }; 342 343 private: 344 friend class BytesTrieBuilder; 345 346 /** 347 * Constructs a BytesTrie reader instance. 348 * Unlike the public constructor which just aliases an array, 349 * this constructor adopts the builder's array. 350 * This constructor is only called by the builder. 351 */ BytesTrie(void * adoptBytes,const void * trieBytes)352 BytesTrie(void *adoptBytes, const void *trieBytes) 353 : ownedArray_(static_cast<uint8_t *>(adoptBytes)), 354 bytes_(static_cast<const uint8_t *>(trieBytes)), 355 pos_(bytes_), remainingMatchLength_(-1) {} 356 357 // No assignment operator. 358 BytesTrie &operator=(const BytesTrie &other); 359 stop()360 inline void stop() { 361 pos_=NULL; 362 } 363 364 // Reads a compact 32-bit integer. 365 // pos is already after the leadByte, and the lead byte is already shifted right by 1. 366 static int32_t readValue(const uint8_t *pos, int32_t leadByte); skipValue(const uint8_t * pos,int32_t leadByte)367 static inline const uint8_t *skipValue(const uint8_t *pos, int32_t leadByte) { 368 // U_ASSERT(leadByte>=kMinValueLead); 369 if(leadByte>=(kMinTwoByteValueLead<<1)) { 370 if(leadByte<(kMinThreeByteValueLead<<1)) { 371 ++pos; 372 } else if(leadByte<(kFourByteValueLead<<1)) { 373 pos+=2; 374 } else { 375 pos+=3+((leadByte>>1)&1); 376 } 377 } 378 return pos; 379 } skipValue(const uint8_t * pos)380 static inline const uint8_t *skipValue(const uint8_t *pos) { 381 int32_t leadByte=*pos++; 382 return skipValue(pos, leadByte); 383 } 384 385 // Reads a jump delta and jumps. 386 static const uint8_t *jumpByDelta(const uint8_t *pos); 387 skipDelta(const uint8_t * pos)388 static inline const uint8_t *skipDelta(const uint8_t *pos) { 389 int32_t delta=*pos++; 390 if(delta>=kMinTwoByteDeltaLead) { 391 if(delta<kMinThreeByteDeltaLead) { 392 ++pos; 393 } else if(delta<kFourByteDeltaLead) { 394 pos+=2; 395 } else { 396 pos+=3+(delta&1); 397 } 398 } 399 return pos; 400 } 401 valueResult(int32_t node)402 static inline UStringTrieResult valueResult(int32_t node) { 403 return (UStringTrieResult)(USTRINGTRIE_INTERMEDIATE_VALUE-(node&kValueIsFinal)); 404 } 405 406 // Handles a branch node for both next(byte) and next(string). 407 UStringTrieResult branchNext(const uint8_t *pos, int32_t length, int32_t inByte); 408 409 // Requires remainingLength_<0. 410 UStringTrieResult nextImpl(const uint8_t *pos, int32_t inByte); 411 412 // Helper functions for hasUniqueValue(). 413 // Recursively finds a unique value (or whether there is not a unique one) 414 // from a branch. 415 static const uint8_t *findUniqueValueFromBranch(const uint8_t *pos, int32_t length, 416 UBool haveUniqueValue, int32_t &uniqueValue); 417 // Recursively finds a unique value (or whether there is not a unique one) 418 // starting from a position on a node lead byte. 419 static UBool findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue); 420 421 // Helper functions for getNextBytes(). 422 // getNextBytes() when pos is on a branch node. 423 static void getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out); 424 static void append(ByteSink &out, int c); 425 426 // BytesTrie data structure 427 // 428 // The trie consists of a series of byte-serialized nodes for incremental 429 // string/byte sequence matching. The root node is at the beginning of the trie data. 430 // 431 // Types of nodes are distinguished by their node lead byte ranges. 432 // After each node, except a final-value node, another node follows to 433 // encode match values or continue matching further bytes. 434 // 435 // Node types: 436 // - Value node: Stores a 32-bit integer in a compact, variable-length format. 437 // The value is for the string/byte sequence so far. 438 // One node bit indicates whether the value is final or whether 439 // matching continues with the next node. 440 // - Linear-match node: Matches a number of bytes. 441 // - Branch node: Branches to other nodes according to the current input byte. 442 // The node byte is the length of the branch (number of bytes to select from) 443 // minus 1. It is followed by a sub-node: 444 // - If the length is at most kMaxBranchLinearSubNodeLength, then 445 // there are length-1 (key, value) pairs and then one more comparison byte. 446 // If one of the key bytes matches, then the value is either a final value for 447 // the string/byte sequence so far, or a "jump" delta to the next node. 448 // If the last byte matches, then matching continues with the next node. 449 // (Values have the same encoding as value nodes.) 450 // - If the length is greater than kMaxBranchLinearSubNodeLength, then 451 // there is one byte and one "jump" delta. 452 // If the input byte is less than the sub-node byte, then "jump" by delta to 453 // the next sub-node which will have a length of length/2. 454 // (The delta has its own compact encoding.) 455 // Otherwise, skip the "jump" delta to the next sub-node 456 // which will have a length of length-length/2. 457 458 // Node lead byte values. 459 460 // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise 461 // the length is one more than the next byte. 462 463 // For a branch sub-node with at most this many entries, we drop down 464 // to a linear search. 465 static const int32_t kMaxBranchLinearSubNodeLength=5; 466 467 // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node. 468 static const int32_t kMinLinearMatch=0x10; 469 static const int32_t kMaxLinearMatchLength=0x10; 470 471 // 20..ff: Variable-length value node. 472 // If odd, the value is final. (Otherwise, intermediate value or jump delta.) 473 // Then shift-right by 1 bit. 474 // The remaining lead byte value indicates the number of following bytes (0..4) 475 // and contains the value's top bits. 476 static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x20 477 // It is a final value if bit 0 is set. 478 static const int32_t kValueIsFinal=1; 479 480 // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds. 481 static const int32_t kMinOneByteValueLead=kMinValueLead/2; // 0x10 482 static const int32_t kMaxOneByteValue=0x40; // At least 6 bits in the first byte. 483 484 static const int32_t kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1; // 0x51 485 static const int32_t kMaxTwoByteValue=0x1aff; 486 487 static const int32_t kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1; // 0x6c 488 static const int32_t kFourByteValueLead=0x7e; 489 490 // A little more than Unicode code points. (0x11ffff) 491 static const int32_t kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1; 492 493 static const int32_t kFiveByteValueLead=0x7f; 494 495 // Compact delta integers. 496 static const int32_t kMaxOneByteDelta=0xbf; 497 static const int32_t kMinTwoByteDeltaLead=kMaxOneByteDelta+1; // 0xc0 498 static const int32_t kMinThreeByteDeltaLead=0xf0; 499 static const int32_t kFourByteDeltaLead=0xfe; 500 static const int32_t kFiveByteDeltaLead=0xff; 501 502 static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1; // 0x2fff 503 static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1; // 0xdffff 504 505 uint8_t *ownedArray_; 506 507 // Fixed value referencing the BytesTrie bytes. 508 const uint8_t *bytes_; 509 510 // Iterator variables. 511 512 // Pointer to next trie byte to read. NULL if no more matches. 513 const uint8_t *pos_; 514 // Remaining length of a linear-match node, minus 1. Negative if not in such a node. 515 int32_t remainingMatchLength_; 516 }; 517 518 U_NAMESPACE_END 519 520 #endif // __BYTESTRIE_H__ 521