1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2010-2012,2014, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * file name: stringtriebuilder.h 9 * encoding: UTF-8 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2010dec24 14 * created by: Markus W. Scherer 15 */ 16 17 #ifndef __STRINGTRIEBUILDER_H__ 18 #define __STRINGTRIEBUILDER_H__ 19 20 #include "unicode/utypes.h" 21 22 #if U_SHOW_CPLUSPLUS_API 23 24 #include "unicode/uobject.h" 25 26 /** 27 * \file 28 * \brief C++ API: Builder API for trie builders 29 */ 30 31 // Forward declaration. 32 /// \cond 33 struct UHashtable; 34 typedef struct UHashtable UHashtable; 35 /// \endcond 36 37 /** 38 * Build options for BytesTrieBuilder and CharsTrieBuilder. 39 * @stable ICU 4.8 40 */ 41 enum UStringTrieBuildOption { 42 /** 43 * Builds a trie quickly. 44 * @stable ICU 4.8 45 */ 46 USTRINGTRIE_BUILD_FAST, 47 /** 48 * Builds a trie more slowly, attempting to generate 49 * a shorter but equivalent serialization. 50 * This build option also uses more memory. 51 * 52 * This option can be effective when many integer values are the same 53 * and string/byte sequence suffixes can be shared. 54 * Runtime speed is not expected to improve. 55 * @stable ICU 4.8 56 */ 57 USTRINGTRIE_BUILD_SMALL 58 }; 59 60 U_NAMESPACE_BEGIN 61 62 /** 63 * Base class for string trie builder classes. 64 * 65 * This class is not intended for public subclassing. 66 * @stable ICU 4.8 67 */ 68 class U_COMMON_API StringTrieBuilder : public UObject { 69 public: 70 #ifndef U_HIDE_INTERNAL_API 71 /** @internal */ 72 static int32_t hashNode(const void *node); 73 /** @internal */ 74 static UBool equalNodes(const void *left, const void *right); 75 #endif /* U_HIDE_INTERNAL_API */ 76 77 protected: 78 // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API 79 // or else the compiler will create a public default constructor. 80 /** @internal */ 81 StringTrieBuilder(); 82 /** @internal */ 83 virtual ~StringTrieBuilder(); 84 85 #ifndef U_HIDE_INTERNAL_API 86 /** @internal */ 87 void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode); 88 /** @internal */ 89 void deleteCompactBuilder(); 90 91 /** @internal */ 92 void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode); 93 94 /** @internal */ 95 int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex); 96 /** @internal */ 97 int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length); 98 #endif /* U_HIDE_INTERNAL_API */ 99 100 class Node; 101 102 #ifndef U_HIDE_INTERNAL_API 103 /** @internal */ 104 Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode); 105 /** @internal */ 106 Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, 107 int32_t length, UErrorCode &errorCode); 108 #endif /* U_HIDE_INTERNAL_API */ 109 110 /** @internal */ 111 virtual int32_t getElementStringLength(int32_t i) const = 0; 112 /** @internal */ 113 virtual char16_t getElementUnit(int32_t i, int32_t unitIndex) const = 0; 114 /** @internal */ 115 virtual int32_t getElementValue(int32_t i) const = 0; 116 117 // Finds the first unit index after this one where 118 // the first and last element have different units again. 119 /** @internal */ 120 virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0; 121 122 // Number of different units at unitIndex. 123 /** @internal */ 124 virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0; 125 /** @internal */ 126 virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0; 127 /** @internal */ 128 virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, char16_t unit) const = 0; 129 130 /** @internal */ 131 virtual UBool matchNodesCanHaveValues() const = 0; 132 133 /** @internal */ 134 virtual int32_t getMaxBranchLinearSubNodeLength() const = 0; 135 /** @internal */ 136 virtual int32_t getMinLinearMatch() const = 0; 137 /** @internal */ 138 virtual int32_t getMaxLinearMatchLength() const = 0; 139 140 #ifndef U_HIDE_INTERNAL_API 141 // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength). 142 /** @internal */ 143 static const int32_t kMaxBranchLinearSubNodeLength=5; 144 145 // Maximum number of nested split-branch levels for a branch on all 2^16 possible char16_t units. 146 // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up. 147 /** @internal */ 148 static const int32_t kMaxSplitBranchLevels=14; 149 150 /** 151 * Makes sure that there is only one unique node registered that is 152 * equivalent to newNode. 153 * @param newNode Input node. The builder takes ownership. 154 * @param errorCode ICU in/out UErrorCode. 155 Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL. 156 * @return newNode if it is the first of its kind, or 157 * an equivalent node if newNode is a duplicate. 158 * @internal 159 */ 160 Node *registerNode(Node *newNode, UErrorCode &errorCode); 161 /** 162 * Makes sure that there is only one unique FinalValueNode registered 163 * with this value. 164 * Avoids creating a node if the value is a duplicate. 165 * @param value A final value. 166 * @param errorCode ICU in/out UErrorCode. 167 Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL. 168 * @return A FinalValueNode with the given value. 169 * @internal 170 */ 171 Node *registerFinalValue(int32_t value, UErrorCode &errorCode); 172 #endif /* U_HIDE_INTERNAL_API */ 173 174 /* 175 * C++ note: 176 * registerNode() and registerFinalValue() take ownership of their input nodes, 177 * and only return owned nodes. 178 * If they see a failure UErrorCode, they will delete the input node. 179 * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR. 180 * If there is a failure, they return NULL. 181 * 182 * NULL Node pointers can be safely passed into other Nodes because 183 * they call the static Node::hashCode() which checks for a NULL pointer first. 184 * 185 * Therefore, as long as builder functions register a new node, 186 * they need to check for failures only before explicitly dereferencing 187 * a Node pointer, or before setting a new UErrorCode. 188 */ 189 190 // Hash set of nodes, maps from nodes to integer 1. 191 /** @internal */ 192 UHashtable *nodes; 193 194 // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API, 195 // it is needed for layout of other objects. 196 /** 197 * @internal 198 * \cond 199 */ 200 class Node : public UObject { 201 public: Node(int32_t initialHash)202 Node(int32_t initialHash) : hash(initialHash), offset(0) {} hashCode()203 inline int32_t hashCode() const { return hash; } 204 // Handles node==NULL. hashCode(const Node * node)205 static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); } 206 // Base class operator==() compares the actual class types. 207 virtual UBool operator==(const Node &other) const; 208 inline UBool operator!=(const Node &other) const { return !operator==(other); } 209 /** 210 * Traverses the Node graph and numbers branch edges, with rightmost edges first. 211 * This is to avoid writing a duplicate node twice. 212 * 213 * Branch nodes in this trie data structure are not symmetric. 214 * Most branch edges "jump" to other nodes but the rightmost branch edges 215 * just continue without a jump. 216 * Therefore, write() must write the rightmost branch edge last 217 * (trie units are written backwards), and must write it at that point even if 218 * it is a duplicate of a node previously written elsewhere. 219 * 220 * This function visits and marks right branch edges first. 221 * Edges are numbered with increasingly negative values because we share the 222 * offset field which gets positive values when nodes are written. 223 * A branch edge also remembers the first number for any of its edges. 224 * 225 * When a further-left branch edge has a number in the range of the rightmost 226 * edge's numbers, then it will be written as part of the required right edge 227 * and we can avoid writing it first. 228 * 229 * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative 230 * edge numbers. 231 * 232 * @param edgeNumber The first edge number for this node and its sub-nodes. 233 * @return An edge number that is at least the maximum-negative 234 * of the input edge number and the numbers of this node and all of its sub-nodes. 235 */ 236 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 237 // write() must set the offset to a positive value. 238 virtual void write(StringTrieBuilder &builder) = 0; 239 // See markRightEdgesFirst. writeUnlessInsideRightEdge(int32_t firstRight,int32_t lastRight,StringTrieBuilder & builder)240 inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight, 241 StringTrieBuilder &builder) { 242 // Note: Edge numbers are negative, lastRight<=firstRight. 243 // If offset>0 then this node and its sub-nodes have been written already 244 // and we need not write them again. 245 // If this node is part of the unwritten right branch edge, 246 // then we wait until that is written. 247 if(offset<0 && (offset<lastRight || firstRight<offset)) { 248 write(builder); 249 } 250 } getOffset()251 inline int32_t getOffset() const { return offset; } 252 protected: 253 int32_t hash; 254 int32_t offset; 255 }; 256 257 #ifndef U_HIDE_INTERNAL_API 258 // This class should not be overridden because 259 // registerFinalValue() compares a stack-allocated FinalValueNode 260 // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes) 261 // with the input node, and the 262 // !Node::operator==(other) used inside FinalValueNode::operator==(other) 263 // will be false if the typeid's are different. 264 /** @internal */ 265 class FinalValueNode : public Node { 266 public: FinalValueNode(int32_t v)267 FinalValueNode(int32_t v) : Node(0x111111u*37u+v), value(v) {} 268 virtual UBool operator==(const Node &other) const; 269 virtual void write(StringTrieBuilder &builder); 270 protected: 271 int32_t value; 272 }; 273 #endif /* U_HIDE_INTERNAL_API */ 274 275 // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API, 276 // it is needed for layout of other objects. 277 /** 278 * @internal 279 */ 280 class ValueNode : public Node { 281 public: ValueNode(int32_t initialHash)282 ValueNode(int32_t initialHash) : Node(initialHash), hasValue(false), value(0) {} 283 virtual UBool operator==(const Node &other) const; setValue(int32_t v)284 void setValue(int32_t v) { 285 hasValue=true; 286 value=v; 287 hash=hash*37u+v; 288 } 289 protected: 290 UBool hasValue; 291 int32_t value; 292 }; 293 294 #ifndef U_HIDE_INTERNAL_API 295 /** 296 * @internal 297 */ 298 class IntermediateValueNode : public ValueNode { 299 public: IntermediateValueNode(int32_t v,Node * nextNode)300 IntermediateValueNode(int32_t v, Node *nextNode) 301 : ValueNode(0x222222u*37u+hashCode(nextNode)), next(nextNode) { setValue(v); } 302 virtual UBool operator==(const Node &other) const; 303 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 304 virtual void write(StringTrieBuilder &builder); 305 protected: 306 Node *next; 307 }; 308 #endif /* U_HIDE_INTERNAL_API */ 309 310 // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API, 311 // it is needed for layout of other objects. 312 /** 313 * @internal 314 */ 315 class LinearMatchNode : public ValueNode { 316 public: LinearMatchNode(int32_t len,Node * nextNode)317 LinearMatchNode(int32_t len, Node *nextNode) 318 : ValueNode((0x333333u*37u+len)*37u+hashCode(nextNode)), 319 length(len), next(nextNode) {} 320 virtual UBool operator==(const Node &other) const; 321 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 322 protected: 323 int32_t length; 324 Node *next; 325 }; 326 327 #ifndef U_HIDE_INTERNAL_API 328 /** 329 * @internal 330 */ 331 class BranchNode : public Node { 332 public: BranchNode(int32_t initialHash)333 BranchNode(int32_t initialHash) : Node(initialHash) {} 334 protected: 335 int32_t firstEdgeNumber; 336 }; 337 338 /** 339 * @internal 340 */ 341 class ListBranchNode : public BranchNode { 342 public: ListBranchNode()343 ListBranchNode() : BranchNode(0x444444), length(0) {} 344 virtual UBool operator==(const Node &other) const; 345 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 346 virtual void write(StringTrieBuilder &builder); 347 // Adds a unit with a final value. add(int32_t c,int32_t value)348 void add(int32_t c, int32_t value) { 349 units[length]=(char16_t)c; 350 equal[length]=NULL; 351 values[length]=value; 352 ++length; 353 hash=(hash*37u+c)*37u+value; 354 } 355 // Adds a unit which leads to another match node. add(int32_t c,Node * node)356 void add(int32_t c, Node *node) { 357 units[length]=(char16_t)c; 358 equal[length]=node; 359 values[length]=0; 360 ++length; 361 hash=(hash*37u+c)*37u+hashCode(node); 362 } 363 protected: 364 Node *equal[kMaxBranchLinearSubNodeLength]; // NULL means "has final value". 365 int32_t length; 366 int32_t values[kMaxBranchLinearSubNodeLength]; 367 char16_t units[kMaxBranchLinearSubNodeLength]; 368 }; 369 370 /** 371 * @internal 372 */ 373 class SplitBranchNode : public BranchNode { 374 public: SplitBranchNode(char16_t middleUnit,Node * lessThanNode,Node * greaterOrEqualNode)375 SplitBranchNode(char16_t middleUnit, Node *lessThanNode, Node *greaterOrEqualNode) 376 : BranchNode(((0x555555u*37u+middleUnit)*37u+ 377 hashCode(lessThanNode))*37u+hashCode(greaterOrEqualNode)), 378 unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {} 379 virtual UBool operator==(const Node &other) const; 380 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 381 virtual void write(StringTrieBuilder &builder); 382 protected: 383 char16_t unit; 384 Node *lessThan; 385 Node *greaterOrEqual; 386 }; 387 388 // Branch head node, for writing the actual node lead unit. 389 /** @internal */ 390 class BranchHeadNode : public ValueNode { 391 public: BranchHeadNode(int32_t len,Node * subNode)392 BranchHeadNode(int32_t len, Node *subNode) 393 : ValueNode((0x666666u*37u+len)*37u+hashCode(subNode)), 394 length(len), next(subNode) {} 395 virtual UBool operator==(const Node &other) const; 396 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 397 virtual void write(StringTrieBuilder &builder); 398 protected: 399 int32_t length; 400 Node *next; // A branch sub-node. 401 }; 402 403 #endif /* U_HIDE_INTERNAL_API */ 404 /// \endcond 405 406 /** @internal */ 407 virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length, 408 Node *nextNode) const = 0; 409 410 /** @internal */ 411 virtual int32_t write(int32_t unit) = 0; 412 /** @internal */ 413 virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0; 414 /** @internal */ 415 virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0; 416 /** @internal */ 417 virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0; 418 /** @internal */ 419 virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0; 420 }; 421 422 U_NAMESPACE_END 423 424 #endif /* U_SHOW_CPLUSPLUS_API */ 425 426 #endif // __STRINGTRIEBUILDER_H__ 427