1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2010-2012,2014, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * file name: stringtriebuilder.h 9 * encoding: UTF-8 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2010dec24 14 * created by: Markus W. Scherer 15 */ 16 17 #ifndef __STRINGTRIEBUILDER_H__ 18 #define __STRINGTRIEBUILDER_H__ 19 20 #include "unicode/utypes.h" 21 #include "unicode/uobject.h" 22 23 /** 24 * \file 25 * \brief C++ API: Builder API for trie builders 26 */ 27 28 // Forward declaration. 29 /// \cond 30 struct UHashtable; 31 typedef struct UHashtable UHashtable; 32 /// \endcond 33 34 /** 35 * Build options for BytesTrieBuilder and CharsTrieBuilder. 36 * @stable ICU 4.8 37 */ 38 enum UStringTrieBuildOption { 39 /** 40 * Builds a trie quickly. 41 * @stable ICU 4.8 42 */ 43 USTRINGTRIE_BUILD_FAST, 44 /** 45 * Builds a trie more slowly, attempting to generate 46 * a shorter but equivalent serialization. 47 * This build option also uses more memory. 48 * 49 * This option can be effective when many integer values are the same 50 * and string/byte sequence suffixes can be shared. 51 * Runtime speed is not expected to improve. 52 * @stable ICU 4.8 53 */ 54 USTRINGTRIE_BUILD_SMALL 55 }; 56 57 U_NAMESPACE_BEGIN 58 59 /** 60 * Base class for string trie builder classes. 61 * 62 * This class is not intended for public subclassing. 63 * @stable ICU 4.8 64 */ 65 class U_COMMON_API StringTrieBuilder : public UObject { 66 public: 67 #ifndef U_HIDE_INTERNAL_API 68 /** @internal */ 69 static int32_t hashNode(const void *node); 70 /** @internal */ 71 static UBool equalNodes(const void *left, const void *right); 72 #endif /* U_HIDE_INTERNAL_API */ 73 74 protected: 75 // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API 76 // or else the compiler will create a public default constructor. 77 /** @internal */ 78 StringTrieBuilder(); 79 /** @internal */ 80 virtual ~StringTrieBuilder(); 81 82 #ifndef U_HIDE_INTERNAL_API 83 /** @internal */ 84 void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode); 85 /** @internal */ 86 void deleteCompactBuilder(); 87 88 /** @internal */ 89 void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode); 90 91 /** @internal */ 92 int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex); 93 /** @internal */ 94 int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length); 95 #endif /* U_HIDE_INTERNAL_API */ 96 97 class Node; 98 99 #ifndef U_HIDE_INTERNAL_API 100 /** @internal */ 101 Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode); 102 /** @internal */ 103 Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, 104 int32_t length, UErrorCode &errorCode); 105 #endif /* U_HIDE_INTERNAL_API */ 106 107 /** @internal */ 108 virtual int32_t getElementStringLength(int32_t i) const = 0; 109 /** @internal */ 110 virtual char16_t getElementUnit(int32_t i, int32_t unitIndex) const = 0; 111 /** @internal */ 112 virtual int32_t getElementValue(int32_t i) const = 0; 113 114 // Finds the first unit index after this one where 115 // the first and last element have different units again. 116 /** @internal */ 117 virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0; 118 119 // Number of different units at unitIndex. 120 /** @internal */ 121 virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0; 122 /** @internal */ 123 virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0; 124 /** @internal */ 125 virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, char16_t unit) const = 0; 126 127 /** @internal */ 128 virtual UBool matchNodesCanHaveValues() const = 0; 129 130 /** @internal */ 131 virtual int32_t getMaxBranchLinearSubNodeLength() const = 0; 132 /** @internal */ 133 virtual int32_t getMinLinearMatch() const = 0; 134 /** @internal */ 135 virtual int32_t getMaxLinearMatchLength() const = 0; 136 137 #ifndef U_HIDE_INTERNAL_API 138 // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength). 139 /** @internal */ 140 static const int32_t kMaxBranchLinearSubNodeLength=5; 141 142 // Maximum number of nested split-branch levels for a branch on all 2^16 possible char16_t units. 143 // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up. 144 /** @internal */ 145 static const int32_t kMaxSplitBranchLevels=14; 146 147 /** 148 * Makes sure that there is only one unique node registered that is 149 * equivalent to newNode. 150 * @param newNode Input node. The builder takes ownership. 151 * @param errorCode ICU in/out UErrorCode. 152 Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL. 153 * @return newNode if it is the first of its kind, or 154 * an equivalent node if newNode is a duplicate. 155 * @internal 156 */ 157 Node *registerNode(Node *newNode, UErrorCode &errorCode); 158 /** 159 * Makes sure that there is only one unique FinalValueNode registered 160 * with this value. 161 * Avoids creating a node if the value is a duplicate. 162 * @param value A final value. 163 * @param errorCode ICU in/out UErrorCode. 164 Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL. 165 * @return A FinalValueNode with the given value. 166 * @internal 167 */ 168 Node *registerFinalValue(int32_t value, UErrorCode &errorCode); 169 #endif /* U_HIDE_INTERNAL_API */ 170 171 /* 172 * C++ note: 173 * registerNode() and registerFinalValue() take ownership of their input nodes, 174 * and only return owned nodes. 175 * If they see a failure UErrorCode, they will delete the input node. 176 * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR. 177 * If there is a failure, they return NULL. 178 * 179 * NULL Node pointers can be safely passed into other Nodes because 180 * they call the static Node::hashCode() which checks for a NULL pointer first. 181 * 182 * Therefore, as long as builder functions register a new node, 183 * they need to check for failures only before explicitly dereferencing 184 * a Node pointer, or before setting a new UErrorCode. 185 */ 186 187 // Hash set of nodes, maps from nodes to integer 1. 188 /** @internal */ 189 UHashtable *nodes; 190 191 // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API, 192 // it is needed for layout of other objects. 193 /** 194 * @internal 195 * \cond 196 */ 197 class Node : public UObject { 198 public: Node(int32_t initialHash)199 Node(int32_t initialHash) : hash(initialHash), offset(0) {} hashCode()200 inline int32_t hashCode() const { return hash; } 201 // Handles node==NULL. hashCode(const Node * node)202 static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); } 203 // Base class operator==() compares the actual class types. 204 virtual UBool operator==(const Node &other) const; 205 inline UBool operator!=(const Node &other) const { return !operator==(other); } 206 /** 207 * Traverses the Node graph and numbers branch edges, with rightmost edges first. 208 * This is to avoid writing a duplicate node twice. 209 * 210 * Branch nodes in this trie data structure are not symmetric. 211 * Most branch edges "jump" to other nodes but the rightmost branch edges 212 * just continue without a jump. 213 * Therefore, write() must write the rightmost branch edge last 214 * (trie units are written backwards), and must write it at that point even if 215 * it is a duplicate of a node previously written elsewhere. 216 * 217 * This function visits and marks right branch edges first. 218 * Edges are numbered with increasingly negative values because we share the 219 * offset field which gets positive values when nodes are written. 220 * A branch edge also remembers the first number for any of its edges. 221 * 222 * When a further-left branch edge has a number in the range of the rightmost 223 * edge's numbers, then it will be written as part of the required right edge 224 * and we can avoid writing it first. 225 * 226 * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative 227 * edge numbers. 228 * 229 * @param edgeNumber The first edge number for this node and its sub-nodes. 230 * @return An edge number that is at least the maximum-negative 231 * of the input edge number and the numbers of this node and all of its sub-nodes. 232 */ 233 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 234 // write() must set the offset to a positive value. 235 virtual void write(StringTrieBuilder &builder) = 0; 236 // See markRightEdgesFirst. writeUnlessInsideRightEdge(int32_t firstRight,int32_t lastRight,StringTrieBuilder & builder)237 inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight, 238 StringTrieBuilder &builder) { 239 // Note: Edge numbers are negative, lastRight<=firstRight. 240 // If offset>0 then this node and its sub-nodes have been written already 241 // and we need not write them again. 242 // If this node is part of the unwritten right branch edge, 243 // then we wait until that is written. 244 if(offset<0 && (offset<lastRight || firstRight<offset)) { 245 write(builder); 246 } 247 } getOffset()248 inline int32_t getOffset() const { return offset; } 249 protected: 250 int32_t hash; 251 int32_t offset; 252 }; 253 254 #ifndef U_HIDE_INTERNAL_API 255 // This class should not be overridden because 256 // registerFinalValue() compares a stack-allocated FinalValueNode 257 // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes) 258 // with the input node, and the 259 // !Node::operator==(other) used inside FinalValueNode::operator==(other) 260 // will be false if the typeid's are different. 261 /** @internal */ 262 class FinalValueNode : public Node { 263 public: FinalValueNode(int32_t v)264 FinalValueNode(int32_t v) : Node(0x111111u*37u+v), value(v) {} 265 virtual UBool operator==(const Node &other) const; 266 virtual void write(StringTrieBuilder &builder); 267 protected: 268 int32_t value; 269 }; 270 #endif /* U_HIDE_INTERNAL_API */ 271 272 // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API, 273 // it is needed for layout of other objects. 274 /** 275 * @internal 276 */ 277 class ValueNode : public Node { 278 public: ValueNode(int32_t initialHash)279 ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {} 280 virtual UBool operator==(const Node &other) const; setValue(int32_t v)281 void setValue(int32_t v) { 282 hasValue=TRUE; 283 value=v; 284 hash=hash*37u+v; 285 } 286 protected: 287 UBool hasValue; 288 int32_t value; 289 }; 290 291 #ifndef U_HIDE_INTERNAL_API 292 /** 293 * @internal 294 */ 295 class IntermediateValueNode : public ValueNode { 296 public: IntermediateValueNode(int32_t v,Node * nextNode)297 IntermediateValueNode(int32_t v, Node *nextNode) 298 : ValueNode(0x222222u*37u+hashCode(nextNode)), next(nextNode) { setValue(v); } 299 virtual UBool operator==(const Node &other) const; 300 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 301 virtual void write(StringTrieBuilder &builder); 302 protected: 303 Node *next; 304 }; 305 #endif /* U_HIDE_INTERNAL_API */ 306 307 // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API, 308 // it is needed for layout of other objects. 309 /** 310 * @internal 311 */ 312 class LinearMatchNode : public ValueNode { 313 public: LinearMatchNode(int32_t len,Node * nextNode)314 LinearMatchNode(int32_t len, Node *nextNode) 315 : ValueNode((0x333333u*37u+len)*37u+hashCode(nextNode)), 316 length(len), next(nextNode) {} 317 virtual UBool operator==(const Node &other) const; 318 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 319 protected: 320 int32_t length; 321 Node *next; 322 }; 323 324 #ifndef U_HIDE_INTERNAL_API 325 /** 326 * @internal 327 */ 328 class BranchNode : public Node { 329 public: BranchNode(int32_t initialHash)330 BranchNode(int32_t initialHash) : Node(initialHash) {} 331 protected: 332 int32_t firstEdgeNumber; 333 }; 334 335 /** 336 * @internal 337 */ 338 class ListBranchNode : public BranchNode { 339 public: ListBranchNode()340 ListBranchNode() : BranchNode(0x444444), length(0) {} 341 virtual UBool operator==(const Node &other) const; 342 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 343 virtual void write(StringTrieBuilder &builder); 344 // Adds a unit with a final value. add(int32_t c,int32_t value)345 void add(int32_t c, int32_t value) { 346 units[length]=(char16_t)c; 347 equal[length]=NULL; 348 values[length]=value; 349 ++length; 350 hash=(hash*37u+c)*37u+value; 351 } 352 // Adds a unit which leads to another match node. add(int32_t c,Node * node)353 void add(int32_t c, Node *node) { 354 units[length]=(char16_t)c; 355 equal[length]=node; 356 values[length]=0; 357 ++length; 358 hash=(hash*37u+c)*37u+hashCode(node); 359 } 360 protected: 361 Node *equal[kMaxBranchLinearSubNodeLength]; // NULL means "has final value". 362 int32_t length; 363 int32_t values[kMaxBranchLinearSubNodeLength]; 364 char16_t units[kMaxBranchLinearSubNodeLength]; 365 }; 366 367 /** 368 * @internal 369 */ 370 class SplitBranchNode : public BranchNode { 371 public: SplitBranchNode(char16_t middleUnit,Node * lessThanNode,Node * greaterOrEqualNode)372 SplitBranchNode(char16_t middleUnit, Node *lessThanNode, Node *greaterOrEqualNode) 373 : BranchNode(((0x555555u*37u+middleUnit)*37u+ 374 hashCode(lessThanNode))*37u+hashCode(greaterOrEqualNode)), 375 unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {} 376 virtual UBool operator==(const Node &other) const; 377 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 378 virtual void write(StringTrieBuilder &builder); 379 protected: 380 char16_t unit; 381 Node *lessThan; 382 Node *greaterOrEqual; 383 }; 384 385 // Branch head node, for writing the actual node lead unit. 386 /** @internal */ 387 class BranchHeadNode : public ValueNode { 388 public: BranchHeadNode(int32_t len,Node * subNode)389 BranchHeadNode(int32_t len, Node *subNode) 390 : ValueNode((0x666666u*37u+len)*37u+hashCode(subNode)), 391 length(len), next(subNode) {} 392 virtual UBool operator==(const Node &other) const; 393 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 394 virtual void write(StringTrieBuilder &builder); 395 protected: 396 int32_t length; 397 Node *next; // A branch sub-node. 398 }; 399 400 #endif /* U_HIDE_INTERNAL_API */ 401 /// \endcond 402 403 /** @internal */ 404 virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length, 405 Node *nextNode) const = 0; 406 407 /** @internal */ 408 virtual int32_t write(int32_t unit) = 0; 409 /** @internal */ 410 virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0; 411 /** @internal */ 412 virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0; 413 /** @internal */ 414 virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0; 415 /** @internal */ 416 virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0; 417 }; 418 419 U_NAMESPACE_END 420 421 #endif // __STRINGTRIEBUILDER_H__ 422