1 /* 2 ******************************************************************************* 3 * Copyright (C) 2010-2012, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * file name: stringtriebuilder.h 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2010dec24 12 * created by: Markus W. Scherer 13 */ 14 15 #ifndef __STRINGTRIEBUILDER_H__ 16 #define __STRINGTRIEBUILDER_H__ 17 18 #include "unicode/utypes.h" 19 #include "unicode/uobject.h" 20 21 /** 22 * \file 23 * \brief C++ API: Builder API for trie builders 24 */ 25 26 // Forward declaration. 27 struct UHashtable; 28 typedef struct UHashtable UHashtable; 29 30 /** 31 * Build options for BytesTrieBuilder and CharsTrieBuilder. 32 * @stable ICU 4.8 33 */ 34 enum UStringTrieBuildOption { 35 /** 36 * Builds a trie quickly. 37 * @stable ICU 4.8 38 */ 39 USTRINGTRIE_BUILD_FAST, 40 /** 41 * Builds a trie more slowly, attempting to generate 42 * a shorter but equivalent serialization. 43 * This build option also uses more memory. 44 * 45 * This option can be effective when many integer values are the same 46 * and string/byte sequence suffixes can be shared. 47 * Runtime speed is not expected to improve. 48 * @stable ICU 4.8 49 */ 50 USTRINGTRIE_BUILD_SMALL 51 }; 52 53 U_NAMESPACE_BEGIN 54 55 /** 56 * Base class for string trie builder classes. 57 * 58 * This class is not intended for public subclassing. 59 * @stable ICU 4.8 60 */ 61 class U_COMMON_API StringTrieBuilder : public UObject { 62 public: 63 #ifndef U_HIDE_INTERNAL_API 64 /** @internal */ 65 static UBool hashNode(const void *node); 66 /** @internal */ 67 static UBool equalNodes(const void *left, const void *right); 68 #endif /* U_HIDE_INTERNAL_API */ 69 70 protected: 71 // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API 72 // or else the compiler will create a public default constructor. 73 /** @internal */ 74 StringTrieBuilder(); 75 /** @internal */ 76 virtual ~StringTrieBuilder(); 77 78 #ifndef U_HIDE_INTERNAL_API 79 /** @internal */ 80 void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode); 81 /** @internal */ 82 void deleteCompactBuilder(); 83 84 /** @internal */ 85 void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode); 86 87 /** @internal */ 88 int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex); 89 /** @internal */ 90 int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length); 91 #endif /* U_HIDE_INTERNAL_API */ 92 93 class Node; 94 95 #ifndef U_HIDE_INTERNAL_API 96 /** @internal */ 97 Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode); 98 /** @internal */ 99 Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, 100 int32_t length, UErrorCode &errorCode); 101 #endif /* U_HIDE_INTERNAL_API */ 102 103 /** @internal */ 104 virtual int32_t getElementStringLength(int32_t i) const = 0; 105 /** @internal */ 106 virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0; 107 /** @internal */ 108 virtual int32_t getElementValue(int32_t i) const = 0; 109 110 // Finds the first unit index after this one where 111 // the first and last element have different units again. 112 /** @internal */ 113 virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0; 114 115 // Number of different units at unitIndex. 116 /** @internal */ 117 virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0; 118 /** @internal */ 119 virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0; 120 /** @internal */ 121 virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0; 122 123 /** @internal */ 124 virtual UBool matchNodesCanHaveValues() const = 0; 125 126 /** @internal */ 127 virtual int32_t getMaxBranchLinearSubNodeLength() const = 0; 128 /** @internal */ 129 virtual int32_t getMinLinearMatch() const = 0; 130 /** @internal */ 131 virtual int32_t getMaxLinearMatchLength() const = 0; 132 133 #ifndef U_HIDE_INTERNAL_API 134 // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength). 135 /** @internal */ 136 static const int32_t kMaxBranchLinearSubNodeLength=5; 137 138 // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units. 139 // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up. 140 /** @internal */ 141 static const int32_t kMaxSplitBranchLevels=14; 142 143 /** 144 * Makes sure that there is only one unique node registered that is 145 * equivalent to newNode. 146 * @param newNode Input node. The builder takes ownership. 147 * @param errorCode ICU in/out UErrorCode. 148 Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL. 149 * @return newNode if it is the first of its kind, or 150 * an equivalent node if newNode is a duplicate. 151 * @internal 152 */ 153 Node *registerNode(Node *newNode, UErrorCode &errorCode); 154 /** 155 * Makes sure that there is only one unique FinalValueNode registered 156 * with this value. 157 * Avoids creating a node if the value is a duplicate. 158 * @param value A final value. 159 * @param errorCode ICU in/out UErrorCode. 160 Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL. 161 * @return A FinalValueNode with the given value. 162 * @internal 163 */ 164 Node *registerFinalValue(int32_t value, UErrorCode &errorCode); 165 166 /* 167 * C++ note: 168 * registerNode() and registerFinalValue() take ownership of their input nodes, 169 * and only return owned nodes. 170 * If they see a failure UErrorCode, they will delete the input node. 171 * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR. 172 * If there is a failure, they return NULL. 173 * 174 * NULL Node pointers can be safely passed into other Nodes because 175 * they call the static Node::hashCode() which checks for a NULL pointer first. 176 * 177 * Therefore, as long as builder functions register a new node, 178 * they need to check for failures only before explicitly dereferencing 179 * a Node pointer, or before setting a new UErrorCode. 180 */ 181 182 // Hash set of nodes, maps from nodes to integer 1. 183 /** @internal */ 184 UHashtable *nodes; 185 186 /** @internal */ 187 class Node : public UObject { 188 public: Node(int32_t initialHash)189 Node(int32_t initialHash) : hash(initialHash), offset(0) {} hashCode()190 inline int32_t hashCode() const { return hash; } 191 // Handles node==NULL. hashCode(const Node * node)192 static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); } 193 // Base class operator==() compares the actual class types. 194 virtual UBool operator==(const Node &other) const; 195 inline UBool operator!=(const Node &other) const { return !operator==(other); } 196 /** 197 * Traverses the Node graph and numbers branch edges, with rightmost edges first. 198 * This is to avoid writing a duplicate node twice. 199 * 200 * Branch nodes in this trie data structure are not symmetric. 201 * Most branch edges "jump" to other nodes but the rightmost branch edges 202 * just continue without a jump. 203 * Therefore, write() must write the rightmost branch edge last 204 * (trie units are written backwards), and must write it at that point even if 205 * it is a duplicate of a node previously written elsewhere. 206 * 207 * This function visits and marks right branch edges first. 208 * Edges are numbered with increasingly negative values because we share the 209 * offset field which gets positive values when nodes are written. 210 * A branch edge also remembers the first number for any of its edges. 211 * 212 * When a further-left branch edge has a number in the range of the rightmost 213 * edge's numbers, then it will be written as part of the required right edge 214 * and we can avoid writing it first. 215 * 216 * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative 217 * edge numbers. 218 * 219 * @param edgeNumber The first edge number for this node and its sub-nodes. 220 * @return An edge number that is at least the maximum-negative 221 * of the input edge number and the numbers of this node and all of its sub-nodes. 222 */ 223 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 224 // write() must set the offset to a positive value. 225 virtual void write(StringTrieBuilder &builder) = 0; 226 // See markRightEdgesFirst. writeUnlessInsideRightEdge(int32_t firstRight,int32_t lastRight,StringTrieBuilder & builder)227 inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight, 228 StringTrieBuilder &builder) { 229 // Note: Edge numbers are negative, lastRight<=firstRight. 230 // If offset>0 then this node and its sub-nodes have been written already 231 // and we need not write them again. 232 // If this node is part of the unwritten right branch edge, 233 // then we wait until that is written. 234 if(offset<0 && (offset<lastRight || firstRight<offset)) { 235 write(builder); 236 } 237 } getOffset()238 inline int32_t getOffset() const { return offset; } 239 protected: 240 int32_t hash; 241 int32_t offset; 242 private: 243 // No ICU "poor man's RTTI" for this class nor its subclasses. 244 virtual UClassID getDynamicClassID() const; 245 }; 246 247 // This class should not be overridden because 248 // registerFinalValue() compares a stack-allocated FinalValueNode 249 // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes) 250 // with the input node, and the 251 // !Node::operator==(other) used inside FinalValueNode::operator==(other) 252 // will be false if the typeid's are different. 253 /** @internal */ 254 class FinalValueNode : public Node { 255 public: FinalValueNode(int32_t v)256 FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {} 257 virtual UBool operator==(const Node &other) const; 258 virtual void write(StringTrieBuilder &builder); 259 protected: 260 int32_t value; 261 }; 262 263 /** 264 * @internal 265 */ 266 class ValueNode : public Node { 267 public: ValueNode(int32_t initialHash)268 ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {} 269 virtual UBool operator==(const Node &other) const; setValue(int32_t v)270 void setValue(int32_t v) { 271 hasValue=TRUE; 272 value=v; 273 hash=hash*37+v; 274 } 275 protected: 276 UBool hasValue; 277 int32_t value; 278 }; 279 280 /** 281 * @internal 282 */ 283 class IntermediateValueNode : public ValueNode { 284 public: IntermediateValueNode(int32_t v,Node * nextNode)285 IntermediateValueNode(int32_t v, Node *nextNode) 286 : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); } 287 virtual UBool operator==(const Node &other) const; 288 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 289 virtual void write(StringTrieBuilder &builder); 290 protected: 291 Node *next; 292 }; 293 294 /** 295 * @internal 296 */ 297 class LinearMatchNode : public ValueNode { 298 public: LinearMatchNode(int32_t len,Node * nextNode)299 LinearMatchNode(int32_t len, Node *nextNode) 300 : ValueNode((0x333333*37+len)*37+hashCode(nextNode)), 301 length(len), next(nextNode) {} 302 virtual UBool operator==(const Node &other) const; 303 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 304 protected: 305 int32_t length; 306 Node *next; 307 }; 308 309 /** 310 * @internal 311 */ 312 class BranchNode : public Node { 313 public: BranchNode(int32_t initialHash)314 BranchNode(int32_t initialHash) : Node(initialHash) {} 315 protected: 316 int32_t firstEdgeNumber; 317 }; 318 319 /** 320 * @internal 321 */ 322 class ListBranchNode : public BranchNode { 323 public: ListBranchNode()324 ListBranchNode() : BranchNode(0x444444), length(0) {} 325 virtual UBool operator==(const Node &other) const; 326 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 327 virtual void write(StringTrieBuilder &builder); 328 // Adds a unit with a final value. add(int32_t c,int32_t value)329 void add(int32_t c, int32_t value) { 330 units[length]=(UChar)c; 331 equal[length]=NULL; 332 values[length]=value; 333 ++length; 334 hash=(hash*37+c)*37+value; 335 } 336 // Adds a unit which leads to another match node. add(int32_t c,Node * node)337 void add(int32_t c, Node *node) { 338 units[length]=(UChar)c; 339 equal[length]=node; 340 values[length]=0; 341 ++length; 342 hash=(hash*37+c)*37+hashCode(node); 343 } 344 protected: 345 Node *equal[kMaxBranchLinearSubNodeLength]; // NULL means "has final value". 346 int32_t length; 347 int32_t values[kMaxBranchLinearSubNodeLength]; 348 UChar units[kMaxBranchLinearSubNodeLength]; 349 }; 350 351 /** 352 * @internal 353 */ 354 class SplitBranchNode : public BranchNode { 355 public: SplitBranchNode(UChar middleUnit,Node * lessThanNode,Node * greaterOrEqualNode)356 SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode) 357 : BranchNode(((0x555555*37+middleUnit)*37+ 358 hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)), 359 unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {} 360 virtual UBool operator==(const Node &other) const; 361 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 362 virtual void write(StringTrieBuilder &builder); 363 protected: 364 UChar unit; 365 Node *lessThan; 366 Node *greaterOrEqual; 367 }; 368 369 // Branch head node, for writing the actual node lead unit. 370 /** @internal */ 371 class BranchHeadNode : public ValueNode { 372 public: BranchHeadNode(int32_t len,Node * subNode)373 BranchHeadNode(int32_t len, Node *subNode) 374 : ValueNode((0x666666*37+len)*37+hashCode(subNode)), 375 length(len), next(subNode) {} 376 virtual UBool operator==(const Node &other) const; 377 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 378 virtual void write(StringTrieBuilder &builder); 379 protected: 380 int32_t length; 381 Node *next; // A branch sub-node. 382 }; 383 #endif /* U_HIDE_INTERNAL_API */ 384 385 /** @internal */ 386 virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length, 387 Node *nextNode) const = 0; 388 389 /** @internal */ 390 virtual int32_t write(int32_t unit) = 0; 391 /** @internal */ 392 virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0; 393 /** @internal */ 394 virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0; 395 /** @internal */ 396 virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0; 397 /** @internal */ 398 virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0; 399 400 private: 401 // No ICU "poor man's RTTI" for this class nor its subclasses. 402 virtual UClassID getDynamicClassID() const; 403 }; 404 405 U_NAMESPACE_END 406 407 #endif // __STRINGTRIEBUILDER_H__ 408