1 /* 2 ******************************************************************************* 3 * Copyright (C) 2010-2011, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * file name: stringtriebuilder.h 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2010dec24 12 * created by: Markus W. Scherer 13 */ 14 15 #ifndef __STRINGTRIEBUILDER_H__ 16 #define __STRINGTRIEBUILDER_H__ 17 18 #include "unicode/utypes.h" 19 #include "unicode/uobject.h" 20 21 // Forward declaration. 22 struct UHashtable; 23 typedef struct UHashtable UHashtable; 24 25 /** 26 * Build options for BytesTrieBuilder and CharsTrieBuilder. 27 * @draft ICU 4.8 28 */ 29 enum UStringTrieBuildOption { 30 /** 31 * Builds a trie quickly. 32 * @draft ICU 4.8 33 */ 34 USTRINGTRIE_BUILD_FAST, 35 /** 36 * Builds a trie more slowly, attempting to generate 37 * a shorter but equivalent serialization. 38 * This build option also uses more memory. 39 * 40 * This option can be effective when many integer values are the same 41 * and string/byte sequence suffixes can be shared. 42 * Runtime speed is not expected to improve. 43 * @draft ICU 4.8 44 */ 45 USTRINGTRIE_BUILD_SMALL 46 }; 47 48 U_NAMESPACE_BEGIN 49 50 /** 51 * Base class for string trie builder classes. 52 * 53 * This class is not intended for public subclassing. 54 * @draft ICU 4.8 55 */ 56 class U_COMMON_API StringTrieBuilder : public UObject { 57 public: 58 /** @internal */ 59 static UBool hashNode(const void *node); 60 /** @internal */ 61 static UBool equalNodes(const void *left, const void *right); 62 63 protected: 64 /** @internal */ 65 StringTrieBuilder(); 66 /** @internal */ 67 virtual ~StringTrieBuilder(); 68 69 /** @internal */ 70 void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode); 71 /** @internal */ 72 void deleteCompactBuilder(); 73 74 /** @internal */ 75 void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode); 76 77 /** @internal */ 78 int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex); 79 /** @internal */ 80 int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length); 81 82 class Node; 83 84 /** @internal */ 85 Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode); 86 /** @internal */ 87 Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, 88 int32_t length, UErrorCode &errorCode); 89 90 /** @internal */ 91 virtual int32_t getElementStringLength(int32_t i) const = 0; 92 /** @internal */ 93 virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0; 94 /** @internal */ 95 virtual int32_t getElementValue(int32_t i) const = 0; 96 97 // Finds the first unit index after this one where 98 // the first and last element have different units again. 99 /** @internal */ 100 virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0; 101 102 // Number of different units at unitIndex. 103 /** @internal */ 104 virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0; 105 /** @internal */ 106 virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0; 107 /** @internal */ 108 virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0; 109 110 /** @internal */ 111 virtual UBool matchNodesCanHaveValues() const = 0; 112 113 /** @internal */ 114 virtual int32_t getMaxBranchLinearSubNodeLength() const = 0; 115 /** @internal */ 116 virtual int32_t getMinLinearMatch() const = 0; 117 /** @internal */ 118 virtual int32_t getMaxLinearMatchLength() const = 0; 119 120 // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength). 121 /** @internal */ 122 static const int32_t kMaxBranchLinearSubNodeLength=5; 123 124 // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units. 125 // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up. 126 /** @internal */ 127 static const int32_t kMaxSplitBranchLevels=14; 128 129 /** 130 * Makes sure that there is only one unique node registered that is 131 * equivalent to newNode. 132 * @param newNode Input node. The builder takes ownership. 133 * @param errorCode ICU in/out UErrorCode. 134 Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL. 135 * @return newNode if it is the first of its kind, or 136 * an equivalent node if newNode is a duplicate. 137 * @internal 138 */ 139 Node *registerNode(Node *newNode, UErrorCode &errorCode); 140 /** 141 * Makes sure that there is only one unique FinalValueNode registered 142 * with this value. 143 * Avoids creating a node if the value is a duplicate. 144 * @param value A final value. 145 * @param errorCode ICU in/out UErrorCode. 146 Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL. 147 * @return A FinalValueNode with the given value. 148 * @internal 149 */ 150 Node *registerFinalValue(int32_t value, UErrorCode &errorCode); 151 152 /* 153 * C++ note: 154 * registerNode() and registerFinalValue() take ownership of their input nodes, 155 * and only return owned nodes. 156 * If they see a failure UErrorCode, they will delete the input node. 157 * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR. 158 * If there is a failure, they return NULL. 159 * 160 * NULL Node pointers can be safely passed into other Nodes because 161 * they call the static Node::hashCode() which checks for a NULL pointer first. 162 * 163 * Therefore, as long as builder functions register a new node, 164 * they need to check for failures only before explicitly dereferencing 165 * a Node pointer, or before setting a new UErrorCode. 166 */ 167 168 // Hash set of nodes, maps from nodes to integer 1. 169 /** @internal */ 170 UHashtable *nodes; 171 172 /** @internal */ 173 class Node : public UObject { 174 public: Node(int32_t initialHash)175 Node(int32_t initialHash) : hash(initialHash), offset(0) {} hashCode()176 inline int32_t hashCode() const { return hash; } 177 // Handles node==NULL. hashCode(const Node * node)178 static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); } 179 // Base class operator==() compares the actual class types. 180 virtual UBool operator==(const Node &other) const; 181 inline UBool operator!=(const Node &other) const { return !operator==(other); } 182 /** 183 * Traverses the Node graph and numbers branch edges, with rightmost edges first. 184 * This is to avoid writing a duplicate node twice. 185 * 186 * Branch nodes in this trie data structure are not symmetric. 187 * Most branch edges "jump" to other nodes but the rightmost branch edges 188 * just continue without a jump. 189 * Therefore, write() must write the rightmost branch edge last 190 * (trie units are written backwards), and must write it at that point even if 191 * it is a duplicate of a node previously written elsewhere. 192 * 193 * This function visits and marks right branch edges first. 194 * Edges are numbered with increasingly negative values because we share the 195 * offset field which gets positive values when nodes are written. 196 * A branch edge also remembers the first number for any of its edges. 197 * 198 * When a further-left branch edge has a number in the range of the rightmost 199 * edge's numbers, then it will be written as part of the required right edge 200 * and we can avoid writing it first. 201 * 202 * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative 203 * edge numbers. 204 * 205 * @param edgeNumber The first edge number for this node and its sub-nodes. 206 * @return An edge number that is at least the maximum-negative 207 * of the input edge number and the numbers of this node and all of its sub-nodes. 208 */ 209 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 210 // write() must set the offset to a positive value. 211 virtual void write(StringTrieBuilder &builder) = 0; 212 // See markRightEdgesFirst. writeUnlessInsideRightEdge(int32_t firstRight,int32_t lastRight,StringTrieBuilder & builder)213 inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight, 214 StringTrieBuilder &builder) { 215 // Note: Edge numbers are negative, lastRight<=firstRight. 216 // If offset>0 then this node and its sub-nodes have been written already 217 // and we need not write them again. 218 // If this node is part of the unwritten right branch edge, 219 // then we wait until that is written. 220 if(offset<0 && (offset<lastRight || firstRight<offset)) { 221 write(builder); 222 } 223 } getOffset()224 inline int32_t getOffset() const { return offset; } 225 protected: 226 int32_t hash; 227 int32_t offset; 228 private: 229 // No ICU "poor man's RTTI" for this class nor its subclasses. 230 virtual UClassID getDynamicClassID() const; 231 }; 232 233 // This class should not be overridden because 234 // registerFinalValue() compares a stack-allocated FinalValueNode 235 // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes) 236 // with the input node, and the 237 // !Node::operator==(other) used inside FinalValueNode::operator==(other) 238 // will be false if the typeid's are different. 239 /** @internal */ 240 class FinalValueNode : public Node { 241 public: FinalValueNode(int32_t v)242 FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {} 243 virtual UBool operator==(const Node &other) const; 244 virtual void write(StringTrieBuilder &builder); 245 protected: 246 int32_t value; 247 }; 248 249 /** @internal */ 250 class ValueNode : public Node { 251 public: ValueNode(int32_t initialHash)252 ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {} 253 virtual UBool operator==(const Node &other) const; setValue(int32_t v)254 void setValue(int32_t v) { 255 hasValue=TRUE; 256 value=v; 257 hash=hash*37+v; 258 } 259 protected: 260 UBool hasValue; 261 int32_t value; 262 }; 263 264 /** @internal */ 265 class IntermediateValueNode : public ValueNode { 266 public: IntermediateValueNode(int32_t v,Node * nextNode)267 IntermediateValueNode(int32_t v, Node *nextNode) 268 : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); } 269 virtual UBool operator==(const Node &other) const; 270 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 271 virtual void write(StringTrieBuilder &builder); 272 protected: 273 Node *next; 274 }; 275 276 /** @internal */ 277 class LinearMatchNode : public ValueNode { 278 public: LinearMatchNode(int32_t len,Node * nextNode)279 LinearMatchNode(int32_t len, Node *nextNode) 280 : ValueNode((0x333333*37+len)*37+hashCode(nextNode)), 281 length(len), next(nextNode) {} 282 virtual UBool operator==(const Node &other) const; 283 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 284 protected: 285 int32_t length; 286 Node *next; 287 }; 288 289 /** @internal */ 290 class BranchNode : public Node { 291 public: BranchNode(int32_t initialHash)292 BranchNode(int32_t initialHash) : Node(initialHash) {} 293 protected: 294 int32_t firstEdgeNumber; 295 }; 296 297 /** @internal */ 298 class ListBranchNode : public BranchNode { 299 public: ListBranchNode()300 ListBranchNode() : BranchNode(0x444444), length(0) {} 301 virtual UBool operator==(const Node &other) const; 302 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 303 virtual void write(StringTrieBuilder &builder); 304 // Adds a unit with a final value. add(int32_t c,int32_t value)305 void add(int32_t c, int32_t value) { 306 units[length]=(UChar)c; 307 equal[length]=NULL; 308 values[length]=value; 309 ++length; 310 hash=(hash*37+c)*37+value; 311 } 312 // Adds a unit which leads to another match node. add(int32_t c,Node * node)313 void add(int32_t c, Node *node) { 314 units[length]=(UChar)c; 315 equal[length]=node; 316 values[length]=0; 317 ++length; 318 hash=(hash*37+c)*37+hashCode(node); 319 } 320 protected: 321 Node *equal[kMaxBranchLinearSubNodeLength]; // NULL means "has final value". 322 int32_t length; 323 int32_t values[kMaxBranchLinearSubNodeLength]; 324 UChar units[kMaxBranchLinearSubNodeLength]; 325 }; 326 327 /** @internal */ 328 class SplitBranchNode : public BranchNode { 329 public: SplitBranchNode(UChar middleUnit,Node * lessThanNode,Node * greaterOrEqualNode)330 SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode) 331 : BranchNode(((0x555555*37+middleUnit)*37+ 332 hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)), 333 unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {} 334 virtual UBool operator==(const Node &other) const; 335 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 336 virtual void write(StringTrieBuilder &builder); 337 protected: 338 UChar unit; 339 Node *lessThan; 340 Node *greaterOrEqual; 341 }; 342 343 // Branch head node, for writing the actual node lead unit. 344 /** @internal */ 345 class BranchHeadNode : public ValueNode { 346 public: BranchHeadNode(int32_t len,Node * subNode)347 BranchHeadNode(int32_t len, Node *subNode) 348 : ValueNode((0x666666*37+len)*37+hashCode(subNode)), 349 length(len), next(subNode) {} 350 virtual UBool operator==(const Node &other) const; 351 virtual int32_t markRightEdgesFirst(int32_t edgeNumber); 352 virtual void write(StringTrieBuilder &builder); 353 protected: 354 int32_t length; 355 Node *next; // A branch sub-node. 356 }; 357 358 /** @internal */ 359 virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length, 360 Node *nextNode) const = 0; 361 362 /** @internal */ 363 virtual int32_t write(int32_t unit) = 0; 364 /** @internal */ 365 virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0; 366 /** @internal */ 367 virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0; 368 /** @internal */ 369 virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0; 370 /** @internal */ 371 virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0; 372 373 private: 374 // No ICU "poor man's RTTI" for this class nor its subclasses. 375 virtual UClassID getDynamicClassID() const; 376 }; 377 378 U_NAMESPACE_END 379 380 #endif // __STRINGTRIEBUILDER_H__ 381