• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *   Copyright (C) 2010-2012, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 *   file name:  stringtriebuilder.h
7 *   encoding:   US-ASCII
8 *   tab size:   8 (not used)
9 *   indentation:4
10 *
11 *   created on: 2010dec24
12 *   created by: Markus W. Scherer
13 */
14 
15 #ifndef __STRINGTRIEBUILDER_H__
16 #define __STRINGTRIEBUILDER_H__
17 
18 #include "unicode/utypes.h"
19 #include "unicode/uobject.h"
20 
21 /**
22  * \file
23  * \brief C++ API: Builder API for trie builders
24  */
25 
26 // Forward declaration.
27 struct UHashtable;
28 typedef struct UHashtable UHashtable;
29 
30 /**
31  * Build options for BytesTrieBuilder and CharsTrieBuilder.
32  * @stable ICU 4.8
33  */
34 enum UStringTrieBuildOption {
35     /**
36      * Builds a trie quickly.
37      * @stable ICU 4.8
38      */
39     USTRINGTRIE_BUILD_FAST,
40     /**
41      * Builds a trie more slowly, attempting to generate
42      * a shorter but equivalent serialization.
43      * This build option also uses more memory.
44      *
45      * This option can be effective when many integer values are the same
46      * and string/byte sequence suffixes can be shared.
47      * Runtime speed is not expected to improve.
48      * @stable ICU 4.8
49      */
50     USTRINGTRIE_BUILD_SMALL
51 };
52 
53 U_NAMESPACE_BEGIN
54 
55 /**
56  * Base class for string trie builder classes.
57  *
58  * This class is not intended for public subclassing.
59  * @stable ICU 4.8
60  */
61 class U_COMMON_API StringTrieBuilder : public UObject {
62 public:
63 #ifndef U_HIDE_INTERNAL_API
64     /** @internal */
65     static UBool hashNode(const void *node);
66     /** @internal */
67     static UBool equalNodes(const void *left, const void *right);
68 #endif  /* U_HIDE_INTERNAL_API */
69 
70 protected:
71     // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
72     // or else the compiler will create a public default constructor.
73     /** @internal */
74     StringTrieBuilder();
75     /** @internal */
76     virtual ~StringTrieBuilder();
77 
78 #ifndef U_HIDE_INTERNAL_API
79     /** @internal */
80     void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
81     /** @internal */
82     void deleteCompactBuilder();
83 
84     /** @internal */
85     void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
86 
87     /** @internal */
88     int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
89     /** @internal */
90     int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
91 #endif  /* U_HIDE_INTERNAL_API */
92 
93     class Node;
94 
95 #ifndef U_HIDE_INTERNAL_API
96     /** @internal */
97     Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
98     /** @internal */
99     Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
100                             int32_t length, UErrorCode &errorCode);
101 #endif  /* U_HIDE_INTERNAL_API */
102 
103     /** @internal */
104     virtual int32_t getElementStringLength(int32_t i) const = 0;
105     /** @internal */
106     virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0;
107     /** @internal */
108     virtual int32_t getElementValue(int32_t i) const = 0;
109 
110     // Finds the first unit index after this one where
111     // the first and last element have different units again.
112     /** @internal */
113     virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
114 
115     // Number of different units at unitIndex.
116     /** @internal */
117     virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
118     /** @internal */
119     virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
120     /** @internal */
121     virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0;
122 
123     /** @internal */
124     virtual UBool matchNodesCanHaveValues() const = 0;
125 
126     /** @internal */
127     virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
128     /** @internal */
129     virtual int32_t getMinLinearMatch() const = 0;
130     /** @internal */
131     virtual int32_t getMaxLinearMatchLength() const = 0;
132 
133 #ifndef U_HIDE_INTERNAL_API
134     // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
135     /** @internal */
136     static const int32_t kMaxBranchLinearSubNodeLength=5;
137 
138     // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units.
139     // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
140     /** @internal */
141     static const int32_t kMaxSplitBranchLevels=14;
142 
143     /**
144      * Makes sure that there is only one unique node registered that is
145      * equivalent to newNode.
146      * @param newNode Input node. The builder takes ownership.
147      * @param errorCode ICU in/out UErrorCode.
148                         Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
149      * @return newNode if it is the first of its kind, or
150      *         an equivalent node if newNode is a duplicate.
151      * @internal
152      */
153     Node *registerNode(Node *newNode, UErrorCode &errorCode);
154     /**
155      * Makes sure that there is only one unique FinalValueNode registered
156      * with this value.
157      * Avoids creating a node if the value is a duplicate.
158      * @param value A final value.
159      * @param errorCode ICU in/out UErrorCode.
160                         Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
161      * @return A FinalValueNode with the given value.
162      * @internal
163      */
164     Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
165 
166     /*
167      * C++ note:
168      * registerNode() and registerFinalValue() take ownership of their input nodes,
169      * and only return owned nodes.
170      * If they see a failure UErrorCode, they will delete the input node.
171      * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
172      * If there is a failure, they return NULL.
173      *
174      * NULL Node pointers can be safely passed into other Nodes because
175      * they call the static Node::hashCode() which checks for a NULL pointer first.
176      *
177      * Therefore, as long as builder functions register a new node,
178      * they need to check for failures only before explicitly dereferencing
179      * a Node pointer, or before setting a new UErrorCode.
180      */
181 
182     // Hash set of nodes, maps from nodes to integer 1.
183     /** @internal */
184     UHashtable *nodes;
185 
186     /** @internal */
187     class Node : public UObject {
188     public:
Node(int32_t initialHash)189         Node(int32_t initialHash) : hash(initialHash), offset(0) {}
hashCode()190         inline int32_t hashCode() const { return hash; }
191         // Handles node==NULL.
hashCode(const Node * node)192         static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
193         // Base class operator==() compares the actual class types.
194         virtual UBool operator==(const Node &other) const;
195         inline UBool operator!=(const Node &other) const { return !operator==(other); }
196         /**
197          * Traverses the Node graph and numbers branch edges, with rightmost edges first.
198          * This is to avoid writing a duplicate node twice.
199          *
200          * Branch nodes in this trie data structure are not symmetric.
201          * Most branch edges "jump" to other nodes but the rightmost branch edges
202          * just continue without a jump.
203          * Therefore, write() must write the rightmost branch edge last
204          * (trie units are written backwards), and must write it at that point even if
205          * it is a duplicate of a node previously written elsewhere.
206          *
207          * This function visits and marks right branch edges first.
208          * Edges are numbered with increasingly negative values because we share the
209          * offset field which gets positive values when nodes are written.
210          * A branch edge also remembers the first number for any of its edges.
211          *
212          * When a further-left branch edge has a number in the range of the rightmost
213          * edge's numbers, then it will be written as part of the required right edge
214          * and we can avoid writing it first.
215          *
216          * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
217          * edge numbers.
218          *
219          * @param edgeNumber The first edge number for this node and its sub-nodes.
220          * @return An edge number that is at least the maximum-negative
221          *         of the input edge number and the numbers of this node and all of its sub-nodes.
222          */
223         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
224         // write() must set the offset to a positive value.
225         virtual void write(StringTrieBuilder &builder) = 0;
226         // See markRightEdgesFirst.
writeUnlessInsideRightEdge(int32_t firstRight,int32_t lastRight,StringTrieBuilder & builder)227         inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
228                                                StringTrieBuilder &builder) {
229             // Note: Edge numbers are negative, lastRight<=firstRight.
230             // If offset>0 then this node and its sub-nodes have been written already
231             // and we need not write them again.
232             // If this node is part of the unwritten right branch edge,
233             // then we wait until that is written.
234             if(offset<0 && (offset<lastRight || firstRight<offset)) {
235                 write(builder);
236             }
237         }
getOffset()238         inline int32_t getOffset() const { return offset; }
239     protected:
240         int32_t hash;
241         int32_t offset;
242     private:
243         // No ICU "poor man's RTTI" for this class nor its subclasses.
244         virtual UClassID getDynamicClassID() const;
245     };
246 
247     // This class should not be overridden because
248     // registerFinalValue() compares a stack-allocated FinalValueNode
249     // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
250     // with the input node, and the
251     // !Node::operator==(other) used inside FinalValueNode::operator==(other)
252     // will be false if the typeid's are different.
253     /** @internal */
254     class FinalValueNode : public Node {
255     public:
FinalValueNode(int32_t v)256         FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
257         virtual UBool operator==(const Node &other) const;
258         virtual void write(StringTrieBuilder &builder);
259     protected:
260         int32_t value;
261     };
262 
263     /**
264      * @internal
265      */
266     class ValueNode : public Node {
267     public:
ValueNode(int32_t initialHash)268         ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
269         virtual UBool operator==(const Node &other) const;
setValue(int32_t v)270         void setValue(int32_t v) {
271             hasValue=TRUE;
272             value=v;
273             hash=hash*37+v;
274         }
275     protected:
276         UBool hasValue;
277         int32_t value;
278     };
279 
280     /**
281      * @internal
282      */
283     class IntermediateValueNode : public ValueNode {
284     public:
IntermediateValueNode(int32_t v,Node * nextNode)285         IntermediateValueNode(int32_t v, Node *nextNode)
286                 : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
287         virtual UBool operator==(const Node &other) const;
288         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
289         virtual void write(StringTrieBuilder &builder);
290     protected:
291         Node *next;
292     };
293 
294     /**
295      * @internal
296      */
297     class LinearMatchNode : public ValueNode {
298     public:
LinearMatchNode(int32_t len,Node * nextNode)299         LinearMatchNode(int32_t len, Node *nextNode)
300                 : ValueNode((0x333333*37+len)*37+hashCode(nextNode)),
301                   length(len), next(nextNode) {}
302         virtual UBool operator==(const Node &other) const;
303         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
304     protected:
305         int32_t length;
306         Node *next;
307     };
308 
309     /**
310      * @internal
311      */
312     class BranchNode : public Node {
313     public:
BranchNode(int32_t initialHash)314         BranchNode(int32_t initialHash) : Node(initialHash) {}
315     protected:
316         int32_t firstEdgeNumber;
317     };
318 
319     /**
320      * @internal
321      */
322     class ListBranchNode : public BranchNode {
323     public:
ListBranchNode()324         ListBranchNode() : BranchNode(0x444444), length(0) {}
325         virtual UBool operator==(const Node &other) const;
326         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
327         virtual void write(StringTrieBuilder &builder);
328         // Adds a unit with a final value.
add(int32_t c,int32_t value)329         void add(int32_t c, int32_t value) {
330             units[length]=(UChar)c;
331             equal[length]=NULL;
332             values[length]=value;
333             ++length;
334             hash=(hash*37+c)*37+value;
335         }
336         // Adds a unit which leads to another match node.
add(int32_t c,Node * node)337         void add(int32_t c, Node *node) {
338             units[length]=(UChar)c;
339             equal[length]=node;
340             values[length]=0;
341             ++length;
342             hash=(hash*37+c)*37+hashCode(node);
343         }
344     protected:
345         Node *equal[kMaxBranchLinearSubNodeLength];  // NULL means "has final value".
346         int32_t length;
347         int32_t values[kMaxBranchLinearSubNodeLength];
348         UChar units[kMaxBranchLinearSubNodeLength];
349     };
350 
351     /**
352      * @internal
353      */
354     class SplitBranchNode : public BranchNode {
355     public:
SplitBranchNode(UChar middleUnit,Node * lessThanNode,Node * greaterOrEqualNode)356         SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
357                 : BranchNode(((0x555555*37+middleUnit)*37+
358                               hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)),
359                   unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
360         virtual UBool operator==(const Node &other) const;
361         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
362         virtual void write(StringTrieBuilder &builder);
363     protected:
364         UChar unit;
365         Node *lessThan;
366         Node *greaterOrEqual;
367     };
368 
369     // Branch head node, for writing the actual node lead unit.
370     /** @internal */
371     class BranchHeadNode : public ValueNode {
372     public:
BranchHeadNode(int32_t len,Node * subNode)373         BranchHeadNode(int32_t len, Node *subNode)
374                 : ValueNode((0x666666*37+len)*37+hashCode(subNode)),
375                   length(len), next(subNode) {}
376         virtual UBool operator==(const Node &other) const;
377         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
378         virtual void write(StringTrieBuilder &builder);
379     protected:
380         int32_t length;
381         Node *next;  // A branch sub-node.
382     };
383 #endif  /* U_HIDE_INTERNAL_API */
384 
385     /** @internal */
386     virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
387                                         Node *nextNode) const = 0;
388 
389     /** @internal */
390     virtual int32_t write(int32_t unit) = 0;
391     /** @internal */
392     virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
393     /** @internal */
394     virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
395     /** @internal */
396     virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
397     /** @internal */
398     virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
399 
400 private:
401     // No ICU "poor man's RTTI" for this class nor its subclasses.
402     virtual UClassID getDynamicClassID() const;
403 };
404 
405 U_NAMESPACE_END
406 
407 #endif  // __STRINGTRIEBUILDER_H__
408