• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *   Copyright (C) 2010-2011, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 *******************************************************************************
6 *   file name:  stringtriebuilder.h
7 *   encoding:   US-ASCII
8 *   tab size:   8 (not used)
9 *   indentation:4
10 *
11 *   created on: 2010dec24
12 *   created by: Markus W. Scherer
13 */
14 
15 #ifndef __STRINGTRIEBUILDER_H__
16 #define __STRINGTRIEBUILDER_H__
17 
18 #include "unicode/utypes.h"
19 #include "unicode/uobject.h"
20 
21 // Forward declaration.
22 struct UHashtable;
23 typedef struct UHashtable UHashtable;
24 
25 /**
26  * Build options for BytesTrieBuilder and CharsTrieBuilder.
27  * @draft ICU 4.8
28  */
29 enum UStringTrieBuildOption {
30     /**
31      * Builds a trie quickly.
32      * @draft ICU 4.8
33      */
34     USTRINGTRIE_BUILD_FAST,
35     /**
36      * Builds a trie more slowly, attempting to generate
37      * a shorter but equivalent serialization.
38      * This build option also uses more memory.
39      *
40      * This option can be effective when many integer values are the same
41      * and string/byte sequence suffixes can be shared.
42      * Runtime speed is not expected to improve.
43      * @draft ICU 4.8
44      */
45     USTRINGTRIE_BUILD_SMALL
46 };
47 
48 U_NAMESPACE_BEGIN
49 
50 /**
51  * Base class for string trie builder classes.
52  *
53  * This class is not intended for public subclassing.
54  * @draft ICU 4.8
55  */
56 class U_COMMON_API StringTrieBuilder : public UObject {
57 public:
58     /** @internal */
59     static UBool hashNode(const void *node);
60     /** @internal */
61     static UBool equalNodes(const void *left, const void *right);
62 
63 protected:
64     /** @internal */
65     StringTrieBuilder();
66     /** @internal */
67     virtual ~StringTrieBuilder();
68 
69     /** @internal */
70     void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
71     /** @internal */
72     void deleteCompactBuilder();
73 
74     /** @internal */
75     void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
76 
77     /** @internal */
78     int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
79     /** @internal */
80     int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
81 
82     class Node;
83 
84     /** @internal */
85     Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
86     /** @internal */
87     Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
88                             int32_t length, UErrorCode &errorCode);
89 
90     /** @internal */
91     virtual int32_t getElementStringLength(int32_t i) const = 0;
92     /** @internal */
93     virtual UChar getElementUnit(int32_t i, int32_t unitIndex) const = 0;
94     /** @internal */
95     virtual int32_t getElementValue(int32_t i) const = 0;
96 
97     // Finds the first unit index after this one where
98     // the first and last element have different units again.
99     /** @internal */
100     virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
101 
102     // Number of different units at unitIndex.
103     /** @internal */
104     virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
105     /** @internal */
106     virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
107     /** @internal */
108     virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, UChar unit) const = 0;
109 
110     /** @internal */
111     virtual UBool matchNodesCanHaveValues() const = 0;
112 
113     /** @internal */
114     virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
115     /** @internal */
116     virtual int32_t getMinLinearMatch() const = 0;
117     /** @internal */
118     virtual int32_t getMaxLinearMatchLength() const = 0;
119 
120     // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
121     /** @internal */
122     static const int32_t kMaxBranchLinearSubNodeLength=5;
123 
124     // Maximum number of nested split-branch levels for a branch on all 2^16 possible UChar units.
125     // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
126     /** @internal */
127     static const int32_t kMaxSplitBranchLevels=14;
128 
129     /**
130      * Makes sure that there is only one unique node registered that is
131      * equivalent to newNode.
132      * @param newNode Input node. The builder takes ownership.
133      * @param errorCode ICU in/out UErrorCode.
134                         Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
135      * @return newNode if it is the first of its kind, or
136      *         an equivalent node if newNode is a duplicate.
137      * @internal
138      */
139     Node *registerNode(Node *newNode, UErrorCode &errorCode);
140     /**
141      * Makes sure that there is only one unique FinalValueNode registered
142      * with this value.
143      * Avoids creating a node if the value is a duplicate.
144      * @param value A final value.
145      * @param errorCode ICU in/out UErrorCode.
146                         Set to U_MEMORY_ALLOCATION_ERROR if it was success but newNode==NULL.
147      * @return A FinalValueNode with the given value.
148      * @internal
149      */
150     Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
151 
152     /*
153      * C++ note:
154      * registerNode() and registerFinalValue() take ownership of their input nodes,
155      * and only return owned nodes.
156      * If they see a failure UErrorCode, they will delete the input node.
157      * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
158      * If there is a failure, they return NULL.
159      *
160      * NULL Node pointers can be safely passed into other Nodes because
161      * they call the static Node::hashCode() which checks for a NULL pointer first.
162      *
163      * Therefore, as long as builder functions register a new node,
164      * they need to check for failures only before explicitly dereferencing
165      * a Node pointer, or before setting a new UErrorCode.
166      */
167 
168     // Hash set of nodes, maps from nodes to integer 1.
169     /** @internal */
170     UHashtable *nodes;
171 
172     /** @internal */
173     class Node : public UObject {
174     public:
Node(int32_t initialHash)175         Node(int32_t initialHash) : hash(initialHash), offset(0) {}
hashCode()176         inline int32_t hashCode() const { return hash; }
177         // Handles node==NULL.
hashCode(const Node * node)178         static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
179         // Base class operator==() compares the actual class types.
180         virtual UBool operator==(const Node &other) const;
181         inline UBool operator!=(const Node &other) const { return !operator==(other); }
182         /**
183          * Traverses the Node graph and numbers branch edges, with rightmost edges first.
184          * This is to avoid writing a duplicate node twice.
185          *
186          * Branch nodes in this trie data structure are not symmetric.
187          * Most branch edges "jump" to other nodes but the rightmost branch edges
188          * just continue without a jump.
189          * Therefore, write() must write the rightmost branch edge last
190          * (trie units are written backwards), and must write it at that point even if
191          * it is a duplicate of a node previously written elsewhere.
192          *
193          * This function visits and marks right branch edges first.
194          * Edges are numbered with increasingly negative values because we share the
195          * offset field which gets positive values when nodes are written.
196          * A branch edge also remembers the first number for any of its edges.
197          *
198          * When a further-left branch edge has a number in the range of the rightmost
199          * edge's numbers, then it will be written as part of the required right edge
200          * and we can avoid writing it first.
201          *
202          * After root.markRightEdgesFirst(-1) the offsets of all nodes are negative
203          * edge numbers.
204          *
205          * @param edgeNumber The first edge number for this node and its sub-nodes.
206          * @return An edge number that is at least the maximum-negative
207          *         of the input edge number and the numbers of this node and all of its sub-nodes.
208          */
209         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
210         // write() must set the offset to a positive value.
211         virtual void write(StringTrieBuilder &builder) = 0;
212         // See markRightEdgesFirst.
writeUnlessInsideRightEdge(int32_t firstRight,int32_t lastRight,StringTrieBuilder & builder)213         inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
214                                                StringTrieBuilder &builder) {
215             // Note: Edge numbers are negative, lastRight<=firstRight.
216             // If offset>0 then this node and its sub-nodes have been written already
217             // and we need not write them again.
218             // If this node is part of the unwritten right branch edge,
219             // then we wait until that is written.
220             if(offset<0 && (offset<lastRight || firstRight<offset)) {
221                 write(builder);
222             }
223         }
getOffset()224         inline int32_t getOffset() const { return offset; }
225     protected:
226         int32_t hash;
227         int32_t offset;
228     private:
229         // No ICU "poor man's RTTI" for this class nor its subclasses.
230         virtual UClassID getDynamicClassID() const;
231     };
232 
233     // This class should not be overridden because
234     // registerFinalValue() compares a stack-allocated FinalValueNode
235     // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
236     // with the input node, and the
237     // !Node::operator==(other) used inside FinalValueNode::operator==(other)
238     // will be false if the typeid's are different.
239     /** @internal */
240     class FinalValueNode : public Node {
241     public:
FinalValueNode(int32_t v)242         FinalValueNode(int32_t v) : Node(0x111111*37+v), value(v) {}
243         virtual UBool operator==(const Node &other) const;
244         virtual void write(StringTrieBuilder &builder);
245     protected:
246         int32_t value;
247     };
248 
249     /** @internal */
250     class ValueNode : public Node {
251     public:
ValueNode(int32_t initialHash)252         ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
253         virtual UBool operator==(const Node &other) const;
setValue(int32_t v)254         void setValue(int32_t v) {
255             hasValue=TRUE;
256             value=v;
257             hash=hash*37+v;
258         }
259     protected:
260         UBool hasValue;
261         int32_t value;
262     };
263 
264     /** @internal */
265     class IntermediateValueNode : public ValueNode {
266     public:
IntermediateValueNode(int32_t v,Node * nextNode)267         IntermediateValueNode(int32_t v, Node *nextNode)
268                 : ValueNode(0x222222*37+hashCode(nextNode)), next(nextNode) { setValue(v); }
269         virtual UBool operator==(const Node &other) const;
270         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
271         virtual void write(StringTrieBuilder &builder);
272     protected:
273         Node *next;
274     };
275 
276     /** @internal */
277     class LinearMatchNode : public ValueNode {
278     public:
LinearMatchNode(int32_t len,Node * nextNode)279         LinearMatchNode(int32_t len, Node *nextNode)
280                 : ValueNode((0x333333*37+len)*37+hashCode(nextNode)),
281                   length(len), next(nextNode) {}
282         virtual UBool operator==(const Node &other) const;
283         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
284     protected:
285         int32_t length;
286         Node *next;
287     };
288 
289     /** @internal */
290     class BranchNode : public Node {
291     public:
BranchNode(int32_t initialHash)292         BranchNode(int32_t initialHash) : Node(initialHash) {}
293     protected:
294         int32_t firstEdgeNumber;
295     };
296 
297     /** @internal */
298     class ListBranchNode : public BranchNode {
299     public:
ListBranchNode()300         ListBranchNode() : BranchNode(0x444444), length(0) {}
301         virtual UBool operator==(const Node &other) const;
302         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
303         virtual void write(StringTrieBuilder &builder);
304         // Adds a unit with a final value.
add(int32_t c,int32_t value)305         void add(int32_t c, int32_t value) {
306             units[length]=(UChar)c;
307             equal[length]=NULL;
308             values[length]=value;
309             ++length;
310             hash=(hash*37+c)*37+value;
311         }
312         // Adds a unit which leads to another match node.
add(int32_t c,Node * node)313         void add(int32_t c, Node *node) {
314             units[length]=(UChar)c;
315             equal[length]=node;
316             values[length]=0;
317             ++length;
318             hash=(hash*37+c)*37+hashCode(node);
319         }
320     protected:
321         Node *equal[kMaxBranchLinearSubNodeLength];  // NULL means "has final value".
322         int32_t length;
323         int32_t values[kMaxBranchLinearSubNodeLength];
324         UChar units[kMaxBranchLinearSubNodeLength];
325     };
326 
327     /** @internal */
328     class SplitBranchNode : public BranchNode {
329     public:
SplitBranchNode(UChar middleUnit,Node * lessThanNode,Node * greaterOrEqualNode)330         SplitBranchNode(UChar middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
331                 : BranchNode(((0x555555*37+middleUnit)*37+
332                               hashCode(lessThanNode))*37+hashCode(greaterOrEqualNode)),
333                   unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
334         virtual UBool operator==(const Node &other) const;
335         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
336         virtual void write(StringTrieBuilder &builder);
337     protected:
338         UChar unit;
339         Node *lessThan;
340         Node *greaterOrEqual;
341     };
342 
343     // Branch head node, for writing the actual node lead unit.
344     /** @internal */
345     class BranchHeadNode : public ValueNode {
346     public:
BranchHeadNode(int32_t len,Node * subNode)347         BranchHeadNode(int32_t len, Node *subNode)
348                 : ValueNode((0x666666*37+len)*37+hashCode(subNode)),
349                   length(len), next(subNode) {}
350         virtual UBool operator==(const Node &other) const;
351         virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
352         virtual void write(StringTrieBuilder &builder);
353     protected:
354         int32_t length;
355         Node *next;  // A branch sub-node.
356     };
357 
358     /** @internal */
359     virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
360                                         Node *nextNode) const = 0;
361 
362     /** @internal */
363     virtual int32_t write(int32_t unit) = 0;
364     /** @internal */
365     virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
366     /** @internal */
367     virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
368     /** @internal */
369     virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
370     /** @internal */
371     virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
372 
373 private:
374     // No ICU "poor man's RTTI" for this class nor its subclasses.
375     virtual UClassID getDynamicClassID() const;
376 };
377 
378 U_NAMESPACE_END
379 
380 #endif  // __STRINGTRIEBUILDER_H__
381