• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (c) 2002-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 */
9 //
10 //  rbbitblb.cpp
11 //
12 
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_BREAK_ITERATION
17 
18 #include "unicode/unistr.h"
19 #include "rbbitblb.h"
20 #include "rbbirb.h"
21 #include "rbbisetb.h"
22 #include "rbbidata.h"
23 #include "cstring.h"
24 #include "uassert.h"
25 #include "cmemory.h"
26 
27 U_NAMESPACE_BEGIN
28 
RBBITableBuilder(RBBIRuleBuilder * rb,RBBINode ** rootNode)29 RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) :
30  fTree(*rootNode) {
31     fRB                 = rb;
32     fStatus             = fRB->fStatus;
33     UErrorCode status   = U_ZERO_ERROR;
34     fDStates            = new UVector(status);
35     if (U_FAILURE(*fStatus)) {
36         return;
37     }
38     if (U_FAILURE(status)) {
39         *fStatus = status;
40         return;
41     }
42     if (fDStates == NULL) {
43         *fStatus = U_MEMORY_ALLOCATION_ERROR;;
44     }
45 }
46 
47 
48 
~RBBITableBuilder()49 RBBITableBuilder::~RBBITableBuilder() {
50     int i;
51     for (i=0; i<fDStates->size(); i++) {
52         delete (RBBIStateDescriptor *)fDStates->elementAt(i);
53     }
54     delete   fDStates;
55 }
56 
57 
58 //-----------------------------------------------------------------------------
59 //
60 //   RBBITableBuilder::build  -  This is the main function for building the DFA state transtion
61 //                               table from the RBBI rules parse tree.
62 //
63 //-----------------------------------------------------------------------------
build()64 void  RBBITableBuilder::build() {
65 
66     if (U_FAILURE(*fStatus)) {
67         return;
68     }
69 
70     // If there were no rules, just return.  This situation can easily arise
71     //   for the reverse rules.
72     if (fTree==NULL) {
73         return;
74     }
75 
76     //
77     // Walk through the tree, replacing any references to $variables with a copy of the
78     //   parse tree for the substition expression.
79     //
80     fTree = fTree->flattenVariables();
81 #ifdef RBBI_DEBUG
82     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
83         RBBIDebugPuts("\nParse tree after flattening variable references.");
84         RBBINode::printTree(fTree, TRUE);
85     }
86 #endif
87 
88     //
89     // If the rules contained any references to {bof}
90     //   add a {bof} <cat> <former root of tree> to the
91     //   tree.  Means that all matches must start out with the
92     //   {bof} fake character.
93     //
94     if (fRB->fSetBuilder->sawBOF()) {
95         RBBINode *bofTop    = new RBBINode(RBBINode::opCat);
96         RBBINode *bofLeaf   = new RBBINode(RBBINode::leafChar);
97         // Delete and exit if memory allocation failed.
98         if (bofTop == NULL || bofLeaf == NULL) {
99             *fStatus = U_MEMORY_ALLOCATION_ERROR;
100             delete bofTop;
101             delete bofLeaf;
102             return;
103         }
104         bofTop->fLeftChild  = bofLeaf;
105         bofTop->fRightChild = fTree;
106         bofLeaf->fParent    = bofTop;
107         bofLeaf->fVal       = 2;      // Reserved value for {bof}.
108         fTree               = bofTop;
109     }
110 
111     //
112     // Add a unique right-end marker to the expression.
113     //   Appears as a cat-node, left child being the original tree,
114     //   right child being the end marker.
115     //
116     RBBINode *cn = new RBBINode(RBBINode::opCat);
117     // Exit if memory allocation failed.
118     if (cn == NULL) {
119         *fStatus = U_MEMORY_ALLOCATION_ERROR;
120         return;
121     }
122     cn->fLeftChild = fTree;
123     fTree->fParent = cn;
124     cn->fRightChild = new RBBINode(RBBINode::endMark);
125     // Delete and exit if memory allocation failed.
126     if (cn->fRightChild == NULL) {
127         *fStatus = U_MEMORY_ALLOCATION_ERROR;
128         delete cn;
129         return;
130     }
131     cn->fRightChild->fParent = cn;
132     fTree = cn;
133 
134     //
135     //  Replace all references to UnicodeSets with the tree for the equivalent
136     //      expression.
137     //
138     fTree->flattenSets();
139 #ifdef RBBI_DEBUG
140     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
141         RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
142         RBBINode::printTree(fTree, TRUE);
143     }
144 #endif
145 
146 
147     //
148     // calculate the functions nullable, firstpos, lastpos and followpos on
149     // nodes in the parse tree.
150     //    See the alogrithm description in Aho.
151     //    Understanding how this works by looking at the code alone will be
152     //       nearly impossible.
153     //
154     calcNullable(fTree);
155     calcFirstPos(fTree);
156     calcLastPos(fTree);
157     calcFollowPos(fTree);
158     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
159         RBBIDebugPuts("\n");
160         printPosSets(fTree);
161     }
162 
163     //
164     //  For "chained" rules, modify the followPos sets
165     //
166     if (fRB->fChainRules) {
167         calcChainedFollowPos(fTree);
168     }
169 
170     //
171     //  BOF (start of input) test fixup.
172     //
173     if (fRB->fSetBuilder->sawBOF()) {
174         bofFixup();
175     }
176 
177     //
178     // Build the DFA state transition tables.
179     //
180     buildStateTable();
181     flagAcceptingStates();
182     flagLookAheadStates();
183     flagTaggedStates();
184 
185     //
186     // Update the global table of rule status {tag} values
187     // The rule builder has a global vector of status values that are common
188     //    for all tables.  Merge the ones from this table into the global set.
189     //
190     mergeRuleStatusVals();
191 
192     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
193 }
194 
195 
196 
197 //-----------------------------------------------------------------------------
198 //
199 //   calcNullable.    Impossible to explain succinctly.  See Aho, section 3.9
200 //
201 //-----------------------------------------------------------------------------
calcNullable(RBBINode * n)202 void RBBITableBuilder::calcNullable(RBBINode *n) {
203     if (n == NULL) {
204         return;
205     }
206     if (n->fType == RBBINode::setRef ||
207         n->fType == RBBINode::endMark ) {
208         // These are non-empty leaf node types.
209         n->fNullable = FALSE;
210         return;
211     }
212 
213     if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
214         // Lookahead marker node.  It's a leaf, so no recursion on children.
215         // It's nullable because it does not match any literal text from the input stream.
216         n->fNullable = TRUE;
217         return;
218     }
219 
220 
221     // The node is not a leaf.
222     //  Calculate nullable on its children.
223     calcNullable(n->fLeftChild);
224     calcNullable(n->fRightChild);
225 
226     // Apply functions from table 3.40 in Aho
227     if (n->fType == RBBINode::opOr) {
228         n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
229     }
230     else if (n->fType == RBBINode::opCat) {
231         n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
232     }
233     else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
234         n->fNullable = TRUE;
235     }
236     else {
237         n->fNullable = FALSE;
238     }
239 }
240 
241 
242 
243 
244 //-----------------------------------------------------------------------------
245 //
246 //   calcFirstPos.    Impossible to explain succinctly.  See Aho, section 3.9
247 //
248 //-----------------------------------------------------------------------------
calcFirstPos(RBBINode * n)249 void RBBITableBuilder::calcFirstPos(RBBINode *n) {
250     if (n == NULL) {
251         return;
252     }
253     if (n->fType == RBBINode::leafChar  ||
254         n->fType == RBBINode::endMark   ||
255         n->fType == RBBINode::lookAhead ||
256         n->fType == RBBINode::tag) {
257         // These are non-empty leaf node types.
258         // Note: In order to maintain the sort invariant on the set,
259         // this function should only be called on a node whose set is
260         // empty to start with.
261         n->fFirstPosSet->addElement(n, *fStatus);
262         return;
263     }
264 
265     // The node is not a leaf.
266     //  Calculate firstPos on its children.
267     calcFirstPos(n->fLeftChild);
268     calcFirstPos(n->fRightChild);
269 
270     // Apply functions from table 3.40 in Aho
271     if (n->fType == RBBINode::opOr) {
272         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
273         setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
274     }
275     else if (n->fType == RBBINode::opCat) {
276         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
277         if (n->fLeftChild->fNullable) {
278             setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
279         }
280     }
281     else if (n->fType == RBBINode::opStar ||
282              n->fType == RBBINode::opQuestion ||
283              n->fType == RBBINode::opPlus) {
284         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
285     }
286 }
287 
288 
289 
290 //-----------------------------------------------------------------------------
291 //
292 //   calcLastPos.    Impossible to explain succinctly.  See Aho, section 3.9
293 //
294 //-----------------------------------------------------------------------------
calcLastPos(RBBINode * n)295 void RBBITableBuilder::calcLastPos(RBBINode *n) {
296     if (n == NULL) {
297         return;
298     }
299     if (n->fType == RBBINode::leafChar  ||
300         n->fType == RBBINode::endMark   ||
301         n->fType == RBBINode::lookAhead ||
302         n->fType == RBBINode::tag) {
303         // These are non-empty leaf node types.
304         // Note: In order to maintain the sort invariant on the set,
305         // this function should only be called on a node whose set is
306         // empty to start with.
307         n->fLastPosSet->addElement(n, *fStatus);
308         return;
309     }
310 
311     // The node is not a leaf.
312     //  Calculate lastPos on its children.
313     calcLastPos(n->fLeftChild);
314     calcLastPos(n->fRightChild);
315 
316     // Apply functions from table 3.40 in Aho
317     if (n->fType == RBBINode::opOr) {
318         setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
319         setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
320     }
321     else if (n->fType == RBBINode::opCat) {
322         setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
323         if (n->fRightChild->fNullable) {
324             setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
325         }
326     }
327     else if (n->fType == RBBINode::opStar     ||
328              n->fType == RBBINode::opQuestion ||
329              n->fType == RBBINode::opPlus) {
330         setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
331     }
332 }
333 
334 
335 
336 //-----------------------------------------------------------------------------
337 //
338 //   calcFollowPos.    Impossible to explain succinctly.  See Aho, section 3.9
339 //
340 //-----------------------------------------------------------------------------
calcFollowPos(RBBINode * n)341 void RBBITableBuilder::calcFollowPos(RBBINode *n) {
342     if (n == NULL ||
343         n->fType == RBBINode::leafChar ||
344         n->fType == RBBINode::endMark) {
345         return;
346     }
347 
348     calcFollowPos(n->fLeftChild);
349     calcFollowPos(n->fRightChild);
350 
351     // Aho rule #1
352     if (n->fType == RBBINode::opCat) {
353         RBBINode *i;   // is 'i' in Aho's description
354         uint32_t     ix;
355 
356         UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
357 
358         for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
359             i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
360             setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
361         }
362     }
363 
364     // Aho rule #2
365     if (n->fType == RBBINode::opStar ||
366         n->fType == RBBINode::opPlus) {
367         RBBINode   *i;  // again, n and i are the names from Aho's description.
368         uint32_t    ix;
369 
370         for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
371             i = (RBBINode *)n->fLastPosSet->elementAt(ix);
372             setAdd(i->fFollowPos, n->fFirstPosSet);
373         }
374     }
375 
376 
377 
378 }
379 
380 //-----------------------------------------------------------------------------
381 //
382 //    addRuleRootNodes    Recursively walk a parse tree, adding all nodes flagged
383 //                        as roots of a rule to a destination vector.
384 //
385 //-----------------------------------------------------------------------------
addRuleRootNodes(UVector * dest,RBBINode * node)386 void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) {
387     if (node == NULL || U_FAILURE(*fStatus)) {
388         return;
389     }
390     if (node->fRuleRoot) {
391         dest->addElement(node, *fStatus);
392         // Note: rules cannot nest. If we found a rule start node,
393         //       no child node can also be a start node.
394         return;
395     }
396     addRuleRootNodes(dest, node->fLeftChild);
397     addRuleRootNodes(dest, node->fRightChild);
398 }
399 
400 //-----------------------------------------------------------------------------
401 //
402 //   calcChainedFollowPos.    Modify the previously calculated followPos sets
403 //                            to implement rule chaining.  NOT described by Aho
404 //
405 //-----------------------------------------------------------------------------
calcChainedFollowPos(RBBINode * tree)406 void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
407 
408     UVector         endMarkerNodes(*fStatus);
409     UVector         leafNodes(*fStatus);
410     int32_t         i;
411 
412     if (U_FAILURE(*fStatus)) {
413         return;
414     }
415 
416     // get a list of all endmarker nodes.
417     tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
418 
419     // get a list all leaf nodes
420     tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
421     if (U_FAILURE(*fStatus)) {
422         return;
423     }
424 
425     // Collect all leaf nodes that can start matches for rules
426     // with inbound chaining enabled, which is the union of the
427     // firstPosition sets from each of the rule root nodes.
428 
429     UVector ruleRootNodes(*fStatus);
430     addRuleRootNodes(&ruleRootNodes, tree);
431 
432     UVector matchStartNodes(*fStatus);
433     for (int i=0; i<ruleRootNodes.size(); ++i) {
434         RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(i));
435         if (node->fChainIn) {
436             setAdd(&matchStartNodes, node->fFirstPosSet);
437         }
438     }
439     if (U_FAILURE(*fStatus)) {
440         return;
441     }
442 
443     int32_t  endNodeIx;
444     int32_t  startNodeIx;
445 
446     for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
447         RBBINode *tNode   = (RBBINode *)leafNodes.elementAt(endNodeIx);
448         RBBINode *endNode = NULL;
449 
450         // Identify leaf nodes that correspond to overall rule match positions.
451         //   These include an endMarkerNode in their followPos sets.
452         for (i=0; i<endMarkerNodes.size(); i++) {
453             if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) {
454                 endNode = tNode;
455                 break;
456             }
457         }
458         if (endNode == NULL) {
459             // node wasn't an end node.  Try again with the next.
460             continue;
461         }
462 
463         // We've got a node that can end a match.
464 
465         // Line Break Specific hack:  If this node's val correspond to the $CM char class,
466         //                            don't chain from it.
467         // TODO:  Add rule syntax for this behavior, get specifics out of here and
468         //        into the rule file.
469         if (fRB->fLBCMNoChain) {
470             UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
471             if (c != -1) {
472                 // c == -1 occurs with sets containing only the {eof} marker string.
473                 ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
474                 if (cLBProp == U_LB_COMBINING_MARK) {
475                     continue;
476                 }
477             }
478         }
479 
480 
481         // Now iterate over the nodes that can start a match, looking for ones
482         //   with the same char class as our ending node.
483         RBBINode *startNode;
484         for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) {
485             startNode = (RBBINode *)matchStartNodes.elementAt(startNodeIx);
486             if (startNode->fType != RBBINode::leafChar) {
487                 continue;
488             }
489 
490             if (endNode->fVal == startNode->fVal) {
491                 // The end val (character class) of one possible match is the
492                 //   same as the start of another.
493 
494                 // Add all nodes from the followPos of the start node to the
495                 //  followPos set of the end node, which will have the effect of
496                 //  letting matches transition from a match state at endNode
497                 //  to the second char of a match starting with startNode.
498                 setAdd(endNode->fFollowPos, startNode->fFollowPos);
499             }
500         }
501     }
502 }
503 
504 
505 //-----------------------------------------------------------------------------
506 //
507 //   bofFixup.    Fixup for state tables that include {bof} beginning of input testing.
508 //                Do an swizzle similar to chaining, modifying the followPos set of
509 //                the bofNode to include the followPos nodes from other {bot} nodes
510 //                scattered through the tree.
511 //
512 //                This function has much in common with calcChainedFollowPos().
513 //
514 //-----------------------------------------------------------------------------
bofFixup()515 void RBBITableBuilder::bofFixup() {
516 
517     if (U_FAILURE(*fStatus)) {
518         return;
519     }
520 
521     //   The parse tree looks like this ...
522     //         fTree root  --->       <cat>
523     //                               /     \       .
524     //                            <cat>   <#end node>
525     //                           /     \  .
526     //                     <bofNode>   rest
527     //                               of tree
528     //
529     //    We will be adding things to the followPos set of the <bofNode>
530     //
531     RBBINode  *bofNode = fTree->fLeftChild->fLeftChild;
532     U_ASSERT(bofNode->fType == RBBINode::leafChar);
533     U_ASSERT(bofNode->fVal == 2);
534 
535     // Get all nodes that can be the start a match of the user-written rules
536     //  (excluding the fake bofNode)
537     //  We want the nodes that can start a match in the
538     //     part labeled "rest of tree"
539     //
540     UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
541 
542     RBBINode *startNode;
543     int       startNodeIx;
544     for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
545         startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
546         if (startNode->fType != RBBINode::leafChar) {
547             continue;
548         }
549 
550         if (startNode->fVal == bofNode->fVal) {
551             //  We found a leaf node corresponding to a {bof} that was
552             //    explicitly written into a rule.
553             //  Add everything from the followPos set of this node to the
554             //    followPos set of the fake bofNode at the start of the tree.
555             //
556             setAdd(bofNode->fFollowPos, startNode->fFollowPos);
557         }
558     }
559 }
560 
561 //-----------------------------------------------------------------------------
562 //
563 //   buildStateTable()    Determine the set of runtime DFA states and the
564 //                        transition tables for these states, by the algorithm
565 //                        of fig. 3.44 in Aho.
566 //
567 //                        Most of the comments are quotes of Aho's psuedo-code.
568 //
569 //-----------------------------------------------------------------------------
buildStateTable()570 void RBBITableBuilder::buildStateTable() {
571     if (U_FAILURE(*fStatus)) {
572         return;
573     }
574     RBBIStateDescriptor *failState;
575     // Set it to NULL to avoid uninitialized warning
576     RBBIStateDescriptor *initialState = NULL;
577     //
578     // Add a dummy state 0 - the stop state.  Not from Aho.
579     int      lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
580     failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
581     if (failState == NULL) {
582         *fStatus = U_MEMORY_ALLOCATION_ERROR;
583         goto ExitBuildSTdeleteall;
584     }
585     failState->fPositions = new UVector(*fStatus);
586     if (failState->fPositions == NULL) {
587         *fStatus = U_MEMORY_ALLOCATION_ERROR;
588     }
589     if (failState->fPositions == NULL || U_FAILURE(*fStatus)) {
590         goto ExitBuildSTdeleteall;
591     }
592     fDStates->addElement(failState, *fStatus);
593     if (U_FAILURE(*fStatus)) {
594         goto ExitBuildSTdeleteall;
595     }
596 
597     // initially, the only unmarked state in Dstates is firstpos(root),
598     //       where toot is the root of the syntax tree for (r)#;
599     initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
600     if (initialState == NULL) {
601         *fStatus = U_MEMORY_ALLOCATION_ERROR;
602     }
603     if (U_FAILURE(*fStatus)) {
604         goto ExitBuildSTdeleteall;
605     }
606     initialState->fPositions = new UVector(*fStatus);
607     if (initialState->fPositions == NULL) {
608         *fStatus = U_MEMORY_ALLOCATION_ERROR;
609     }
610     if (U_FAILURE(*fStatus)) {
611         goto ExitBuildSTdeleteall;
612     }
613     setAdd(initialState->fPositions, fTree->fFirstPosSet);
614     fDStates->addElement(initialState, *fStatus);
615     if (U_FAILURE(*fStatus)) {
616         goto ExitBuildSTdeleteall;
617     }
618 
619     // while there is an unmarked state T in Dstates do begin
620     for (;;) {
621         RBBIStateDescriptor *T = NULL;
622         int32_t              tx;
623         for (tx=1; tx<fDStates->size(); tx++) {
624             RBBIStateDescriptor *temp;
625             temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
626             if (temp->fMarked == FALSE) {
627                 T = temp;
628                 break;
629             }
630         }
631         if (T == NULL) {
632             break;
633         }
634 
635         // mark T;
636         T->fMarked = TRUE;
637 
638         // for each input symbol a do begin
639         int32_t  a;
640         for (a = 1; a<=lastInputSymbol; a++) {
641             // let U be the set of positions that are in followpos(p)
642             //    for some position p in T
643             //    such that the symbol at position p is a;
644             UVector    *U = NULL;
645             RBBINode   *p;
646             int32_t     px;
647             for (px=0; px<T->fPositions->size(); px++) {
648                 p = (RBBINode *)T->fPositions->elementAt(px);
649                 if ((p->fType == RBBINode::leafChar) &&  (p->fVal == a)) {
650                     if (U == NULL) {
651                         U = new UVector(*fStatus);
652                         if (U == NULL) {
653                         	*fStatus = U_MEMORY_ALLOCATION_ERROR;
654                         	goto ExitBuildSTdeleteall;
655                         }
656                     }
657                     setAdd(U, p->fFollowPos);
658                 }
659             }
660 
661             // if U is not empty and not in DStates then
662             int32_t  ux = 0;
663             UBool    UinDstates = FALSE;
664             if (U != NULL) {
665                 U_ASSERT(U->size() > 0);
666                 int  ix;
667                 for (ix=0; ix<fDStates->size(); ix++) {
668                     RBBIStateDescriptor *temp2;
669                     temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
670                     if (setEquals(U, temp2->fPositions)) {
671                         delete U;
672                         U  = temp2->fPositions;
673                         ux = ix;
674                         UinDstates = TRUE;
675                         break;
676                     }
677                 }
678 
679                 // Add U as an unmarked state to Dstates
680                 if (!UinDstates)
681                 {
682                     RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
683                     if (newState == NULL) {
684                     	*fStatus = U_MEMORY_ALLOCATION_ERROR;
685                     }
686                     if (U_FAILURE(*fStatus)) {
687                         goto ExitBuildSTdeleteall;
688                     }
689                     newState->fPositions = U;
690                     fDStates->addElement(newState, *fStatus);
691                     if (U_FAILURE(*fStatus)) {
692                         return;
693                     }
694                     ux = fDStates->size()-1;
695                 }
696 
697                 // Dtran[T, a] := U;
698                 T->fDtran->setElementAt(ux, a);
699             }
700         }
701     }
702     return;
703     // delete local pointers only if error occured.
704 ExitBuildSTdeleteall:
705     delete initialState;
706     delete failState;
707 }
708 
709 
710 
711 //-----------------------------------------------------------------------------
712 //
713 //   flagAcceptingStates    Identify accepting states.
714 //                          First get a list of all of the end marker nodes.
715 //                          Then, for each state s,
716 //                              if s contains one of the end marker nodes in its list of tree positions then
717 //                                  s is an accepting state.
718 //
719 //-----------------------------------------------------------------------------
flagAcceptingStates()720 void     RBBITableBuilder::flagAcceptingStates() {
721     if (U_FAILURE(*fStatus)) {
722         return;
723     }
724     UVector     endMarkerNodes(*fStatus);
725     RBBINode    *endMarker;
726     int32_t     i;
727     int32_t     n;
728 
729     if (U_FAILURE(*fStatus)) {
730         return;
731     }
732 
733     fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
734     if (U_FAILURE(*fStatus)) {
735         return;
736     }
737 
738     for (i=0; i<endMarkerNodes.size(); i++) {
739         endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
740         for (n=0; n<fDStates->size(); n++) {
741             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
742             if (sd->fPositions->indexOf(endMarker) >= 0) {
743                 // Any non-zero value for fAccepting means this is an accepting node.
744                 // The value is what will be returned to the user as the break status.
745                 // If no other value was specified, force it to -1.
746 
747                 if (sd->fAccepting==0) {
748                     // State hasn't been marked as accepting yet.  Do it now.
749                     sd->fAccepting = endMarker->fVal;
750                     if (sd->fAccepting == 0) {
751                         sd->fAccepting = -1;
752                     }
753                 }
754                 if (sd->fAccepting==-1 && endMarker->fVal != 0) {
755                     // Both lookahead and non-lookahead accepting for this state.
756                     // Favor the look-ahead.  Expedient for line break.
757                     // TODO:  need a more elegant resolution for conflicting rules.
758                     sd->fAccepting = endMarker->fVal;
759                 }
760                 // implicit else:
761                 // if sd->fAccepting already had a value other than 0 or -1, leave it be.
762 
763                 // If the end marker node is from a look-ahead rule, set
764                 //   the fLookAhead field or this state also.
765                 if (endMarker->fLookAheadEnd) {
766                     // TODO:  don't change value if already set?
767                     // TODO:  allow for more than one active look-ahead rule in engine.
768                     //        Make value here an index to a side array in engine?
769                     sd->fLookAhead = sd->fAccepting;
770                 }
771             }
772         }
773     }
774 }
775 
776 
777 //-----------------------------------------------------------------------------
778 //
779 //    flagLookAheadStates   Very similar to flagAcceptingStates, above.
780 //
781 //-----------------------------------------------------------------------------
flagLookAheadStates()782 void     RBBITableBuilder::flagLookAheadStates() {
783     if (U_FAILURE(*fStatus)) {
784         return;
785     }
786     UVector     lookAheadNodes(*fStatus);
787     RBBINode    *lookAheadNode;
788     int32_t     i;
789     int32_t     n;
790 
791     fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
792     if (U_FAILURE(*fStatus)) {
793         return;
794     }
795     for (i=0; i<lookAheadNodes.size(); i++) {
796         lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
797 
798         for (n=0; n<fDStates->size(); n++) {
799             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
800             if (sd->fPositions->indexOf(lookAheadNode) >= 0) {
801                 sd->fLookAhead = lookAheadNode->fVal;
802             }
803         }
804     }
805 }
806 
807 
808 
809 
810 //-----------------------------------------------------------------------------
811 //
812 //    flagTaggedStates
813 //
814 //-----------------------------------------------------------------------------
flagTaggedStates()815 void     RBBITableBuilder::flagTaggedStates() {
816     if (U_FAILURE(*fStatus)) {
817         return;
818     }
819     UVector     tagNodes(*fStatus);
820     RBBINode    *tagNode;
821     int32_t     i;
822     int32_t     n;
823 
824     if (U_FAILURE(*fStatus)) {
825         return;
826     }
827     fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
828     if (U_FAILURE(*fStatus)) {
829         return;
830     }
831     for (i=0; i<tagNodes.size(); i++) {                   // For each tag node t (all of 'em)
832         tagNode = (RBBINode *)tagNodes.elementAt(i);
833 
834         for (n=0; n<fDStates->size(); n++) {              //    For each state  s (row in the state table)
835             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
836             if (sd->fPositions->indexOf(tagNode) >= 0) {  //       if  s include the tag node t
837                 sortedAdd(&sd->fTagVals, tagNode->fVal);
838             }
839         }
840     }
841 }
842 
843 
844 
845 
846 //-----------------------------------------------------------------------------
847 //
848 //  mergeRuleStatusVals
849 //
850 //      Update the global table of rule status {tag} values
851 //      The rule builder has a global vector of status values that are common
852 //      for all tables.  Merge the ones from this table into the global set.
853 //
854 //-----------------------------------------------------------------------------
mergeRuleStatusVals()855 void  RBBITableBuilder::mergeRuleStatusVals() {
856     //
857     //  The basic outline of what happens here is this...
858     //
859     //    for each state in this state table
860     //       if the status tag list for this state is in the global statuses list
861     //           record where and
862     //           continue with the next state
863     //       else
864     //           add the tag list for this state to the global list.
865     //
866     int i;
867     int n;
868 
869     // Pre-set a single tag of {0} into the table.
870     //   We will need this as a default, for rule sets with no explicit tagging.
871     if (fRB->fRuleStatusVals->size() == 0) {
872         fRB->fRuleStatusVals->addElement(1, *fStatus);  // Num of statuses in group
873         fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus);  //   and our single status of zero
874     }
875 
876     //    For each state
877     for (n=0; n<fDStates->size(); n++) {
878         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
879         UVector *thisStatesTagValues = sd->fTagVals;
880         if (thisStatesTagValues == NULL) {
881             // No tag values are explicitly associated with this state.
882             //   Set the default tag value.
883             sd->fTagsIdx = 0;
884             continue;
885         }
886 
887         // There are tag(s) associated with this state.
888         //   fTagsIdx will be the index into the global tag list for this state's tag values.
889         //   Initial value of -1 flags that we haven't got it set yet.
890         sd->fTagsIdx = -1;
891         int32_t  thisTagGroupStart = 0;   // indexes into the global rule status vals list
892         int32_t  nextTagGroupStart = 0;
893 
894         // Loop runs once per group of tags in the global list
895         while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
896             thisTagGroupStart = nextTagGroupStart;
897             nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
898             if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
899                 // The number of tags for this state is different from
900                 //    the number of tags in this group from the global list.
901                 //    Continue with the next group from the global list.
902                 continue;
903             }
904             // The lengths match, go ahead and compare the actual tag values
905             //    between this state and the group from the global list.
906             for (i=0; i<thisStatesTagValues->size(); i++) {
907                 if (thisStatesTagValues->elementAti(i) !=
908                     fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
909                     // Mismatch.
910                     break;
911                 }
912             }
913 
914             if (i == thisStatesTagValues->size()) {
915                 // We found a set of tag values in the global list that match
916                 //   those for this state.  Use them.
917                 sd->fTagsIdx = thisTagGroupStart;
918                 break;
919             }
920         }
921 
922         if (sd->fTagsIdx == -1) {
923             // No suitable entry in the global tag list already.  Add one
924             sd->fTagsIdx = fRB->fRuleStatusVals->size();
925             fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
926             for (i=0; i<thisStatesTagValues->size(); i++) {
927                 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
928             }
929         }
930     }
931 }
932 
933 
934 
935 
936 
937 
938 
939 //-----------------------------------------------------------------------------
940 //
941 //  sortedAdd  Add a value to a vector of sorted values (ints).
942 //             Do not replicate entries; if the value is already there, do not
943 //                add a second one.
944 //             Lazily create the vector if it does not already exist.
945 //
946 //-----------------------------------------------------------------------------
sortedAdd(UVector ** vector,int32_t val)947 void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
948     int32_t i;
949 
950     if (*vector == NULL) {
951         *vector = new UVector(*fStatus);
952     }
953     if (*vector == NULL || U_FAILURE(*fStatus)) {
954         return;
955     }
956     UVector *vec = *vector;
957     int32_t  vSize = vec->size();
958     for (i=0; i<vSize; i++) {
959         int32_t valAtI = vec->elementAti(i);
960         if (valAtI == val) {
961             // The value is already in the vector.  Don't add it again.
962             return;
963         }
964         if (valAtI > val) {
965             break;
966         }
967     }
968     vec->insertElementAt(val, i, *fStatus);
969 }
970 
971 
972 
973 //-----------------------------------------------------------------------------
974 //
975 //  setAdd     Set operation on UVector
976 //             dest = dest union source
977 //             Elements may only appear once and must be sorted.
978 //
979 //-----------------------------------------------------------------------------
setAdd(UVector * dest,UVector * source)980 void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
981     int32_t destOriginalSize = dest->size();
982     int32_t sourceSize       = source->size();
983     int32_t di           = 0;
984     MaybeStackArray<void *, 16> destArray, sourceArray;  // Handle small cases without malloc
985     void **destPtr, **sourcePtr;
986     void **destLim, **sourceLim;
987 
988     if (destOriginalSize > destArray.getCapacity()) {
989         if (destArray.resize(destOriginalSize) == NULL) {
990             return;
991         }
992     }
993     destPtr = destArray.getAlias();
994     destLim = destPtr + destOriginalSize;  // destArray.getArrayLimit()?
995 
996     if (sourceSize > sourceArray.getCapacity()) {
997         if (sourceArray.resize(sourceSize) == NULL) {
998             return;
999         }
1000     }
1001     sourcePtr = sourceArray.getAlias();
1002     sourceLim = sourcePtr + sourceSize;  // sourceArray.getArrayLimit()?
1003 
1004     // Avoid multiple "get element" calls by getting the contents into arrays
1005     (void) dest->toArray(destPtr);
1006     (void) source->toArray(sourcePtr);
1007 
1008     dest->setSize(sourceSize+destOriginalSize, *fStatus);
1009 
1010     while (sourcePtr < sourceLim && destPtr < destLim) {
1011         if (*destPtr == *sourcePtr) {
1012             dest->setElementAt(*sourcePtr++, di++);
1013             destPtr++;
1014         }
1015         // This check is required for machines with segmented memory, like i5/OS.
1016         // Direct pointer comparison is not recommended.
1017         else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
1018             dest->setElementAt(*destPtr++, di++);
1019         }
1020         else { /* *sourcePtr < *destPtr */
1021             dest->setElementAt(*sourcePtr++, di++);
1022         }
1023     }
1024 
1025     // At most one of these two cleanup loops will execute
1026     while (destPtr < destLim) {
1027         dest->setElementAt(*destPtr++, di++);
1028     }
1029     while (sourcePtr < sourceLim) {
1030         dest->setElementAt(*sourcePtr++, di++);
1031     }
1032 
1033     dest->setSize(di, *fStatus);
1034 }
1035 
1036 
1037 
1038 //-----------------------------------------------------------------------------
1039 //
1040 //  setEqual    Set operation on UVector.
1041 //              Compare for equality.
1042 //              Elements must be sorted.
1043 //
1044 //-----------------------------------------------------------------------------
setEquals(UVector * a,UVector * b)1045 UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
1046     return a->equals(*b);
1047 }
1048 
1049 
1050 //-----------------------------------------------------------------------------
1051 //
1052 //  printPosSets   Debug function.  Dump Nullable, firstpos, lastpos and followpos
1053 //                 for each node in the tree.
1054 //
1055 //-----------------------------------------------------------------------------
1056 #ifdef RBBI_DEBUG
printPosSets(RBBINode * n)1057 void RBBITableBuilder::printPosSets(RBBINode *n) {
1058     if (n==NULL) {
1059         return;
1060     }
1061     printf("\n");
1062     RBBINode::printNodeHeader();
1063     RBBINode::printNode(n);
1064     RBBIDebugPrintf("         Nullable:  %s\n", n->fNullable?"TRUE":"FALSE");
1065 
1066     RBBIDebugPrintf("         firstpos:  ");
1067     printSet(n->fFirstPosSet);
1068 
1069     RBBIDebugPrintf("         lastpos:   ");
1070     printSet(n->fLastPosSet);
1071 
1072     RBBIDebugPrintf("         followpos: ");
1073     printSet(n->fFollowPos);
1074 
1075     printPosSets(n->fLeftChild);
1076     printPosSets(n->fRightChild);
1077 }
1078 #endif
1079 
1080 
1081 
1082 //-----------------------------------------------------------------------------
1083 //
1084 //   getTableSize()    Calculate the size of the runtime form of this
1085 //                     state transition table.
1086 //
1087 //-----------------------------------------------------------------------------
getTableSize() const1088 int32_t  RBBITableBuilder::getTableSize() const {
1089     int32_t    size = 0;
1090     int32_t    numRows;
1091     int32_t    numCols;
1092     int32_t    rowSize;
1093 
1094     if (fTree == NULL) {
1095         return 0;
1096     }
1097 
1098     size    = sizeof(RBBIStateTable) - 4;    // The header, with no rows to the table.
1099 
1100     numRows = fDStates->size();
1101     numCols = fRB->fSetBuilder->getNumCharCategories();
1102 
1103     //  Note  The declaration of RBBIStateTableRow is for a table of two columns.
1104     //        Therefore we subtract two from numCols when determining
1105     //        how much storage to add to a row for the total columns.
1106     rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2);
1107     size   += numRows * rowSize;
1108     return size;
1109 }
1110 
1111 
1112 
1113 //-----------------------------------------------------------------------------
1114 //
1115 //   exportTable()    export the state transition table in the format required
1116 //                    by the runtime engine.  getTableSize() bytes of memory
1117 //                    must be available at the output address "where".
1118 //
1119 //-----------------------------------------------------------------------------
exportTable(void * where)1120 void RBBITableBuilder::exportTable(void *where) {
1121     RBBIStateTable    *table = (RBBIStateTable *)where;
1122     uint32_t           state;
1123     int                col;
1124 
1125     if (U_FAILURE(*fStatus) || fTree == NULL) {
1126         return;
1127     }
1128 
1129     if (fRB->fSetBuilder->getNumCharCategories() > 0x7fff ||
1130         fDStates->size() > 0x7fff) {
1131         *fStatus = U_BRK_INTERNAL_ERROR;
1132         return;
1133     }
1134 
1135     table->fRowLen    = sizeof(RBBIStateTableRow) +
1136                             sizeof(uint16_t) * (fRB->fSetBuilder->getNumCharCategories() - 2);
1137     table->fNumStates = fDStates->size();
1138     table->fFlags     = 0;
1139     if (fRB->fLookAheadHardBreak) {
1140         table->fFlags  |= RBBI_LOOKAHEAD_HARD_BREAK;
1141     }
1142     if (fRB->fSetBuilder->sawBOF()) {
1143         table->fFlags  |= RBBI_BOF_REQUIRED;
1144     }
1145     table->fReserved  = 0;
1146 
1147     for (state=0; state<table->fNumStates; state++) {
1148         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1149         RBBIStateTableRow   *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1150         U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767);
1151         U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
1152         row->fAccepting = (int16_t)sd->fAccepting;
1153         row->fLookAhead = (int16_t)sd->fLookAhead;
1154         row->fTagIdx    = (int16_t)sd->fTagsIdx;
1155         for (col=0; col<fRB->fSetBuilder->getNumCharCategories(); col++) {
1156             row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
1157         }
1158     }
1159 }
1160 
1161 
1162 
1163 //-----------------------------------------------------------------------------
1164 //
1165 //   printSet    Debug function.   Print the contents of a UVector
1166 //
1167 //-----------------------------------------------------------------------------
1168 #ifdef RBBI_DEBUG
printSet(UVector * s)1169 void RBBITableBuilder::printSet(UVector *s) {
1170     int32_t  i;
1171     for (i=0; i<s->size(); i++) {
1172         const RBBINode *v = static_cast<const RBBINode *>(s->elementAt(i));
1173         RBBIDebugPrintf("%5d", v==NULL? -1 : v->fSerialNum);
1174     }
1175     RBBIDebugPrintf("\n");
1176 }
1177 #endif
1178 
1179 
1180 //-----------------------------------------------------------------------------
1181 //
1182 //   printStates    Debug Function.  Dump the fully constructed state transition table.
1183 //
1184 //-----------------------------------------------------------------------------
1185 #ifdef RBBI_DEBUG
printStates()1186 void RBBITableBuilder::printStates() {
1187     int     c;    // input "character"
1188     int     n;    // state number
1189 
1190     RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
1191     RBBIDebugPrintf("      | Acc  LA    Tag");
1192     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1193         RBBIDebugPrintf(" %2d", c);
1194     }
1195     RBBIDebugPrintf("\n");
1196     RBBIDebugPrintf("      |---------------");
1197     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1198         RBBIDebugPrintf("---");
1199     }
1200     RBBIDebugPrintf("\n");
1201 
1202     for (n=0; n<fDStates->size(); n++) {
1203         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
1204         RBBIDebugPrintf("  %3d | " , n);
1205         RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
1206         for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1207             RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c));
1208         }
1209         RBBIDebugPrintf("\n");
1210     }
1211     RBBIDebugPrintf("\n\n");
1212 }
1213 #endif
1214 
1215 
1216 
1217 //-----------------------------------------------------------------------------
1218 //
1219 //   printRuleStatusTable    Debug Function.  Dump the common rule status table
1220 //
1221 //-----------------------------------------------------------------------------
1222 #ifdef RBBI_DEBUG
printRuleStatusTable()1223 void RBBITableBuilder::printRuleStatusTable() {
1224     int32_t  thisRecord = 0;
1225     int32_t  nextRecord = 0;
1226     int      i;
1227     UVector  *tbl = fRB->fRuleStatusVals;
1228 
1229     RBBIDebugPrintf("index |  tags \n");
1230     RBBIDebugPrintf("-------------------\n");
1231 
1232     while (nextRecord < tbl->size()) {
1233         thisRecord = nextRecord;
1234         nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
1235         RBBIDebugPrintf("%4d   ", thisRecord);
1236         for (i=thisRecord+1; i<nextRecord; i++) {
1237             RBBIDebugPrintf("  %5d", tbl->elementAti(i));
1238         }
1239         RBBIDebugPrintf("\n");
1240     }
1241     RBBIDebugPrintf("\n\n");
1242 }
1243 #endif
1244 
1245 
1246 //-----------------------------------------------------------------------------
1247 //
1248 //   RBBIStateDescriptor     Methods.  This is a very struct-like class
1249 //                           Most access is directly to the fields.
1250 //
1251 //-----------------------------------------------------------------------------
1252 
RBBIStateDescriptor(int lastInputSymbol,UErrorCode * fStatus)1253 RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
1254     fMarked    = FALSE;
1255     fAccepting = 0;
1256     fLookAhead = 0;
1257     fTagsIdx   = 0;
1258     fTagVals   = NULL;
1259     fPositions = NULL;
1260     fDtran     = NULL;
1261 
1262     fDtran     = new UVector(lastInputSymbol+1, *fStatus);
1263     if (U_FAILURE(*fStatus)) {
1264         return;
1265     }
1266     if (fDtran == NULL) {
1267         *fStatus = U_MEMORY_ALLOCATION_ERROR;
1268         return;
1269     }
1270     fDtran->setSize(lastInputSymbol+1, *fStatus);    // fDtran needs to be pre-sized.
1271                                            //   It is indexed by input symbols, and will
1272                                            //   hold  the next state number for each
1273                                            //   symbol.
1274 }
1275 
1276 
~RBBIStateDescriptor()1277 RBBIStateDescriptor::~RBBIStateDescriptor() {
1278     delete       fPositions;
1279     delete       fDtran;
1280     delete       fTagVals;
1281     fPositions = NULL;
1282     fDtran     = NULL;
1283     fTagVals   = NULL;
1284 }
1285 
1286 U_NAMESPACE_END
1287 
1288 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1289