• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (c) 2002-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 */
9 //
10 //  rbbitblb.cpp
11 //
12 
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_BREAK_ITERATION
17 
18 #include "unicode/unistr.h"
19 #include "rbbitblb.h"
20 #include "rbbirb.h"
21 #include "rbbiscan.h"
22 #include "rbbisetb.h"
23 #include "rbbidata.h"
24 #include "cstring.h"
25 #include "uassert.h"
26 #include "uvectr32.h"
27 #include "cmemory.h"
28 
29 U_NAMESPACE_BEGIN
30 
RBBITableBuilder(RBBIRuleBuilder * rb,RBBINode ** rootNode,UErrorCode & status)31 RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) :
32         fRB(rb),
33         fTree(*rootNode),
34         fStatus(&status),
35         fDStates(nullptr),
36         fSafeTable(nullptr) {
37     if (U_FAILURE(status)) {
38         return;
39     }
40     // fDStates is UVector<RBBIStateDescriptor *>
41     fDStates = new UVector(status);
42     if (U_SUCCESS(status) && fDStates == nullptr ) {
43         status = U_MEMORY_ALLOCATION_ERROR;
44     }
45 }
46 
47 
48 
~RBBITableBuilder()49 RBBITableBuilder::~RBBITableBuilder() {
50     int i;
51     for (i=0; i<fDStates->size(); i++) {
52         delete (RBBIStateDescriptor *)fDStates->elementAt(i);
53     }
54     delete fDStates;
55     delete fSafeTable;
56     delete fLookAheadRuleMap;
57 }
58 
59 
60 //-----------------------------------------------------------------------------
61 //
62 //   RBBITableBuilder::buildForwardTable  -  This is the main function for building
63 //                               the DFA state transition table from the RBBI rules parse tree.
64 //
65 //-----------------------------------------------------------------------------
buildForwardTable()66 void  RBBITableBuilder::buildForwardTable() {
67 
68     if (U_FAILURE(*fStatus)) {
69         return;
70     }
71 
72     // If there were no rules, just return.  This situation can easily arise
73     //   for the reverse rules.
74     if (fTree==NULL) {
75         return;
76     }
77 
78     //
79     // Walk through the tree, replacing any references to $variables with a copy of the
80     //   parse tree for the substition expression.
81     //
82     fTree = fTree->flattenVariables();
83 #ifdef RBBI_DEBUG
84     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
85         RBBIDebugPuts("\nParse tree after flattening variable references.");
86         RBBINode::printTree(fTree, TRUE);
87     }
88 #endif
89 
90     //
91     // If the rules contained any references to {bof}
92     //   add a {bof} <cat> <former root of tree> to the
93     //   tree.  Means that all matches must start out with the
94     //   {bof} fake character.
95     //
96     if (fRB->fSetBuilder->sawBOF()) {
97         RBBINode *bofTop    = new RBBINode(RBBINode::opCat);
98         RBBINode *bofLeaf   = new RBBINode(RBBINode::leafChar);
99         // Delete and exit if memory allocation failed.
100         if (bofTop == NULL || bofLeaf == NULL) {
101             *fStatus = U_MEMORY_ALLOCATION_ERROR;
102             delete bofTop;
103             delete bofLeaf;
104             return;
105         }
106         bofTop->fLeftChild  = bofLeaf;
107         bofTop->fRightChild = fTree;
108         bofLeaf->fParent    = bofTop;
109         bofLeaf->fVal       = 2;      // Reserved value for {bof}.
110         fTree               = bofTop;
111     }
112 
113     //
114     // Add a unique right-end marker to the expression.
115     //   Appears as a cat-node, left child being the original tree,
116     //   right child being the end marker.
117     //
118     RBBINode *cn = new RBBINode(RBBINode::opCat);
119     // Exit if memory allocation failed.
120     if (cn == NULL) {
121         *fStatus = U_MEMORY_ALLOCATION_ERROR;
122         return;
123     }
124     cn->fLeftChild = fTree;
125     fTree->fParent = cn;
126     RBBINode *endMarkerNode = cn->fRightChild = new RBBINode(RBBINode::endMark);
127     // Delete and exit if memory allocation failed.
128     if (cn->fRightChild == NULL) {
129         *fStatus = U_MEMORY_ALLOCATION_ERROR;
130         delete cn;
131         return;
132     }
133     cn->fRightChild->fParent = cn;
134     fTree = cn;
135 
136     //
137     //  Replace all references to UnicodeSets with the tree for the equivalent
138     //      expression.
139     //
140     fTree->flattenSets();
141 #ifdef RBBI_DEBUG
142     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
143         RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
144         RBBINode::printTree(fTree, TRUE);
145     }
146 #endif
147 
148 
149     //
150     // calculate the functions nullable, firstpos, lastpos and followpos on
151     // nodes in the parse tree.
152     //    See the alogrithm description in Aho.
153     //    Understanding how this works by looking at the code alone will be
154     //       nearly impossible.
155     //
156     calcNullable(fTree);
157     calcFirstPos(fTree);
158     calcLastPos(fTree);
159     calcFollowPos(fTree);
160     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
161         RBBIDebugPuts("\n");
162         printPosSets(fTree);
163     }
164 
165     //
166     //  For "chained" rules, modify the followPos sets
167     //
168     if (fRB->fChainRules) {
169         calcChainedFollowPos(fTree, endMarkerNode);
170     }
171 
172     //
173     //  BOF (start of input) test fixup.
174     //
175     if (fRB->fSetBuilder->sawBOF()) {
176         bofFixup();
177     }
178 
179     //
180     // Build the DFA state transition tables.
181     //
182     buildStateTable();
183     mapLookAheadRules();
184     flagAcceptingStates();
185     flagLookAheadStates();
186     flagTaggedStates();
187 
188     //
189     // Update the global table of rule status {tag} values
190     // The rule builder has a global vector of status values that are common
191     //    for all tables.  Merge the ones from this table into the global set.
192     //
193     mergeRuleStatusVals();
194 }
195 
196 
197 
198 //-----------------------------------------------------------------------------
199 //
200 //   calcNullable.    Impossible to explain succinctly.  See Aho, section 3.9
201 //
202 //-----------------------------------------------------------------------------
calcNullable(RBBINode * n)203 void RBBITableBuilder::calcNullable(RBBINode *n) {
204     if (n == NULL) {
205         return;
206     }
207     if (n->fType == RBBINode::setRef ||
208         n->fType == RBBINode::endMark ) {
209         // These are non-empty leaf node types.
210         n->fNullable = FALSE;
211         return;
212     }
213 
214     if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
215         // Lookahead marker node.  It's a leaf, so no recursion on children.
216         // It's nullable because it does not match any literal text from the input stream.
217         n->fNullable = TRUE;
218         return;
219     }
220 
221 
222     // The node is not a leaf.
223     //  Calculate nullable on its children.
224     calcNullable(n->fLeftChild);
225     calcNullable(n->fRightChild);
226 
227     // Apply functions from table 3.40 in Aho
228     if (n->fType == RBBINode::opOr) {
229         n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
230     }
231     else if (n->fType == RBBINode::opCat) {
232         n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
233     }
234     else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
235         n->fNullable = TRUE;
236     }
237     else {
238         n->fNullable = FALSE;
239     }
240 }
241 
242 
243 
244 
245 //-----------------------------------------------------------------------------
246 //
247 //   calcFirstPos.    Impossible to explain succinctly.  See Aho, section 3.9
248 //
249 //-----------------------------------------------------------------------------
calcFirstPos(RBBINode * n)250 void RBBITableBuilder::calcFirstPos(RBBINode *n) {
251     if (n == NULL) {
252         return;
253     }
254     if (n->fType == RBBINode::leafChar  ||
255         n->fType == RBBINode::endMark   ||
256         n->fType == RBBINode::lookAhead ||
257         n->fType == RBBINode::tag) {
258         // These are non-empty leaf node types.
259         // Note: In order to maintain the sort invariant on the set,
260         // this function should only be called on a node whose set is
261         // empty to start with.
262         n->fFirstPosSet->addElement(n, *fStatus);
263         return;
264     }
265 
266     // The node is not a leaf.
267     //  Calculate firstPos on its children.
268     calcFirstPos(n->fLeftChild);
269     calcFirstPos(n->fRightChild);
270 
271     // Apply functions from table 3.40 in Aho
272     if (n->fType == RBBINode::opOr) {
273         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
274         setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
275     }
276     else if (n->fType == RBBINode::opCat) {
277         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
278         if (n->fLeftChild->fNullable) {
279             setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
280         }
281     }
282     else if (n->fType == RBBINode::opStar ||
283              n->fType == RBBINode::opQuestion ||
284              n->fType == RBBINode::opPlus) {
285         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
286     }
287 }
288 
289 
290 
291 //-----------------------------------------------------------------------------
292 //
293 //   calcLastPos.    Impossible to explain succinctly.  See Aho, section 3.9
294 //
295 //-----------------------------------------------------------------------------
calcLastPos(RBBINode * n)296 void RBBITableBuilder::calcLastPos(RBBINode *n) {
297     if (n == NULL) {
298         return;
299     }
300     if (n->fType == RBBINode::leafChar  ||
301         n->fType == RBBINode::endMark   ||
302         n->fType == RBBINode::lookAhead ||
303         n->fType == RBBINode::tag) {
304         // These are non-empty leaf node types.
305         // Note: In order to maintain the sort invariant on the set,
306         // this function should only be called on a node whose set is
307         // empty to start with.
308         n->fLastPosSet->addElement(n, *fStatus);
309         return;
310     }
311 
312     // The node is not a leaf.
313     //  Calculate lastPos on its children.
314     calcLastPos(n->fLeftChild);
315     calcLastPos(n->fRightChild);
316 
317     // Apply functions from table 3.40 in Aho
318     if (n->fType == RBBINode::opOr) {
319         setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
320         setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
321     }
322     else if (n->fType == RBBINode::opCat) {
323         setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
324         if (n->fRightChild->fNullable) {
325             setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
326         }
327     }
328     else if (n->fType == RBBINode::opStar     ||
329              n->fType == RBBINode::opQuestion ||
330              n->fType == RBBINode::opPlus) {
331         setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
332     }
333 }
334 
335 
336 
337 //-----------------------------------------------------------------------------
338 //
339 //   calcFollowPos.    Impossible to explain succinctly.  See Aho, section 3.9
340 //
341 //-----------------------------------------------------------------------------
calcFollowPos(RBBINode * n)342 void RBBITableBuilder::calcFollowPos(RBBINode *n) {
343     if (n == NULL ||
344         n->fType == RBBINode::leafChar ||
345         n->fType == RBBINode::endMark) {
346         return;
347     }
348 
349     calcFollowPos(n->fLeftChild);
350     calcFollowPos(n->fRightChild);
351 
352     // Aho rule #1
353     if (n->fType == RBBINode::opCat) {
354         RBBINode *i;   // is 'i' in Aho's description
355         uint32_t     ix;
356 
357         UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
358 
359         for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
360             i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
361             setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
362         }
363     }
364 
365     // Aho rule #2
366     if (n->fType == RBBINode::opStar ||
367         n->fType == RBBINode::opPlus) {
368         RBBINode   *i;  // again, n and i are the names from Aho's description.
369         uint32_t    ix;
370 
371         for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
372             i = (RBBINode *)n->fLastPosSet->elementAt(ix);
373             setAdd(i->fFollowPos, n->fFirstPosSet);
374         }
375     }
376 
377 
378 
379 }
380 
381 //-----------------------------------------------------------------------------
382 //
383 //    addRuleRootNodes    Recursively walk a parse tree, adding all nodes flagged
384 //                        as roots of a rule to a destination vector.
385 //
386 //-----------------------------------------------------------------------------
addRuleRootNodes(UVector * dest,RBBINode * node)387 void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) {
388     if (node == NULL || U_FAILURE(*fStatus)) {
389         return;
390     }
391     if (node->fRuleRoot) {
392         dest->addElement(node, *fStatus);
393         // Note: rules cannot nest. If we found a rule start node,
394         //       no child node can also be a start node.
395         return;
396     }
397     addRuleRootNodes(dest, node->fLeftChild);
398     addRuleRootNodes(dest, node->fRightChild);
399 }
400 
401 //-----------------------------------------------------------------------------
402 //
403 //   calcChainedFollowPos.    Modify the previously calculated followPos sets
404 //                            to implement rule chaining.  NOT described by Aho
405 //
406 //-----------------------------------------------------------------------------
calcChainedFollowPos(RBBINode * tree,RBBINode * endMarkNode)407 void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree, RBBINode *endMarkNode) {
408 
409     UVector         leafNodes(*fStatus);
410     if (U_FAILURE(*fStatus)) {
411         return;
412     }
413 
414     // get a list all leaf nodes
415     tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
416     if (U_FAILURE(*fStatus)) {
417         return;
418     }
419 
420     // Collect all leaf nodes that can start matches for rules
421     // with inbound chaining enabled, which is the union of the
422     // firstPosition sets from each of the rule root nodes.
423 
424     UVector ruleRootNodes(*fStatus);
425     addRuleRootNodes(&ruleRootNodes, tree);
426 
427     UVector matchStartNodes(*fStatus);
428     for (int j=0; j<ruleRootNodes.size(); ++j) {
429         RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(j));
430         if (node->fChainIn) {
431             setAdd(&matchStartNodes, node->fFirstPosSet);
432         }
433     }
434     if (U_FAILURE(*fStatus)) {
435         return;
436     }
437 
438     int32_t  endNodeIx;
439     int32_t  startNodeIx;
440 
441     for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
442         RBBINode *endNode   = (RBBINode *)leafNodes.elementAt(endNodeIx);
443 
444         // Identify leaf nodes that correspond to overall rule match positions.
445         // These include the endMarkNode in their followPos sets.
446         //
447         // Note: do not consider other end marker nodes, those that are added to
448         //       look-ahead rules. These can't chain; a match immediately stops
449         //       further matching. This leaves exactly one end marker node, the one
450         //       at the end of the complete tree.
451 
452         if (!endNode->fFollowPos->contains(endMarkNode)) {
453             continue;
454         }
455 
456         // We've got a node that can end a match.
457 
458         // !!LBCMNoChain implementation:  If this node's val correspond to
459         // the Line Break $CM char class, don't chain from it.
460         // TODO:  Remove this. !!LBCMNoChain is deprecated, and is not used
461         //        by any of the standard ICU rules.
462         if (fRB->fLBCMNoChain) {
463             UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
464             if (c != -1) {
465                 // c == -1 occurs with sets containing only the {eof} marker string.
466                 ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
467                 if (cLBProp == U_LB_COMBINING_MARK) {
468                     continue;
469                 }
470             }
471         }
472 
473         // Now iterate over the nodes that can start a match, looking for ones
474         //   with the same char class as our ending node.
475         RBBINode *startNode;
476         for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) {
477             startNode = (RBBINode *)matchStartNodes.elementAt(startNodeIx);
478             if (startNode->fType != RBBINode::leafChar) {
479                 continue;
480             }
481 
482             if (endNode->fVal == startNode->fVal) {
483                 // The end val (character class) of one possible match is the
484                 //   same as the start of another.
485 
486                 // Add all nodes from the followPos of the start node to the
487                 //  followPos set of the end node, which will have the effect of
488                 //  letting matches transition from a match state at endNode
489                 //  to the second char of a match starting with startNode.
490                 setAdd(endNode->fFollowPos, startNode->fFollowPos);
491             }
492         }
493     }
494 }
495 
496 
497 //-----------------------------------------------------------------------------
498 //
499 //   bofFixup.    Fixup for state tables that include {bof} beginning of input testing.
500 //                Do an swizzle similar to chaining, modifying the followPos set of
501 //                the bofNode to include the followPos nodes from other {bot} nodes
502 //                scattered through the tree.
503 //
504 //                This function has much in common with calcChainedFollowPos().
505 //
506 //-----------------------------------------------------------------------------
bofFixup()507 void RBBITableBuilder::bofFixup() {
508 
509     if (U_FAILURE(*fStatus)) {
510         return;
511     }
512 
513     //   The parse tree looks like this ...
514     //         fTree root  --->       <cat>
515     //                               /     \       .
516     //                            <cat>   <#end node>
517     //                           /     \  .
518     //                     <bofNode>   rest
519     //                               of tree
520     //
521     //    We will be adding things to the followPos set of the <bofNode>
522     //
523     RBBINode  *bofNode = fTree->fLeftChild->fLeftChild;
524     U_ASSERT(bofNode->fType == RBBINode::leafChar);
525     U_ASSERT(bofNode->fVal == 2);
526 
527     // Get all nodes that can be the start a match of the user-written rules
528     //  (excluding the fake bofNode)
529     //  We want the nodes that can start a match in the
530     //     part labeled "rest of tree"
531     //
532     UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
533 
534     RBBINode *startNode;
535     int       startNodeIx;
536     for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
537         startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
538         if (startNode->fType != RBBINode::leafChar) {
539             continue;
540         }
541 
542         if (startNode->fVal == bofNode->fVal) {
543             //  We found a leaf node corresponding to a {bof} that was
544             //    explicitly written into a rule.
545             //  Add everything from the followPos set of this node to the
546             //    followPos set of the fake bofNode at the start of the tree.
547             //
548             setAdd(bofNode->fFollowPos, startNode->fFollowPos);
549         }
550     }
551 }
552 
553 //-----------------------------------------------------------------------------
554 //
555 //   buildStateTable()    Determine the set of runtime DFA states and the
556 //                        transition tables for these states, by the algorithm
557 //                        of fig. 3.44 in Aho.
558 //
559 //                        Most of the comments are quotes of Aho's psuedo-code.
560 //
561 //-----------------------------------------------------------------------------
buildStateTable()562 void RBBITableBuilder::buildStateTable() {
563     if (U_FAILURE(*fStatus)) {
564         return;
565     }
566     RBBIStateDescriptor *failState;
567     // Set it to NULL to avoid uninitialized warning
568     RBBIStateDescriptor *initialState = NULL;
569     //
570     // Add a dummy state 0 - the stop state.  Not from Aho.
571     int      lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
572     failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
573     if (failState == NULL) {
574         *fStatus = U_MEMORY_ALLOCATION_ERROR;
575         goto ExitBuildSTdeleteall;
576     }
577     failState->fPositions = new UVector(*fStatus);
578     if (failState->fPositions == NULL) {
579         *fStatus = U_MEMORY_ALLOCATION_ERROR;
580     }
581     if (failState->fPositions == NULL || U_FAILURE(*fStatus)) {
582         goto ExitBuildSTdeleteall;
583     }
584     fDStates->addElement(failState, *fStatus);
585     if (U_FAILURE(*fStatus)) {
586         goto ExitBuildSTdeleteall;
587     }
588 
589     // initially, the only unmarked state in Dstates is firstpos(root),
590     //       where toot is the root of the syntax tree for (r)#;
591     initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
592     if (initialState == NULL) {
593         *fStatus = U_MEMORY_ALLOCATION_ERROR;
594     }
595     if (U_FAILURE(*fStatus)) {
596         goto ExitBuildSTdeleteall;
597     }
598     initialState->fPositions = new UVector(*fStatus);
599     if (initialState->fPositions == NULL) {
600         *fStatus = U_MEMORY_ALLOCATION_ERROR;
601     }
602     if (U_FAILURE(*fStatus)) {
603         goto ExitBuildSTdeleteall;
604     }
605     setAdd(initialState->fPositions, fTree->fFirstPosSet);
606     fDStates->addElement(initialState, *fStatus);
607     if (U_FAILURE(*fStatus)) {
608         goto ExitBuildSTdeleteall;
609     }
610 
611     // while there is an unmarked state T in Dstates do begin
612     for (;;) {
613         RBBIStateDescriptor *T = NULL;
614         int32_t              tx;
615         for (tx=1; tx<fDStates->size(); tx++) {
616             RBBIStateDescriptor *temp;
617             temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
618             if (temp->fMarked == FALSE) {
619                 T = temp;
620                 break;
621             }
622         }
623         if (T == NULL) {
624             break;
625         }
626 
627         // mark T;
628         T->fMarked = TRUE;
629 
630         // for each input symbol a do begin
631         int32_t  a;
632         for (a = 1; a<=lastInputSymbol; a++) {
633             // let U be the set of positions that are in followpos(p)
634             //    for some position p in T
635             //    such that the symbol at position p is a;
636             UVector    *U = NULL;
637             RBBINode   *p;
638             int32_t     px;
639             for (px=0; px<T->fPositions->size(); px++) {
640                 p = (RBBINode *)T->fPositions->elementAt(px);
641                 if ((p->fType == RBBINode::leafChar) &&  (p->fVal == a)) {
642                     if (U == NULL) {
643                         U = new UVector(*fStatus);
644                         if (U == NULL) {
645 				*fStatus = U_MEMORY_ALLOCATION_ERROR;
646 				goto ExitBuildSTdeleteall;
647                         }
648                     }
649                     setAdd(U, p->fFollowPos);
650                 }
651             }
652 
653             // if U is not empty and not in DStates then
654             int32_t  ux = 0;
655             UBool    UinDstates = FALSE;
656             if (U != NULL) {
657                 U_ASSERT(U->size() > 0);
658                 int  ix;
659                 for (ix=0; ix<fDStates->size(); ix++) {
660                     RBBIStateDescriptor *temp2;
661                     temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
662                     if (setEquals(U, temp2->fPositions)) {
663                         delete U;
664                         U  = temp2->fPositions;
665                         ux = ix;
666                         UinDstates = TRUE;
667                         break;
668                     }
669                 }
670 
671                 // Add U as an unmarked state to Dstates
672                 if (!UinDstates)
673                 {
674                     RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
675                     if (newState == NULL) {
676 			*fStatus = U_MEMORY_ALLOCATION_ERROR;
677                     }
678                     if (U_FAILURE(*fStatus)) {
679                         goto ExitBuildSTdeleteall;
680                     }
681                     newState->fPositions = U;
682                     fDStates->addElement(newState, *fStatus);
683                     if (U_FAILURE(*fStatus)) {
684                         return;
685                     }
686                     ux = fDStates->size()-1;
687                 }
688 
689                 // Dtran[T, a] := U;
690                 T->fDtran->setElementAt(ux, a);
691             }
692         }
693     }
694     return;
695     // delete local pointers only if error occured.
696 ExitBuildSTdeleteall:
697     delete initialState;
698     delete failState;
699 }
700 
701 
702 /**
703  * mapLookAheadRules
704  *
705  */
mapLookAheadRules()706 void RBBITableBuilder::mapLookAheadRules() {
707     fLookAheadRuleMap =  new UVector32(fRB->fScanner->numRules() + 1, *fStatus);
708     if (fLookAheadRuleMap == nullptr) {
709         *fStatus = U_MEMORY_ALLOCATION_ERROR;
710     }
711     if (U_FAILURE(*fStatus)) {
712         return;
713     }
714     fLookAheadRuleMap->setSize(fRB->fScanner->numRules() + 1);
715     int32_t laSlotsInUse = 0;
716 
717     for (int32_t n=0; n<fDStates->size(); n++) {
718         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
719         int32_t laSlotForState = 0;
720 
721         // Establish the look-ahead slot for this state, if the state covers
722         // any look-ahead nodes - corresponding to the '/' in look-ahead rules.
723 
724         // If any of the look-ahead nodes already have a slot assigned, use it,
725         // otherwise assign a new one.
726 
727         bool sawLookAheadNode = false;
728         for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) {
729             RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos));
730             if (node->fType != RBBINode::NodeType::lookAhead) {
731                 continue;
732             }
733             sawLookAheadNode = true;
734             int32_t ruleNum = node->fVal;     // Set when rule was originally parsed.
735             U_ASSERT(ruleNum < fLookAheadRuleMap->size());
736             U_ASSERT(ruleNum > 0);
737             int32_t laSlot = fLookAheadRuleMap->elementAti(ruleNum);
738             if (laSlot != 0) {
739                 if (laSlotForState == 0) {
740                     laSlotForState = laSlot;
741                 } else {
742                     // TODO: figure out if this can fail, change to setting an error code if so.
743                     U_ASSERT(laSlot == laSlotForState);
744                 }
745             }
746         }
747         if (!sawLookAheadNode) {
748             continue;
749         }
750 
751         if (laSlotForState == 0) {
752             laSlotForState = ++laSlotsInUse;
753         }
754 
755         // For each look ahead node covered by this state,
756         // set the mapping from the node's rule number to the look ahead slot.
757         // There can be multiple nodes/rule numbers going to the same la slot.
758 
759         for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) {
760             RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos));
761             if (node->fType != RBBINode::NodeType::lookAhead) {
762                 continue;
763             }
764             int32_t ruleNum = node->fVal;     // Set when rule was originally parsed.
765             int32_t existingVal = fLookAheadRuleMap->elementAti(ruleNum);
766             (void)existingVal;
767             U_ASSERT(existingVal == 0 || existingVal == laSlotForState);
768             fLookAheadRuleMap->setElementAt(laSlotForState, ruleNum);
769         }
770     }
771 
772 }
773 
774 //-----------------------------------------------------------------------------
775 //
776 //   flagAcceptingStates    Identify accepting states.
777 //                          First get a list of all of the end marker nodes.
778 //                          Then, for each state s,
779 //                              if s contains one of the end marker nodes in its list of tree positions then
780 //                                  s is an accepting state.
781 //
782 //-----------------------------------------------------------------------------
flagAcceptingStates()783 void     RBBITableBuilder::flagAcceptingStates() {
784     if (U_FAILURE(*fStatus)) {
785         return;
786     }
787     UVector     endMarkerNodes(*fStatus);
788     RBBINode    *endMarker;
789     int32_t     i;
790     int32_t     n;
791 
792     if (U_FAILURE(*fStatus)) {
793         return;
794     }
795 
796     fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
797     if (U_FAILURE(*fStatus)) {
798         return;
799     }
800 
801     for (i=0; i<endMarkerNodes.size(); i++) {
802         endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
803         for (n=0; n<fDStates->size(); n++) {
804             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
805             if (sd->fPositions->indexOf(endMarker) >= 0) {
806                 // Any non-zero value for fAccepting means this is an accepting node.
807                 // The value is what will be returned to the user as the break status.
808                 // If no other value was specified, force it to -1.
809 
810                 if (sd->fAccepting==0) {
811                     // State hasn't been marked as accepting yet.  Do it now.
812                     sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
813                     if (sd->fAccepting == 0) {
814                         sd->fAccepting = -1;
815                     }
816                 }
817                 if (sd->fAccepting==-1 && endMarker->fVal != 0) {
818                     // Both lookahead and non-lookahead accepting for this state.
819                     // Favor the look-ahead, because a look-ahead match needs to
820                     // immediately stop the run-time engine. First match, not longest.
821                     sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
822                 }
823                 // implicit else:
824                 // if sd->fAccepting already had a value other than 0 or -1, leave it be.
825             }
826         }
827     }
828 }
829 
830 
831 //-----------------------------------------------------------------------------
832 //
833 //    flagLookAheadStates   Very similar to flagAcceptingStates, above.
834 //
835 //-----------------------------------------------------------------------------
flagLookAheadStates()836 void     RBBITableBuilder::flagLookAheadStates() {
837     if (U_FAILURE(*fStatus)) {
838         return;
839     }
840     UVector     lookAheadNodes(*fStatus);
841     RBBINode    *lookAheadNode;
842     int32_t     i;
843     int32_t     n;
844 
845     fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
846     if (U_FAILURE(*fStatus)) {
847         return;
848     }
849     for (i=0; i<lookAheadNodes.size(); i++) {
850         lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
851         U_ASSERT(lookAheadNode->fType == RBBINode::NodeType::lookAhead);
852 
853         for (n=0; n<fDStates->size(); n++) {
854             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
855             int32_t positionsIdx = sd->fPositions->indexOf(lookAheadNode);
856             if (positionsIdx >= 0) {
857                 U_ASSERT(lookAheadNode == sd->fPositions->elementAt(positionsIdx));
858                 int32_t lookaheadSlot = fLookAheadRuleMap->elementAti(lookAheadNode->fVal);
859                 U_ASSERT(sd->fLookAhead == 0 || sd->fLookAhead == lookaheadSlot);
860                 // if (sd->fLookAhead != 0 && sd->fLookAhead != lookaheadSlot) {
861                 //     printf("%s:%d Bingo. sd->fLookAhead:%d   lookaheadSlot:%d\n",
862                 //            __FILE__, __LINE__, sd->fLookAhead, lookaheadSlot);
863                 // }
864                 sd->fLookAhead = lookaheadSlot;
865             }
866         }
867     }
868 }
869 
870 
871 
872 
873 //-----------------------------------------------------------------------------
874 //
875 //    flagTaggedStates
876 //
877 //-----------------------------------------------------------------------------
flagTaggedStates()878 void     RBBITableBuilder::flagTaggedStates() {
879     if (U_FAILURE(*fStatus)) {
880         return;
881     }
882     UVector     tagNodes(*fStatus);
883     RBBINode    *tagNode;
884     int32_t     i;
885     int32_t     n;
886 
887     if (U_FAILURE(*fStatus)) {
888         return;
889     }
890     fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
891     if (U_FAILURE(*fStatus)) {
892         return;
893     }
894     for (i=0; i<tagNodes.size(); i++) {                   // For each tag node t (all of 'em)
895         tagNode = (RBBINode *)tagNodes.elementAt(i);
896 
897         for (n=0; n<fDStates->size(); n++) {              //    For each state  s (row in the state table)
898             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
899             if (sd->fPositions->indexOf(tagNode) >= 0) {  //       if  s include the tag node t
900                 sortedAdd(&sd->fTagVals, tagNode->fVal);
901             }
902         }
903     }
904 }
905 
906 
907 
908 
909 //-----------------------------------------------------------------------------
910 //
911 //  mergeRuleStatusVals
912 //
913 //      Update the global table of rule status {tag} values
914 //      The rule builder has a global vector of status values that are common
915 //      for all tables.  Merge the ones from this table into the global set.
916 //
917 //-----------------------------------------------------------------------------
mergeRuleStatusVals()918 void  RBBITableBuilder::mergeRuleStatusVals() {
919     //
920     //  The basic outline of what happens here is this...
921     //
922     //    for each state in this state table
923     //       if the status tag list for this state is in the global statuses list
924     //           record where and
925     //           continue with the next state
926     //       else
927     //           add the tag list for this state to the global list.
928     //
929     int i;
930     int n;
931 
932     // Pre-set a single tag of {0} into the table.
933     //   We will need this as a default, for rule sets with no explicit tagging.
934     if (fRB->fRuleStatusVals->size() == 0) {
935         fRB->fRuleStatusVals->addElement(1, *fStatus);  // Num of statuses in group
936         fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus);  //   and our single status of zero
937     }
938 
939     //    For each state
940     for (n=0; n<fDStates->size(); n++) {
941         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
942         UVector *thisStatesTagValues = sd->fTagVals;
943         if (thisStatesTagValues == NULL) {
944             // No tag values are explicitly associated with this state.
945             //   Set the default tag value.
946             sd->fTagsIdx = 0;
947             continue;
948         }
949 
950         // There are tag(s) associated with this state.
951         //   fTagsIdx will be the index into the global tag list for this state's tag values.
952         //   Initial value of -1 flags that we haven't got it set yet.
953         sd->fTagsIdx = -1;
954         int32_t  thisTagGroupStart = 0;   // indexes into the global rule status vals list
955         int32_t  nextTagGroupStart = 0;
956 
957         // Loop runs once per group of tags in the global list
958         while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
959             thisTagGroupStart = nextTagGroupStart;
960             nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
961             if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
962                 // The number of tags for this state is different from
963                 //    the number of tags in this group from the global list.
964                 //    Continue with the next group from the global list.
965                 continue;
966             }
967             // The lengths match, go ahead and compare the actual tag values
968             //    between this state and the group from the global list.
969             for (i=0; i<thisStatesTagValues->size(); i++) {
970                 if (thisStatesTagValues->elementAti(i) !=
971                     fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
972                     // Mismatch.
973                     break;
974                 }
975             }
976 
977             if (i == thisStatesTagValues->size()) {
978                 // We found a set of tag values in the global list that match
979                 //   those for this state.  Use them.
980                 sd->fTagsIdx = thisTagGroupStart;
981                 break;
982             }
983         }
984 
985         if (sd->fTagsIdx == -1) {
986             // No suitable entry in the global tag list already.  Add one
987             sd->fTagsIdx = fRB->fRuleStatusVals->size();
988             fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
989             for (i=0; i<thisStatesTagValues->size(); i++) {
990                 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
991             }
992         }
993     }
994 }
995 
996 
997 
998 
999 
1000 
1001 
1002 //-----------------------------------------------------------------------------
1003 //
1004 //  sortedAdd  Add a value to a vector of sorted values (ints).
1005 //             Do not replicate entries; if the value is already there, do not
1006 //                add a second one.
1007 //             Lazily create the vector if it does not already exist.
1008 //
1009 //-----------------------------------------------------------------------------
sortedAdd(UVector ** vector,int32_t val)1010 void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
1011     int32_t i;
1012 
1013     if (*vector == NULL) {
1014         *vector = new UVector(*fStatus);
1015     }
1016     if (*vector == NULL || U_FAILURE(*fStatus)) {
1017         return;
1018     }
1019     UVector *vec = *vector;
1020     int32_t  vSize = vec->size();
1021     for (i=0; i<vSize; i++) {
1022         int32_t valAtI = vec->elementAti(i);
1023         if (valAtI == val) {
1024             // The value is already in the vector.  Don't add it again.
1025             return;
1026         }
1027         if (valAtI > val) {
1028             break;
1029         }
1030     }
1031     vec->insertElementAt(val, i, *fStatus);
1032 }
1033 
1034 
1035 
1036 //-----------------------------------------------------------------------------
1037 //
1038 //  setAdd     Set operation on UVector
1039 //             dest = dest union source
1040 //             Elements may only appear once and must be sorted.
1041 //
1042 //-----------------------------------------------------------------------------
setAdd(UVector * dest,UVector * source)1043 void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
1044     int32_t destOriginalSize = dest->size();
1045     int32_t sourceSize       = source->size();
1046     int32_t di           = 0;
1047     MaybeStackArray<void *, 16> destArray, sourceArray;  // Handle small cases without malloc
1048     void **destPtr, **sourcePtr;
1049     void **destLim, **sourceLim;
1050 
1051     if (destOriginalSize > destArray.getCapacity()) {
1052         if (destArray.resize(destOriginalSize) == NULL) {
1053             return;
1054         }
1055     }
1056     destPtr = destArray.getAlias();
1057     destLim = destPtr + destOriginalSize;  // destArray.getArrayLimit()?
1058 
1059     if (sourceSize > sourceArray.getCapacity()) {
1060         if (sourceArray.resize(sourceSize) == NULL) {
1061             return;
1062         }
1063     }
1064     sourcePtr = sourceArray.getAlias();
1065     sourceLim = sourcePtr + sourceSize;  // sourceArray.getArrayLimit()?
1066 
1067     // Avoid multiple "get element" calls by getting the contents into arrays
1068     (void) dest->toArray(destPtr);
1069     (void) source->toArray(sourcePtr);
1070 
1071     dest->setSize(sourceSize+destOriginalSize, *fStatus);
1072 
1073     while (sourcePtr < sourceLim && destPtr < destLim) {
1074         if (*destPtr == *sourcePtr) {
1075             dest->setElementAt(*sourcePtr++, di++);
1076             destPtr++;
1077         }
1078         // This check is required for machines with segmented memory, like i5/OS.
1079         // Direct pointer comparison is not recommended.
1080         else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
1081             dest->setElementAt(*destPtr++, di++);
1082         }
1083         else { /* *sourcePtr < *destPtr */
1084             dest->setElementAt(*sourcePtr++, di++);
1085         }
1086     }
1087 
1088     // At most one of these two cleanup loops will execute
1089     while (destPtr < destLim) {
1090         dest->setElementAt(*destPtr++, di++);
1091     }
1092     while (sourcePtr < sourceLim) {
1093         dest->setElementAt(*sourcePtr++, di++);
1094     }
1095 
1096     dest->setSize(di, *fStatus);
1097 }
1098 
1099 
1100 
1101 //-----------------------------------------------------------------------------
1102 //
1103 //  setEqual    Set operation on UVector.
1104 //              Compare for equality.
1105 //              Elements must be sorted.
1106 //
1107 //-----------------------------------------------------------------------------
setEquals(UVector * a,UVector * b)1108 UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
1109     return a->equals(*b);
1110 }
1111 
1112 
1113 //-----------------------------------------------------------------------------
1114 //
1115 //  printPosSets   Debug function.  Dump Nullable, firstpos, lastpos and followpos
1116 //                 for each node in the tree.
1117 //
1118 //-----------------------------------------------------------------------------
1119 #ifdef RBBI_DEBUG
printPosSets(RBBINode * n)1120 void RBBITableBuilder::printPosSets(RBBINode *n) {
1121     if (n==NULL) {
1122         return;
1123     }
1124     printf("\n");
1125     RBBINode::printNodeHeader();
1126     RBBINode::printNode(n);
1127     RBBIDebugPrintf("         Nullable:  %s\n", n->fNullable?"TRUE":"FALSE");
1128 
1129     RBBIDebugPrintf("         firstpos:  ");
1130     printSet(n->fFirstPosSet);
1131 
1132     RBBIDebugPrintf("         lastpos:   ");
1133     printSet(n->fLastPosSet);
1134 
1135     RBBIDebugPrintf("         followpos: ");
1136     printSet(n->fFollowPos);
1137 
1138     printPosSets(n->fLeftChild);
1139     printPosSets(n->fRightChild);
1140 }
1141 #endif
1142 
1143 //
1144 //    findDuplCharClassFrom()
1145 //
findDuplCharClassFrom(IntPair * categories)1146 bool RBBITableBuilder::findDuplCharClassFrom(IntPair *categories) {
1147     int32_t numStates = fDStates->size();
1148     int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1149 
1150     for (; categories->first < numCols-1; categories->first++) {
1151         for (categories->second=categories->first+1; categories->second < numCols; categories->second++) {
1152             // Initialized to different values to prevent returning true if numStates = 0 (implies no duplicates).
1153             uint16_t table_base = 0;
1154             uint16_t table_dupl = 1;
1155             for (int32_t state=0; state<numStates; state++) {
1156                 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1157                 table_base = (uint16_t)sd->fDtran->elementAti(categories->first);
1158                 table_dupl = (uint16_t)sd->fDtran->elementAti(categories->second);
1159                 if (table_base != table_dupl) {
1160                     break;
1161                 }
1162             }
1163             if (table_base == table_dupl) {
1164                 return true;
1165             }
1166         }
1167     }
1168     return false;
1169 }
1170 
1171 
1172 //
1173 //    removeColumn()
1174 //
removeColumn(int32_t column)1175 void RBBITableBuilder::removeColumn(int32_t column) {
1176     int32_t numStates = fDStates->size();
1177     for (int32_t state=0; state<numStates; state++) {
1178         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1179         U_ASSERT(column < sd->fDtran->size());
1180         sd->fDtran->removeElementAt(column);
1181     }
1182 }
1183 
1184 /*
1185  * findDuplicateState
1186  */
findDuplicateState(IntPair * states)1187 bool RBBITableBuilder::findDuplicateState(IntPair *states) {
1188     int32_t numStates = fDStates->size();
1189     int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1190 
1191     for (; states->first<numStates-1; states->first++) {
1192         RBBIStateDescriptor *firstSD = (RBBIStateDescriptor *)fDStates->elementAt(states->first);
1193         for (states->second=states->first+1; states->second<numStates; states->second++) {
1194             RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(states->second);
1195             if (firstSD->fAccepting != duplSD->fAccepting ||
1196                 firstSD->fLookAhead != duplSD->fLookAhead ||
1197                 firstSD->fTagsIdx   != duplSD->fTagsIdx) {
1198                 continue;
1199             }
1200             bool rowsMatch = true;
1201             for (int32_t col=0; col < numCols; ++col) {
1202                 int32_t firstVal = firstSD->fDtran->elementAti(col);
1203                 int32_t duplVal = duplSD->fDtran->elementAti(col);
1204                 if (!((firstVal == duplVal) ||
1205                         ((firstVal == states->first || firstVal == states->second) &&
1206                         (duplVal  == states->first || duplVal  == states->second)))) {
1207                     rowsMatch = false;
1208                     break;
1209                 }
1210             }
1211             if (rowsMatch) {
1212                 return true;
1213             }
1214         }
1215     }
1216     return false;
1217 }
1218 
1219 
findDuplicateSafeState(IntPair * states)1220 bool RBBITableBuilder::findDuplicateSafeState(IntPair *states) {
1221     int32_t numStates = fSafeTable->size();
1222 
1223     for (; states->first<numStates-1; states->first++) {
1224         UnicodeString *firstRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->first));
1225         for (states->second=states->first+1; states->second<numStates; states->second++) {
1226             UnicodeString *duplRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->second));
1227             bool rowsMatch = true;
1228             int32_t numCols = firstRow->length();
1229             for (int32_t col=0; col < numCols; ++col) {
1230                 int32_t firstVal = firstRow->charAt(col);
1231                 int32_t duplVal = duplRow->charAt(col);
1232                 if (!((firstVal == duplVal) ||
1233                         ((firstVal == states->first || firstVal == states->second) &&
1234                         (duplVal  == states->first || duplVal  == states->second)))) {
1235                     rowsMatch = false;
1236                     break;
1237                 }
1238             }
1239             if (rowsMatch) {
1240                 return true;
1241             }
1242         }
1243     }
1244     return false;
1245 }
1246 
1247 
removeState(IntPair duplStates)1248 void RBBITableBuilder::removeState(IntPair duplStates) {
1249     const int32_t keepState = duplStates.first;
1250     const int32_t duplState = duplStates.second;
1251     U_ASSERT(keepState < duplState);
1252     U_ASSERT(duplState < fDStates->size());
1253 
1254     RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(duplState);
1255     fDStates->removeElementAt(duplState);
1256     delete duplSD;
1257 
1258     int32_t numStates = fDStates->size();
1259     int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1260     for (int32_t state=0; state<numStates; ++state) {
1261         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1262         for (int32_t col=0; col<numCols; col++) {
1263             int32_t existingVal = sd->fDtran->elementAti(col);
1264             int32_t newVal = existingVal;
1265             if (existingVal == duplState) {
1266                 newVal = keepState;
1267             } else if (existingVal > duplState) {
1268                 newVal = existingVal - 1;
1269             }
1270             sd->fDtran->setElementAt(newVal, col);
1271         }
1272     }
1273 }
1274 
removeSafeState(IntPair duplStates)1275 void RBBITableBuilder::removeSafeState(IntPair duplStates) {
1276     const int32_t keepState = duplStates.first;
1277     const int32_t duplState = duplStates.second;
1278     U_ASSERT(keepState < duplState);
1279     U_ASSERT(duplState < fSafeTable->size());
1280 
1281     fSafeTable->removeElementAt(duplState);   // Note that fSafeTable has a deleter function
1282                                               // and will auto-delete the removed element.
1283     int32_t numStates = fSafeTable->size();
1284     for (int32_t state=0; state<numStates; ++state) {
1285         UnicodeString *sd = (UnicodeString *)fSafeTable->elementAt(state);
1286         int32_t numCols = sd->length();
1287         for (int32_t col=0; col<numCols; col++) {
1288             int32_t existingVal = sd->charAt(col);
1289             int32_t newVal = existingVal;
1290             if (existingVal == duplState) {
1291                 newVal = keepState;
1292             } else if (existingVal > duplState) {
1293                 newVal = existingVal - 1;
1294             }
1295             sd->setCharAt(col, static_cast<char16_t>(newVal));
1296         }
1297     }
1298 }
1299 
1300 
1301 /*
1302  * RemoveDuplicateStates
1303  */
removeDuplicateStates()1304 int32_t RBBITableBuilder::removeDuplicateStates() {
1305     IntPair dupls = {3, 0};
1306     int32_t numStatesRemoved = 0;
1307 
1308     while (findDuplicateState(&dupls)) {
1309         // printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
1310         removeState(dupls);
1311         ++numStatesRemoved;
1312     }
1313     return numStatesRemoved;
1314 }
1315 
1316 
1317 //-----------------------------------------------------------------------------
1318 //
1319 //   getTableSize()    Calculate the size of the runtime form of this
1320 //                     state transition table.
1321 //
1322 //-----------------------------------------------------------------------------
getTableSize() const1323 int32_t  RBBITableBuilder::getTableSize() const {
1324     int32_t    size = 0;
1325     int32_t    numRows;
1326     int32_t    numCols;
1327     int32_t    rowSize;
1328 
1329     if (fTree == NULL) {
1330         return 0;
1331     }
1332 
1333     size    = offsetof(RBBIStateTable, fTableData);    // The header, with no rows to the table.
1334 
1335     numRows = fDStates->size();
1336     numCols = fRB->fSetBuilder->getNumCharCategories();
1337 
1338     rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
1339     size   += numRows * rowSize;
1340     return size;
1341 }
1342 
1343 
1344 //-----------------------------------------------------------------------------
1345 //
1346 //   exportTable()    export the state transition table in the format required
1347 //                    by the runtime engine.  getTableSize() bytes of memory
1348 //                    must be available at the output address "where".
1349 //
1350 //-----------------------------------------------------------------------------
exportTable(void * where)1351 void RBBITableBuilder::exportTable(void *where) {
1352     RBBIStateTable    *table = (RBBIStateTable *)where;
1353     uint32_t           state;
1354     int                col;
1355 
1356     if (U_FAILURE(*fStatus) || fTree == NULL) {
1357         return;
1358     }
1359 
1360     int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1361     if (catCount > 0x7fff ||
1362         fDStates->size() > 0x7fff) {
1363         *fStatus = U_BRK_INTERNAL_ERROR;
1364         return;
1365     }
1366 
1367     table->fRowLen    = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
1368     table->fNumStates = fDStates->size();
1369     table->fFlags     = 0;
1370     if (fRB->fLookAheadHardBreak) {
1371         table->fFlags  |= RBBI_LOOKAHEAD_HARD_BREAK;
1372     }
1373     if (fRB->fSetBuilder->sawBOF()) {
1374         table->fFlags  |= RBBI_BOF_REQUIRED;
1375     }
1376     table->fReserved  = 0;
1377 
1378     for (state=0; state<table->fNumStates; state++) {
1379         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1380         RBBIStateTableRow   *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1381         U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767);
1382         U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
1383         row->fAccepting = (int16_t)sd->fAccepting;
1384         row->fLookAhead = (int16_t)sd->fLookAhead;
1385         row->fTagIdx    = (int16_t)sd->fTagsIdx;
1386         for (col=0; col<catCount; col++) {
1387             row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
1388         }
1389     }
1390 }
1391 
1392 
1393 /**
1394  *   Synthesize a safe state table from the main state table.
1395  */
buildSafeReverseTable(UErrorCode & status)1396 void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) {
1397     // The safe table creation has three steps:
1398 
1399     // 1. Identifiy pairs of character classes that are "safe." Safe means that boundaries
1400     // following the pair do not depend on context or state before the pair. To test
1401     // whether a pair is safe, run it through the main forward state table, starting
1402     // from each state. If the the final state is the same, no matter what the starting state,
1403     // the pair is safe.
1404     //
1405     // 2. Build a state table that recognizes the safe pairs. It's similar to their
1406     // forward table, with a column for each input character [class], and a row for
1407     // each state. Row 1 is the start state, and row 0 is the stop state. Initially
1408     // create an additional state for each input character category; being in
1409     // one of these states means that the character has been seen, and is potentially
1410     // the first of a pair. In each of these rows, the entry for the second character
1411     // of a safe pair is set to the stop state (0), indicating that a match was found.
1412     // All other table entries are set to the state corresponding the current input
1413     // character, allowing that charcter to be the of a start following pair.
1414     //
1415     // Because the safe rules are to be run in reverse, moving backwards in the text,
1416     // the first and second pair categories are swapped when building the table.
1417     //
1418     // 3. Compress the table. There are typically many rows (states) that are
1419     // equivalent - that have zeroes (match completed) in the same columns -
1420     // and can be folded together.
1421 
1422     // Each safe pair is stored as two UChars in the safePair string.
1423     UnicodeString safePairs;
1424 
1425     int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories();
1426     int32_t numStates = fDStates->size();
1427 
1428     for (int32_t c1=0; c1<numCharClasses; ++c1) {
1429         for (int32_t c2=0; c2 < numCharClasses; ++c2) {
1430             int32_t wantedEndState = -1;
1431             int32_t endState = 0;
1432             for (int32_t startState = 1; startState < numStates; ++startState) {
1433                 RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState));
1434                 int32_t s2 = startStateD->fDtran->elementAti(c1);
1435                 RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2));
1436                 endState = s2StateD->fDtran->elementAti(c2);
1437                 if (wantedEndState < 0) {
1438                     wantedEndState = endState;
1439                 } else {
1440                     if (wantedEndState != endState) {
1441                         break;
1442                     }
1443                 }
1444             }
1445             if (wantedEndState == endState) {
1446                 safePairs.append((char16_t)c1);
1447                 safePairs.append((char16_t)c2);
1448                 // printf("(%d, %d) ", c1, c2);
1449             }
1450         }
1451         // printf("\n");
1452     }
1453 
1454     // Populate the initial safe table.
1455     // The table as a whole is UVector<UnicodeString>
1456     // Each row is represented by a UnicodeString, being used as a Vector<int16>.
1457     // Row 0 is the stop state.
1458     // Row 1 is the start sate.
1459     // Row 2 and beyond are other states, initially one per char class, but
1460     //   after initial construction, many of the states will be combined, compacting the table.
1461     // The String holds the nextState data only. The four leading fields of a row, fAccepting,
1462     // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
1463 
1464     U_ASSERT(fSafeTable == nullptr);
1465     fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status);
1466     for (int32_t row=0; row<numCharClasses + 2; ++row) {
1467         fSafeTable->addElement(new UnicodeString(numCharClasses, 0, numCharClasses+4), status);
1468     }
1469 
1470     // From the start state, each input char class transitions to the state for that input.
1471     UnicodeString &startState = *static_cast<UnicodeString *>(fSafeTable->elementAt(1));
1472     for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
1473         // Note: +2 for the start & stop state.
1474         startState.setCharAt(charClass, static_cast<char16_t>(charClass+2));
1475     }
1476 
1477     // Initially make every other state table row look like the start state row,
1478     for (int32_t row=2; row<numCharClasses+2; ++row) {
1479         UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(row));
1480         rowState = startState;   // UnicodeString assignment, copies contents.
1481     }
1482 
1483     // Run through the safe pairs, set the next state to zero when pair has been seen.
1484     // Zero being the stop state, meaning we found a safe point.
1485     for (int32_t pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) {
1486         int32_t c1 = safePairs.charAt(pairIdx);
1487         int32_t c2 = safePairs.charAt(pairIdx + 1);
1488 
1489         UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(c2 + 2));
1490         rowState.setCharAt(c1, 0);
1491     }
1492 
1493     // Remove duplicate or redundant rows from the table.
1494     IntPair states = {1, 0};
1495     while (findDuplicateSafeState(&states)) {
1496         // printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second);
1497         removeSafeState(states);
1498     }
1499 }
1500 
1501 
1502 //-----------------------------------------------------------------------------
1503 //
1504 //   getSafeTableSize()    Calculate the size of the runtime form of this
1505 //                         safe state table.
1506 //
1507 //-----------------------------------------------------------------------------
getSafeTableSize() const1508 int32_t  RBBITableBuilder::getSafeTableSize() const {
1509     int32_t    size = 0;
1510     int32_t    numRows;
1511     int32_t    numCols;
1512     int32_t    rowSize;
1513 
1514     if (fSafeTable == nullptr) {
1515         return 0;
1516     }
1517 
1518     size    = offsetof(RBBIStateTable, fTableData);    // The header, with no rows to the table.
1519 
1520     numRows = fSafeTable->size();
1521     numCols = fRB->fSetBuilder->getNumCharCategories();
1522 
1523     rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
1524     size   += numRows * rowSize;
1525     return size;
1526 }
1527 
1528 
1529 //-----------------------------------------------------------------------------
1530 //
1531 //   exportSafeTable()   export the state transition table in the format required
1532 //                       by the runtime engine.  getTableSize() bytes of memory
1533 //                       must be available at the output address "where".
1534 //
1535 //-----------------------------------------------------------------------------
exportSafeTable(void * where)1536 void RBBITableBuilder::exportSafeTable(void *where) {
1537     RBBIStateTable    *table = (RBBIStateTable *)where;
1538     uint32_t           state;
1539     int                col;
1540 
1541     if (U_FAILURE(*fStatus) || fSafeTable == nullptr) {
1542         return;
1543     }
1544 
1545     int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1546     if (catCount > 0x7fff ||
1547             fSafeTable->size() > 0x7fff) {
1548         *fStatus = U_BRK_INTERNAL_ERROR;
1549         return;
1550     }
1551 
1552     table->fRowLen    = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
1553     table->fNumStates = fSafeTable->size();
1554     table->fFlags     = 0;
1555     table->fReserved  = 0;
1556 
1557     for (state=0; state<table->fNumStates; state++) {
1558         UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state);
1559         RBBIStateTableRow   *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1560         row->fAccepting = 0;
1561         row->fLookAhead = 0;
1562         row->fTagIdx    = 0;
1563         row->fReserved  = 0;
1564         for (col=0; col<catCount; col++) {
1565             row->fNextState[col] = rowString->charAt(col);
1566         }
1567     }
1568 }
1569 
1570 
1571 
1572 
1573 //-----------------------------------------------------------------------------
1574 //
1575 //   printSet    Debug function.   Print the contents of a UVector
1576 //
1577 //-----------------------------------------------------------------------------
1578 #ifdef RBBI_DEBUG
printSet(UVector * s)1579 void RBBITableBuilder::printSet(UVector *s) {
1580     int32_t  i;
1581     for (i=0; i<s->size(); i++) {
1582         const RBBINode *v = static_cast<const RBBINode *>(s->elementAt(i));
1583         RBBIDebugPrintf("%5d", v==NULL? -1 : v->fSerialNum);
1584     }
1585     RBBIDebugPrintf("\n");
1586 }
1587 #endif
1588 
1589 
1590 //-----------------------------------------------------------------------------
1591 //
1592 //   printStates    Debug Function.  Dump the fully constructed state transition table.
1593 //
1594 //-----------------------------------------------------------------------------
1595 #ifdef RBBI_DEBUG
printStates()1596 void RBBITableBuilder::printStates() {
1597     int     c;    // input "character"
1598     int     n;    // state number
1599 
1600     RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
1601     RBBIDebugPrintf("      | Acc  LA    Tag");
1602     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1603         RBBIDebugPrintf(" %2d", c);
1604     }
1605     RBBIDebugPrintf("\n");
1606     RBBIDebugPrintf("      |---------------");
1607     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1608         RBBIDebugPrintf("---");
1609     }
1610     RBBIDebugPrintf("\n");
1611 
1612     for (n=0; n<fDStates->size(); n++) {
1613         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
1614         RBBIDebugPrintf("  %3d | " , n);
1615         RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
1616         for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1617             RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c));
1618         }
1619         RBBIDebugPrintf("\n");
1620     }
1621     RBBIDebugPrintf("\n\n");
1622 }
1623 #endif
1624 
1625 
1626 //-----------------------------------------------------------------------------
1627 //
1628 //   printSafeTable    Debug Function.  Dump the fully constructed safe table.
1629 //
1630 //-----------------------------------------------------------------------------
1631 #ifdef RBBI_DEBUG
printReverseTable()1632 void RBBITableBuilder::printReverseTable() {
1633     int     c;    // input "character"
1634     int     n;    // state number
1635 
1636     RBBIDebugPrintf("    Safe Reverse Table \n");
1637     if (fSafeTable == nullptr) {
1638         RBBIDebugPrintf("   --- nullptr ---\n");
1639         return;
1640     }
1641     RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
1642     RBBIDebugPrintf("      | Acc  LA    Tag");
1643     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1644         RBBIDebugPrintf(" %2d", c);
1645     }
1646     RBBIDebugPrintf("\n");
1647     RBBIDebugPrintf("      |---------------");
1648     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1649         RBBIDebugPrintf("---");
1650     }
1651     RBBIDebugPrintf("\n");
1652 
1653     for (n=0; n<fSafeTable->size(); n++) {
1654         UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(n);
1655         RBBIDebugPrintf("  %3d | " , n);
1656         RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0);  // Accepting, LookAhead, Tags
1657         for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1658             RBBIDebugPrintf(" %2d", rowString->charAt(c));
1659         }
1660         RBBIDebugPrintf("\n");
1661     }
1662     RBBIDebugPrintf("\n\n");
1663 }
1664 #endif
1665 
1666 
1667 
1668 //-----------------------------------------------------------------------------
1669 //
1670 //   printRuleStatusTable    Debug Function.  Dump the common rule status table
1671 //
1672 //-----------------------------------------------------------------------------
1673 #ifdef RBBI_DEBUG
printRuleStatusTable()1674 void RBBITableBuilder::printRuleStatusTable() {
1675     int32_t  thisRecord = 0;
1676     int32_t  nextRecord = 0;
1677     int      i;
1678     UVector  *tbl = fRB->fRuleStatusVals;
1679 
1680     RBBIDebugPrintf("index |  tags \n");
1681     RBBIDebugPrintf("-------------------\n");
1682 
1683     while (nextRecord < tbl->size()) {
1684         thisRecord = nextRecord;
1685         nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
1686         RBBIDebugPrintf("%4d   ", thisRecord);
1687         for (i=thisRecord+1; i<nextRecord; i++) {
1688             RBBIDebugPrintf("  %5d", tbl->elementAti(i));
1689         }
1690         RBBIDebugPrintf("\n");
1691     }
1692     RBBIDebugPrintf("\n\n");
1693 }
1694 #endif
1695 
1696 
1697 //-----------------------------------------------------------------------------
1698 //
1699 //   RBBIStateDescriptor     Methods.  This is a very struct-like class
1700 //                           Most access is directly to the fields.
1701 //
1702 //-----------------------------------------------------------------------------
1703 
RBBIStateDescriptor(int lastInputSymbol,UErrorCode * fStatus)1704 RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
1705     fMarked    = FALSE;
1706     fAccepting = 0;
1707     fLookAhead = 0;
1708     fTagsIdx   = 0;
1709     fTagVals   = NULL;
1710     fPositions = NULL;
1711     fDtran     = NULL;
1712 
1713     fDtran     = new UVector32(lastInputSymbol+1, *fStatus);
1714     if (U_FAILURE(*fStatus)) {
1715         return;
1716     }
1717     if (fDtran == NULL) {
1718         *fStatus = U_MEMORY_ALLOCATION_ERROR;
1719         return;
1720     }
1721     fDtran->setSize(lastInputSymbol+1);    // fDtran needs to be pre-sized.
1722                                            //   It is indexed by input symbols, and will
1723                                            //   hold  the next state number for each
1724                                            //   symbol.
1725 }
1726 
1727 
~RBBIStateDescriptor()1728 RBBIStateDescriptor::~RBBIStateDescriptor() {
1729     delete       fPositions;
1730     delete       fDtran;
1731     delete       fTagVals;
1732     fPositions = NULL;
1733     fDtran     = NULL;
1734     fTagVals   = NULL;
1735 }
1736 
1737 U_NAMESPACE_END
1738 
1739 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1740