• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 *   Copyright (c) 2002-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 */
9 //
10 //  rbbitblb.cpp
11 //
12 
13 
14 #include "unicode/utypes.h"
15 
16 #if !UCONFIG_NO_BREAK_ITERATION
17 
18 #include "unicode/unistr.h"
19 #include "rbbitblb.h"
20 #include "rbbirb.h"
21 #include "rbbiscan.h"
22 #include "rbbisetb.h"
23 #include "rbbidata.h"
24 #include "cstring.h"
25 #include "uassert.h"
26 #include "uvectr32.h"
27 #include "cmemory.h"
28 
29 U_NAMESPACE_BEGIN
30 
// Presumably the largest state number representable in an 8-bit state
// table; its use is not visible in this part of the file - TODO confirm.
constexpr int32_t kMaxStateFor8BitsTable = 255;
32 
RBBITableBuilder(RBBIRuleBuilder * rb,RBBINode ** rootNode,UErrorCode & status)33 RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) :
34         fRB(rb),
35         fTree(*rootNode),
36         fStatus(&status),
37         fDStates(nullptr),
38         fSafeTable(nullptr) {
39     if (U_FAILURE(status)) {
40         return;
41     }
42     // fDStates is UVector<RBBIStateDescriptor *>
43     fDStates = new UVector(status);
44     if (U_SUCCESS(status) && fDStates == nullptr ) {
45         status = U_MEMORY_ALLOCATION_ERROR;
46     }
47 }
48 
49 
50 
~RBBITableBuilder()51 RBBITableBuilder::~RBBITableBuilder() {
52     int i;
53     for (i=0; i<fDStates->size(); i++) {
54         delete (RBBIStateDescriptor *)fDStates->elementAt(i);
55     }
56     delete fDStates;
57     delete fSafeTable;
58     delete fLookAheadRuleMap;
59 }
60 
61 
62 //-----------------------------------------------------------------------------
63 //
64 //   RBBITableBuilder::buildForwardTable  -  This is the main function for building
65 //                               the DFA state transition table from the RBBI rules parse tree.
66 //
67 //-----------------------------------------------------------------------------
buildForwardTable()68 void  RBBITableBuilder::buildForwardTable() {
69 
70     if (U_FAILURE(*fStatus)) {
71         return;
72     }
73 
74     // If there were no rules, just return.  This situation can easily arise
75     //   for the reverse rules.
76     if (fTree==NULL) {
77         return;
78     }
79 
80     //
81     // Walk through the tree, replacing any references to $variables with a copy of the
82     //   parse tree for the substitution expression.
83     //
84     fTree = fTree->flattenVariables();
85 #ifdef RBBI_DEBUG
86     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
87         RBBIDebugPuts("\nParse tree after flattening variable references.");
88         RBBINode::printTree(fTree, TRUE);
89     }
90 #endif
91 
92     //
93     // If the rules contained any references to {bof}
94     //   add a {bof} <cat> <former root of tree> to the
95     //   tree.  Means that all matches must start out with the
96     //   {bof} fake character.
97     //
98     if (fRB->fSetBuilder->sawBOF()) {
99         RBBINode *bofTop    = new RBBINode(RBBINode::opCat);
100         RBBINode *bofLeaf   = new RBBINode(RBBINode::leafChar);
101         // Delete and exit if memory allocation failed.
102         if (bofTop == NULL || bofLeaf == NULL) {
103             *fStatus = U_MEMORY_ALLOCATION_ERROR;
104             delete bofTop;
105             delete bofLeaf;
106             return;
107         }
108         bofTop->fLeftChild  = bofLeaf;
109         bofTop->fRightChild = fTree;
110         bofLeaf->fParent    = bofTop;
111         bofLeaf->fVal       = 2;      // Reserved value for {bof}.
112         fTree               = bofTop;
113     }
114 
115     //
116     // Add a unique right-end marker to the expression.
117     //   Appears as a cat-node, left child being the original tree,
118     //   right child being the end marker.
119     //
120     RBBINode *cn = new RBBINode(RBBINode::opCat);
121     // Exit if memory allocation failed.
122     if (cn == NULL) {
123         *fStatus = U_MEMORY_ALLOCATION_ERROR;
124         return;
125     }
126     cn->fLeftChild = fTree;
127     fTree->fParent = cn;
128     RBBINode *endMarkerNode = cn->fRightChild = new RBBINode(RBBINode::endMark);
129     // Delete and exit if memory allocation failed.
130     if (cn->fRightChild == NULL) {
131         *fStatus = U_MEMORY_ALLOCATION_ERROR;
132         delete cn;
133         return;
134     }
135     cn->fRightChild->fParent = cn;
136     fTree = cn;
137 
138     //
139     //  Replace all references to UnicodeSets with the tree for the equivalent
140     //      expression.
141     //
142     fTree->flattenSets();
143 #ifdef RBBI_DEBUG
144     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
145         RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
146         RBBINode::printTree(fTree, TRUE);
147     }
148 #endif
149 
150 
151     //
152     // calculate the functions nullable, firstpos, lastpos and followpos on
153     // nodes in the parse tree.
154     //    See the algorithm description in Aho.
155     //    Understanding how this works by looking at the code alone will be
156     //       nearly impossible.
157     //
158     calcNullable(fTree);
159     calcFirstPos(fTree);
160     calcLastPos(fTree);
161     calcFollowPos(fTree);
162     if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
163         RBBIDebugPuts("\n");
164         printPosSets(fTree);
165     }
166 
167     //
168     //  For "chained" rules, modify the followPos sets
169     //
170     if (fRB->fChainRules) {
171         calcChainedFollowPos(fTree, endMarkerNode);
172     }
173 
174     //
175     //  BOF (start of input) test fixup.
176     //
177     if (fRB->fSetBuilder->sawBOF()) {
178         bofFixup();
179     }
180 
181     //
182     // Build the DFA state transition tables.
183     //
184     buildStateTable();
185     mapLookAheadRules();
186     flagAcceptingStates();
187     flagLookAheadStates();
188     flagTaggedStates();
189 
190     //
191     // Update the global table of rule status {tag} values
192     // The rule builder has a global vector of status values that are common
193     //    for all tables.  Merge the ones from this table into the global set.
194     //
195     mergeRuleStatusVals();
196 }
197 
198 
199 
200 //-----------------------------------------------------------------------------
201 //
202 //   calcNullable.    Impossible to explain succinctly.  See Aho, section 3.9
203 //
204 //-----------------------------------------------------------------------------
calcNullable(RBBINode * n)205 void RBBITableBuilder::calcNullable(RBBINode *n) {
206     if (n == NULL) {
207         return;
208     }
209     if (n->fType == RBBINode::setRef ||
210         n->fType == RBBINode::endMark ) {
211         // These are non-empty leaf node types.
212         n->fNullable = FALSE;
213         return;
214     }
215 
216     if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
217         // Lookahead marker node.  It's a leaf, so no recursion on children.
218         // It's nullable because it does not match any literal text from the input stream.
219         n->fNullable = TRUE;
220         return;
221     }
222 
223 
224     // The node is not a leaf.
225     //  Calculate nullable on its children.
226     calcNullable(n->fLeftChild);
227     calcNullable(n->fRightChild);
228 
229     // Apply functions from table 3.40 in Aho
230     if (n->fType == RBBINode::opOr) {
231         n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
232     }
233     else if (n->fType == RBBINode::opCat) {
234         n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
235     }
236     else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
237         n->fNullable = TRUE;
238     }
239     else {
240         n->fNullable = FALSE;
241     }
242 }
243 
244 
245 
246 
247 //-----------------------------------------------------------------------------
248 //
249 //   calcFirstPos.    Impossible to explain succinctly.  See Aho, section 3.9
250 //
251 //-----------------------------------------------------------------------------
calcFirstPos(RBBINode * n)252 void RBBITableBuilder::calcFirstPos(RBBINode *n) {
253     if (n == NULL) {
254         return;
255     }
256     if (n->fType == RBBINode::leafChar  ||
257         n->fType == RBBINode::endMark   ||
258         n->fType == RBBINode::lookAhead ||
259         n->fType == RBBINode::tag) {
260         // These are non-empty leaf node types.
261         // Note: In order to maintain the sort invariant on the set,
262         // this function should only be called on a node whose set is
263         // empty to start with.
264         n->fFirstPosSet->addElement(n, *fStatus);
265         return;
266     }
267 
268     // The node is not a leaf.
269     //  Calculate firstPos on its children.
270     calcFirstPos(n->fLeftChild);
271     calcFirstPos(n->fRightChild);
272 
273     // Apply functions from table 3.40 in Aho
274     if (n->fType == RBBINode::opOr) {
275         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
276         setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
277     }
278     else if (n->fType == RBBINode::opCat) {
279         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
280         if (n->fLeftChild->fNullable) {
281             setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
282         }
283     }
284     else if (n->fType == RBBINode::opStar ||
285              n->fType == RBBINode::opQuestion ||
286              n->fType == RBBINode::opPlus) {
287         setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
288     }
289 }
290 
291 
292 
293 //-----------------------------------------------------------------------------
294 //
295 //   calcLastPos.    Impossible to explain succinctly.  See Aho, section 3.9
296 //
297 //-----------------------------------------------------------------------------
calcLastPos(RBBINode * n)298 void RBBITableBuilder::calcLastPos(RBBINode *n) {
299     if (n == NULL) {
300         return;
301     }
302     if (n->fType == RBBINode::leafChar  ||
303         n->fType == RBBINode::endMark   ||
304         n->fType == RBBINode::lookAhead ||
305         n->fType == RBBINode::tag) {
306         // These are non-empty leaf node types.
307         // Note: In order to maintain the sort invariant on the set,
308         // this function should only be called on a node whose set is
309         // empty to start with.
310         n->fLastPosSet->addElement(n, *fStatus);
311         return;
312     }
313 
314     // The node is not a leaf.
315     //  Calculate lastPos on its children.
316     calcLastPos(n->fLeftChild);
317     calcLastPos(n->fRightChild);
318 
319     // Apply functions from table 3.40 in Aho
320     if (n->fType == RBBINode::opOr) {
321         setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
322         setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
323     }
324     else if (n->fType == RBBINode::opCat) {
325         setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
326         if (n->fRightChild->fNullable) {
327             setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
328         }
329     }
330     else if (n->fType == RBBINode::opStar     ||
331              n->fType == RBBINode::opQuestion ||
332              n->fType == RBBINode::opPlus) {
333         setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
334     }
335 }
336 
337 
338 
339 //-----------------------------------------------------------------------------
340 //
341 //   calcFollowPos.    Impossible to explain succinctly.  See Aho, section 3.9
342 //
343 //-----------------------------------------------------------------------------
calcFollowPos(RBBINode * n)344 void RBBITableBuilder::calcFollowPos(RBBINode *n) {
345     if (n == NULL ||
346         n->fType == RBBINode::leafChar ||
347         n->fType == RBBINode::endMark) {
348         return;
349     }
350 
351     calcFollowPos(n->fLeftChild);
352     calcFollowPos(n->fRightChild);
353 
354     // Aho rule #1
355     if (n->fType == RBBINode::opCat) {
356         RBBINode *i;   // is 'i' in Aho's description
357         uint32_t     ix;
358 
359         UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
360 
361         for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
362             i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
363             setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
364         }
365     }
366 
367     // Aho rule #2
368     if (n->fType == RBBINode::opStar ||
369         n->fType == RBBINode::opPlus) {
370         RBBINode   *i;  // again, n and i are the names from Aho's description.
371         uint32_t    ix;
372 
373         for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
374             i = (RBBINode *)n->fLastPosSet->elementAt(ix);
375             setAdd(i->fFollowPos, n->fFirstPosSet);
376         }
377     }
378 
379 
380 
381 }
382 
383 //-----------------------------------------------------------------------------
384 //
385 //    addRuleRootNodes    Recursively walk a parse tree, adding all nodes flagged
386 //                        as roots of a rule to a destination vector.
387 //
388 //-----------------------------------------------------------------------------
addRuleRootNodes(UVector * dest,RBBINode * node)389 void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) {
390     if (node == NULL || U_FAILURE(*fStatus)) {
391         return;
392     }
393     U_ASSERT(!dest->hasDeleter());
394     if (node->fRuleRoot) {
395         dest->addElement(node, *fStatus);
396         // Note: rules cannot nest. If we found a rule start node,
397         //       no child node can also be a start node.
398         return;
399     }
400     addRuleRootNodes(dest, node->fLeftChild);
401     addRuleRootNodes(dest, node->fRightChild);
402 }
403 
404 //-----------------------------------------------------------------------------
405 //
406 //   calcChainedFollowPos.    Modify the previously calculated followPos sets
407 //                            to implement rule chaining.  NOT described by Aho
408 //
409 //-----------------------------------------------------------------------------
calcChainedFollowPos(RBBINode * tree,RBBINode * endMarkNode)410 void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree, RBBINode *endMarkNode) {
411 
412     UVector         leafNodes(*fStatus);
413     if (U_FAILURE(*fStatus)) {
414         return;
415     }
416 
417     // get a list all leaf nodes
418     tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
419     if (U_FAILURE(*fStatus)) {
420         return;
421     }
422 
423     // Collect all leaf nodes that can start matches for rules
424     // with inbound chaining enabled, which is the union of the
425     // firstPosition sets from each of the rule root nodes.
426 
427     UVector ruleRootNodes(*fStatus);
428     addRuleRootNodes(&ruleRootNodes, tree);
429 
430     UVector matchStartNodes(*fStatus);
431     for (int j=0; j<ruleRootNodes.size(); ++j) {
432         RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(j));
433         if (node->fChainIn) {
434             setAdd(&matchStartNodes, node->fFirstPosSet);
435         }
436     }
437     if (U_FAILURE(*fStatus)) {
438         return;
439     }
440 
441     int32_t  endNodeIx;
442     int32_t  startNodeIx;
443 
444     for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
445         RBBINode *endNode   = (RBBINode *)leafNodes.elementAt(endNodeIx);
446 
447         // Identify leaf nodes that correspond to overall rule match positions.
448         // These include the endMarkNode in their followPos sets.
449         //
450         // Note: do not consider other end marker nodes, those that are added to
451         //       look-ahead rules. These can't chain; a match immediately stops
452         //       further matching. This leaves exactly one end marker node, the one
453         //       at the end of the complete tree.
454 
455         if (!endNode->fFollowPos->contains(endMarkNode)) {
456             continue;
457         }
458 
459         // We've got a node that can end a match.
460 
461         // !!LBCMNoChain implementation:  If this node's val correspond to
462         // the Line Break $CM char class, don't chain from it.
463         // TODO:  Remove this. !!LBCMNoChain is deprecated, and is not used
464         //        by any of the standard ICU rules.
465         if (fRB->fLBCMNoChain) {
466             UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
467             if (c != -1) {
468                 // c == -1 occurs with sets containing only the {eof} marker string.
469                 ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
470                 if (cLBProp == U_LB_COMBINING_MARK) {
471                     continue;
472                 }
473             }
474         }
475 
476         // Now iterate over the nodes that can start a match, looking for ones
477         //   with the same char class as our ending node.
478         RBBINode *startNode;
479         for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) {
480             startNode = (RBBINode *)matchStartNodes.elementAt(startNodeIx);
481             if (startNode->fType != RBBINode::leafChar) {
482                 continue;
483             }
484 
485             if (endNode->fVal == startNode->fVal) {
486                 // The end val (character class) of one possible match is the
487                 //   same as the start of another.
488 
489                 // Add all nodes from the followPos of the start node to the
490                 //  followPos set of the end node, which will have the effect of
491                 //  letting matches transition from a match state at endNode
492                 //  to the second char of a match starting with startNode.
493                 setAdd(endNode->fFollowPos, startNode->fFollowPos);
494             }
495         }
496     }
497 }
498 
499 
500 //-----------------------------------------------------------------------------
501 //
502 //   bofFixup.    Fixup for state tables that include {bof} beginning of input testing.
503 //                Do a swizzle similar to chaining, modifying the followPos set of
504 //                the bofNode to include the followPos nodes from other {bof} nodes
505 //                scattered through the tree.
506 //
507 //                This function has much in common with calcChainedFollowPos().
508 //
509 //-----------------------------------------------------------------------------
bofFixup()510 void RBBITableBuilder::bofFixup() {
511 
512     if (U_FAILURE(*fStatus)) {
513         return;
514     }
515 
516     //   The parse tree looks like this ...
517     //         fTree root  --->       <cat>
518     //                               /     \       .
519     //                            <cat>   <#end node>
520     //                           /     \  .
521     //                     <bofNode>   rest
522     //                               of tree
523     //
524     //    We will be adding things to the followPos set of the <bofNode>
525     //
526     RBBINode  *bofNode = fTree->fLeftChild->fLeftChild;
527     U_ASSERT(bofNode->fType == RBBINode::leafChar);
528     U_ASSERT(bofNode->fVal == 2);
529 
530     // Get all nodes that can be the start a match of the user-written rules
531     //  (excluding the fake bofNode)
532     //  We want the nodes that can start a match in the
533     //     part labeled "rest of tree"
534     //
535     UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
536 
537     RBBINode *startNode;
538     int       startNodeIx;
539     for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
540         startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
541         if (startNode->fType != RBBINode::leafChar) {
542             continue;
543         }
544 
545         if (startNode->fVal == bofNode->fVal) {
546             //  We found a leaf node corresponding to a {bof} that was
547             //    explicitly written into a rule.
548             //  Add everything from the followPos set of this node to the
549             //    followPos set of the fake bofNode at the start of the tree.
550             //
551             setAdd(bofNode->fFollowPos, startNode->fFollowPos);
552         }
553     }
554 }
555 
556 //-----------------------------------------------------------------------------
557 //
558 //   buildStateTable()    Determine the set of runtime DFA states and the
559 //                        transition tables for these states, by the algorithm
560 //                        of fig. 3.44 in Aho.
561 //
562 //                        Most of the comments are quotes of Aho's pseudo-code.
563 //
564 //-----------------------------------------------------------------------------
buildStateTable()565 void RBBITableBuilder::buildStateTable() {
566     if (U_FAILURE(*fStatus)) {
567         return;
568     }
569     RBBIStateDescriptor *failState;
570     // Set it to NULL to avoid uninitialized warning
571     RBBIStateDescriptor *initialState = NULL;
572     //
573     // Add a dummy state 0 - the stop state.  Not from Aho.
574     int      lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
575     failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
576     if (failState == NULL) {
577         *fStatus = U_MEMORY_ALLOCATION_ERROR;
578         goto ExitBuildSTdeleteall;
579     }
580     failState->fPositions = new UVector(*fStatus);
581     if (failState->fPositions == NULL) {
582         *fStatus = U_MEMORY_ALLOCATION_ERROR;
583     }
584     if (failState->fPositions == NULL || U_FAILURE(*fStatus)) {
585         goto ExitBuildSTdeleteall;
586     }
587     fDStates->addElement(failState, *fStatus);
588     if (U_FAILURE(*fStatus)) {
589         goto ExitBuildSTdeleteall;
590     }
591 
592     // initially, the only unmarked state in Dstates is firstpos(root),
593     //       where toot is the root of the syntax tree for (r)#;
594     initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
595     if (initialState == NULL) {
596         *fStatus = U_MEMORY_ALLOCATION_ERROR;
597     }
598     if (U_FAILURE(*fStatus)) {
599         goto ExitBuildSTdeleteall;
600     }
601     initialState->fPositions = new UVector(*fStatus);
602     if (initialState->fPositions == NULL) {
603         *fStatus = U_MEMORY_ALLOCATION_ERROR;
604     }
605     if (U_FAILURE(*fStatus)) {
606         goto ExitBuildSTdeleteall;
607     }
608     setAdd(initialState->fPositions, fTree->fFirstPosSet);
609     fDStates->addElement(initialState, *fStatus);
610     if (U_FAILURE(*fStatus)) {
611         goto ExitBuildSTdeleteall;
612     }
613 
614     // while there is an unmarked state T in Dstates do begin
615     for (;;) {
616         RBBIStateDescriptor *T = NULL;
617         int32_t              tx;
618         for (tx=1; tx<fDStates->size(); tx++) {
619             RBBIStateDescriptor *temp;
620             temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
621             if (temp->fMarked == FALSE) {
622                 T = temp;
623                 break;
624             }
625         }
626         if (T == NULL) {
627             break;
628         }
629 
630         // mark T;
631         T->fMarked = TRUE;
632 
633         // for each input symbol a do begin
634         int32_t  a;
635         for (a = 1; a<=lastInputSymbol; a++) {
636             // let U be the set of positions that are in followpos(p)
637             //    for some position p in T
638             //    such that the symbol at position p is a;
639             UVector    *U = NULL;
640             RBBINode   *p;
641             int32_t     px;
642             for (px=0; px<T->fPositions->size(); px++) {
643                 p = (RBBINode *)T->fPositions->elementAt(px);
644                 if ((p->fType == RBBINode::leafChar) &&  (p->fVal == a)) {
645                     if (U == NULL) {
646                         U = new UVector(*fStatus);
647                         if (U == NULL) {
648                         	*fStatus = U_MEMORY_ALLOCATION_ERROR;
649                         	goto ExitBuildSTdeleteall;
650                         }
651                     }
652                     setAdd(U, p->fFollowPos);
653                 }
654             }
655 
656             // if U is not empty and not in DStates then
657             int32_t  ux = 0;
658             UBool    UinDstates = FALSE;
659             if (U != NULL) {
660                 U_ASSERT(U->size() > 0);
661                 int  ix;
662                 for (ix=0; ix<fDStates->size(); ix++) {
663                     RBBIStateDescriptor *temp2;
664                     temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
665                     if (setEquals(U, temp2->fPositions)) {
666                         delete U;
667                         U  = temp2->fPositions;
668                         ux = ix;
669                         UinDstates = TRUE;
670                         break;
671                     }
672                 }
673 
674                 // Add U as an unmarked state to Dstates
675                 if (!UinDstates)
676                 {
677                     RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
678                     if (newState == NULL) {
679                     	*fStatus = U_MEMORY_ALLOCATION_ERROR;
680                     }
681                     if (U_FAILURE(*fStatus)) {
682                         goto ExitBuildSTdeleteall;
683                     }
684                     newState->fPositions = U;
685                     fDStates->addElement(newState, *fStatus);
686                     if (U_FAILURE(*fStatus)) {
687                         return;
688                     }
689                     ux = fDStates->size()-1;
690                 }
691 
692                 // Dtran[T, a] := U;
693                 T->fDtran->setElementAt(ux, a);
694             }
695         }
696     }
697     return;
698     // delete local pointers only if error occurred.
699 ExitBuildSTdeleteall:
700     delete initialState;
701     delete failState;
702 }
703 
704 
705 /**
706  * mapLookAheadRules
707  *
708  */
mapLookAheadRules()709 void RBBITableBuilder::mapLookAheadRules() {
710     fLookAheadRuleMap =  new UVector32(fRB->fScanner->numRules() + 1, *fStatus);
711     if (fLookAheadRuleMap == nullptr) {
712         *fStatus = U_MEMORY_ALLOCATION_ERROR;
713     }
714     if (U_FAILURE(*fStatus)) {
715         return;
716     }
717     fLookAheadRuleMap->setSize(fRB->fScanner->numRules() + 1);
718 
719     for (int32_t n=0; n<fDStates->size(); n++) {
720         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
721         int32_t laSlotForState = 0;
722 
723         // Establish the look-ahead slot for this state, if the state covers
724         // any look-ahead nodes - corresponding to the '/' in look-ahead rules.
725 
726         // If any of the look-ahead nodes already have a slot assigned, use it,
727         // otherwise assign a new one.
728 
729         bool sawLookAheadNode = false;
730         for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) {
731             RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos));
732             if (node->fType != RBBINode::NodeType::lookAhead) {
733                 continue;
734             }
735             sawLookAheadNode = true;
736             int32_t ruleNum = node->fVal;     // Set when rule was originally parsed.
737             U_ASSERT(ruleNum < fLookAheadRuleMap->size());
738             U_ASSERT(ruleNum > 0);
739             int32_t laSlot = fLookAheadRuleMap->elementAti(ruleNum);
740             if (laSlot != 0) {
741                 if (laSlotForState == 0) {
742                     laSlotForState = laSlot;
743                 } else {
744                     // TODO: figure out if this can fail, change to setting an error code if so.
745                     U_ASSERT(laSlot == laSlotForState);
746                 }
747             }
748         }
749         if (!sawLookAheadNode) {
750             continue;
751         }
752 
753         if (laSlotForState == 0) {
754             laSlotForState = ++fLASlotsInUse;
755         }
756 
757         // For each look ahead node covered by this state,
758         // set the mapping from the node's rule number to the look ahead slot.
759         // There can be multiple nodes/rule numbers going to the same la slot.
760 
761         for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) {
762             RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos));
763             if (node->fType != RBBINode::NodeType::lookAhead) {
764                 continue;
765             }
766             int32_t ruleNum = node->fVal;     // Set when rule was originally parsed.
767             int32_t existingVal = fLookAheadRuleMap->elementAti(ruleNum);
768             (void)existingVal;
769             U_ASSERT(existingVal == 0 || existingVal == laSlotForState);
770             fLookAheadRuleMap->setElementAt(laSlotForState, ruleNum);
771         }
772     }
773 
774 }
775 
776 //-----------------------------------------------------------------------------
777 //
778 //   flagAcceptingStates    Identify accepting states.
779 //                          First get a list of all of the end marker nodes.
780 //                          Then, for each state s,
781 //                              if s contains one of the end marker nodes in its list of tree positions then
782 //                                  s is an accepting state.
783 //
784 //-----------------------------------------------------------------------------
flagAcceptingStates()785 void     RBBITableBuilder::flagAcceptingStates() {
786     if (U_FAILURE(*fStatus)) {
787         return;
788     }
789     UVector     endMarkerNodes(*fStatus);
790     RBBINode    *endMarker;
791     int32_t     i;
792     int32_t     n;
793 
794     if (U_FAILURE(*fStatus)) {
795         return;
796     }
797 
798     fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
799     if (U_FAILURE(*fStatus)) {
800         return;
801     }
802 
803     for (i=0; i<endMarkerNodes.size(); i++) {
804         endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
805         for (n=0; n<fDStates->size(); n++) {
806             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
807             if (sd->fPositions->indexOf(endMarker) >= 0) {
808                 // Any non-zero value for fAccepting means this is an accepting node.
809                 // The value is what will be returned to the user as the break status.
810                 // If no other value was specified, force it to ACCEPTING_UNCONDITIONAL (1).
811 
812                 if (sd->fAccepting==0) {
813                     // State hasn't been marked as accepting yet.  Do it now.
814                     sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
815                     if (sd->fAccepting == 0) {
816                         sd->fAccepting = ACCEPTING_UNCONDITIONAL;
817                     }
818                 }
819                 if (sd->fAccepting==ACCEPTING_UNCONDITIONAL && endMarker->fVal != 0) {
820                     // Both lookahead and non-lookahead accepting for this state.
821                     // Favor the look-ahead, because a look-ahead match needs to
822                     // immediately stop the run-time engine. First match, not longest.
823                     sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
824                 }
825                 // implicit else:
826                 // if sd->fAccepting already had a value other than 0 or 1, leave it be.
827             }
828         }
829     }
830 }
831 
832 
833 //-----------------------------------------------------------------------------
834 //
835 //    flagLookAheadStates   Very similar to flagAcceptingStates, above.
836 //
837 //-----------------------------------------------------------------------------
flagLookAheadStates()838 void     RBBITableBuilder::flagLookAheadStates() {
839     if (U_FAILURE(*fStatus)) {
840         return;
841     }
842     UVector     lookAheadNodes(*fStatus);
843     RBBINode    *lookAheadNode;
844     int32_t     i;
845     int32_t     n;
846 
847     fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
848     if (U_FAILURE(*fStatus)) {
849         return;
850     }
851     for (i=0; i<lookAheadNodes.size(); i++) {
852         lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
853         U_ASSERT(lookAheadNode->fType == RBBINode::NodeType::lookAhead);
854 
855         for (n=0; n<fDStates->size(); n++) {
856             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
857             int32_t positionsIdx = sd->fPositions->indexOf(lookAheadNode);
858             if (positionsIdx >= 0) {
859                 U_ASSERT(lookAheadNode == sd->fPositions->elementAt(positionsIdx));
860                 uint32_t lookaheadSlot = fLookAheadRuleMap->elementAti(lookAheadNode->fVal);
861                 U_ASSERT(sd->fLookAhead == 0 || sd->fLookAhead == lookaheadSlot);
862                 // if (sd->fLookAhead != 0 && sd->fLookAhead != lookaheadSlot) {
863                 //     printf("%s:%d Bingo. sd->fLookAhead:%d   lookaheadSlot:%d\n",
864                 //            __FILE__, __LINE__, sd->fLookAhead, lookaheadSlot);
865                 // }
866                 sd->fLookAhead = lookaheadSlot;
867             }
868         }
869     }
870 }
871 
872 
873 
874 
875 //-----------------------------------------------------------------------------
876 //
877 //    flagTaggedStates
878 //
879 //-----------------------------------------------------------------------------
flagTaggedStates()880 void     RBBITableBuilder::flagTaggedStates() {
881     if (U_FAILURE(*fStatus)) {
882         return;
883     }
884     UVector     tagNodes(*fStatus);
885     RBBINode    *tagNode;
886     int32_t     i;
887     int32_t     n;
888 
889     if (U_FAILURE(*fStatus)) {
890         return;
891     }
892     fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
893     if (U_FAILURE(*fStatus)) {
894         return;
895     }
896     for (i=0; i<tagNodes.size(); i++) {                   // For each tag node t (all of 'em)
897         tagNode = (RBBINode *)tagNodes.elementAt(i);
898 
899         for (n=0; n<fDStates->size(); n++) {              //    For each state  s (row in the state table)
900             RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
901             if (sd->fPositions->indexOf(tagNode) >= 0) {  //       if  s include the tag node t
902                 sortedAdd(&sd->fTagVals, tagNode->fVal);
903             }
904         }
905     }
906 }
907 
908 
909 
910 
911 //-----------------------------------------------------------------------------
912 //
913 //  mergeRuleStatusVals
914 //
915 //      Update the global table of rule status {tag} values
916 //      The rule builder has a global vector of status values that are common
917 //      for all tables.  Merge the ones from this table into the global set.
918 //
919 //-----------------------------------------------------------------------------
mergeRuleStatusVals()920 void  RBBITableBuilder::mergeRuleStatusVals() {
921     //
922     //  The basic outline of what happens here is this...
923     //
924     //    for each state in this state table
925     //       if the status tag list for this state is in the global statuses list
926     //           record where and
927     //           continue with the next state
928     //       else
929     //           add the tag list for this state to the global list.
930     //
931     int i;
932     int n;
933 
934     // Pre-set a single tag of {0} into the table.
935     //   We will need this as a default, for rule sets with no explicit tagging.
936     if (fRB->fRuleStatusVals->size() == 0) {
937         fRB->fRuleStatusVals->addElement(1, *fStatus);  // Num of statuses in group
938         fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus);  //   and our single status of zero
939     }
940 
941     //    For each state
942     for (n=0; n<fDStates->size(); n++) {
943         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
944         UVector *thisStatesTagValues = sd->fTagVals;
945         if (thisStatesTagValues == NULL) {
946             // No tag values are explicitly associated with this state.
947             //   Set the default tag value.
948             sd->fTagsIdx = 0;
949             continue;
950         }
951 
952         // There are tag(s) associated with this state.
953         //   fTagsIdx will be the index into the global tag list for this state's tag values.
954         //   Initial value of -1 flags that we haven't got it set yet.
955         sd->fTagsIdx = -1;
956         int32_t  thisTagGroupStart = 0;   // indexes into the global rule status vals list
957         int32_t  nextTagGroupStart = 0;
958 
959         // Loop runs once per group of tags in the global list
960         while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
961             thisTagGroupStart = nextTagGroupStart;
962             nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
963             if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
964                 // The number of tags for this state is different from
965                 //    the number of tags in this group from the global list.
966                 //    Continue with the next group from the global list.
967                 continue;
968             }
969             // The lengths match, go ahead and compare the actual tag values
970             //    between this state and the group from the global list.
971             for (i=0; i<thisStatesTagValues->size(); i++) {
972                 if (thisStatesTagValues->elementAti(i) !=
973                     fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
974                     // Mismatch.
975                     break;
976                 }
977             }
978 
979             if (i == thisStatesTagValues->size()) {
980                 // We found a set of tag values in the global list that match
981                 //   those for this state.  Use them.
982                 sd->fTagsIdx = thisTagGroupStart;
983                 break;
984             }
985         }
986 
987         if (sd->fTagsIdx == -1) {
988             // No suitable entry in the global tag list already.  Add one
989             sd->fTagsIdx = fRB->fRuleStatusVals->size();
990             fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
991             for (i=0; i<thisStatesTagValues->size(); i++) {
992                 fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
993             }
994         }
995     }
996 }
997 
998 
999 
1000 
1001 
1002 
1003 
1004 //-----------------------------------------------------------------------------
1005 //
1006 //  sortedAdd  Add a value to a vector of sorted values (ints).
1007 //             Do not replicate entries; if the value is already there, do not
1008 //                add a second one.
1009 //             Lazily create the vector if it does not already exist.
1010 //
1011 //-----------------------------------------------------------------------------
sortedAdd(UVector ** vector,int32_t val)1012 void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
1013     int32_t i;
1014 
1015     if (*vector == NULL) {
1016         *vector = new UVector(*fStatus);
1017     }
1018     if (*vector == NULL || U_FAILURE(*fStatus)) {
1019         return;
1020     }
1021     UVector *vec = *vector;
1022     int32_t  vSize = vec->size();
1023     for (i=0; i<vSize; i++) {
1024         int32_t valAtI = vec->elementAti(i);
1025         if (valAtI == val) {
1026             // The value is already in the vector.  Don't add it again.
1027             return;
1028         }
1029         if (valAtI > val) {
1030             break;
1031         }
1032     }
1033     vec->insertElementAt(val, i, *fStatus);
1034 }
1035 
1036 
1037 
1038 //-----------------------------------------------------------------------------
1039 //
1040 //  setAdd     Set operation on UVector
1041 //             dest = dest union source
1042 //             Elements may only appear once and must be sorted.
1043 //
1044 //-----------------------------------------------------------------------------
setAdd(UVector * dest,UVector * source)1045 void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
1046     U_ASSERT(!dest->hasDeleter());
1047     U_ASSERT(!source->hasDeleter());
1048     int32_t destOriginalSize = dest->size();
1049     int32_t sourceSize       = source->size();
1050     int32_t di           = 0;
1051     MaybeStackArray<void *, 16> destArray, sourceArray;  // Handle small cases without malloc
1052     void **destPtr, **sourcePtr;
1053     void **destLim, **sourceLim;
1054 
1055     if (destOriginalSize > destArray.getCapacity()) {
1056         if (destArray.resize(destOriginalSize) == NULL) {
1057             return;
1058         }
1059     }
1060     destPtr = destArray.getAlias();
1061     destLim = destPtr + destOriginalSize;  // destArray.getArrayLimit()?
1062 
1063     if (sourceSize > sourceArray.getCapacity()) {
1064         if (sourceArray.resize(sourceSize) == NULL) {
1065             return;
1066         }
1067     }
1068     sourcePtr = sourceArray.getAlias();
1069     sourceLim = sourcePtr + sourceSize;  // sourceArray.getArrayLimit()?
1070 
1071     // Avoid multiple "get element" calls by getting the contents into arrays
1072     (void) dest->toArray(destPtr);
1073     (void) source->toArray(sourcePtr);
1074 
1075     dest->setSize(sourceSize+destOriginalSize, *fStatus);
1076     if (U_FAILURE(*fStatus)) {
1077         return;
1078     }
1079 
1080     while (sourcePtr < sourceLim && destPtr < destLim) {
1081         if (*destPtr == *sourcePtr) {
1082             dest->setElementAt(*sourcePtr++, di++);
1083             destPtr++;
1084         }
1085         // This check is required for machines with segmented memory, like i5/OS.
1086         // Direct pointer comparison is not recommended.
1087         else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
1088             dest->setElementAt(*destPtr++, di++);
1089         }
1090         else { /* *sourcePtr < *destPtr */
1091             dest->setElementAt(*sourcePtr++, di++);
1092         }
1093     }
1094 
1095     // At most one of these two cleanup loops will execute
1096     while (destPtr < destLim) {
1097         dest->setElementAt(*destPtr++, di++);
1098     }
1099     while (sourcePtr < sourceLim) {
1100         dest->setElementAt(*sourcePtr++, di++);
1101     }
1102 
1103     dest->setSize(di, *fStatus);
1104 }
1105 
1106 
1107 
//-----------------------------------------------------------------------------
//
//  setEquals   Set operation on UVector.
//              Compare for equality.
//              Elements must be sorted.
//
//-----------------------------------------------------------------------------
setEquals(UVector * a,UVector * b)1115 UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
1116     return a->equals(*b);
1117 }
1118 
1119 
1120 //-----------------------------------------------------------------------------
1121 //
1122 //  printPosSets   Debug function.  Dump Nullable, firstpos, lastpos and followpos
1123 //                 for each node in the tree.
1124 //
1125 //-----------------------------------------------------------------------------
#ifdef RBBI_DEBUG
// Debug-only.  Prints node n, its nullable flag and its firstpos, lastpos and
// followpos sets, then recurses into the left and right children.
// A null node prints nothing.
void RBBITableBuilder::printPosSets(RBBINode *n) {
    if (n != nullptr) {
        printf("\n");
        RBBINode::printNodeHeader();
        RBBINode::printNode(n);

        const char *nullableText = n->fNullable ? "TRUE" : "FALSE";
        RBBIDebugPrintf("         Nullable:  %s\n", nullableText);

        RBBIDebugPrintf("         firstpos:  ");
        printSet(n->fFirstPosSet);

        RBBIDebugPrintf("         lastpos:   ");
        printSet(n->fLastPosSet);

        RBBIDebugPrintf("         followpos: ");
        printSet(n->fFollowPos);

        // Left subtree first, then right.
        printPosSets(n->fLeftChild);
        printPosSets(n->fRightChild);
    }
}
#endif
1149 
1150 //
1151 //    findDuplCharClassFrom()
1152 //
findDuplCharClassFrom(IntPair * categories)1153 bool RBBITableBuilder::findDuplCharClassFrom(IntPair *categories) {
1154     int32_t numStates = fDStates->size();
1155     int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1156 
1157     for (; categories->first < numCols-1; categories->first++) {
1158         // Note: dictionary & non-dictionary columns cannot be merged.
1159         //       The limitSecond value prevents considering mixed pairs.
1160         //       Dictionary categories are >= DictCategoriesStart.
1161         //       Non dict categories are   <  DictCategoriesStart.
1162         int limitSecond = categories->first < fRB->fSetBuilder->getDictCategoriesStart() ?
1163             fRB->fSetBuilder->getDictCategoriesStart() : numCols;
1164         for (categories->second=categories->first+1; categories->second < limitSecond; categories->second++) {
1165             // Initialized to different values to prevent returning true if numStates = 0 (implies no duplicates).
1166             uint16_t table_base = 0;
1167             uint16_t table_dupl = 1;
1168             for (int32_t state=0; state<numStates; state++) {
1169                 RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1170                 table_base = (uint16_t)sd->fDtran->elementAti(categories->first);
1171                 table_dupl = (uint16_t)sd->fDtran->elementAti(categories->second);
1172                 if (table_base != table_dupl) {
1173                     break;
1174                 }
1175             }
1176             if (table_base == table_dupl) {
1177                 return true;
1178             }
1179         }
1180     }
1181     return false;
1182 }
1183 
1184 
1185 //
1186 //    removeColumn()
1187 //
removeColumn(int32_t column)1188 void RBBITableBuilder::removeColumn(int32_t column) {
1189     int32_t numStates = fDStates->size();
1190     for (int32_t state=0; state<numStates; state++) {
1191         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1192         U_ASSERT(column < sd->fDtran->size());
1193         sd->fDtran->removeElementAt(column);
1194     }
1195 }
1196 
1197 /*
1198  * findDuplicateState
1199  */
findDuplicateState(IntPair * states)1200 bool RBBITableBuilder::findDuplicateState(IntPair *states) {
1201     int32_t numStates = fDStates->size();
1202     int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1203 
1204     for (; states->first<numStates-1; states->first++) {
1205         RBBIStateDescriptor *firstSD = (RBBIStateDescriptor *)fDStates->elementAt(states->first);
1206         for (states->second=states->first+1; states->second<numStates; states->second++) {
1207             RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(states->second);
1208             if (firstSD->fAccepting != duplSD->fAccepting ||
1209                 firstSD->fLookAhead != duplSD->fLookAhead ||
1210                 firstSD->fTagsIdx   != duplSD->fTagsIdx) {
1211                 continue;
1212             }
1213             bool rowsMatch = true;
1214             for (int32_t col=0; col < numCols; ++col) {
1215                 int32_t firstVal = firstSD->fDtran->elementAti(col);
1216                 int32_t duplVal = duplSD->fDtran->elementAti(col);
1217                 if (!((firstVal == duplVal) ||
1218                         ((firstVal == states->first || firstVal == states->second) &&
1219                         (duplVal  == states->first || duplVal  == states->second)))) {
1220                     rowsMatch = false;
1221                     break;
1222                 }
1223             }
1224             if (rowsMatch) {
1225                 return true;
1226             }
1227         }
1228     }
1229     return false;
1230 }
1231 
1232 
findDuplicateSafeState(IntPair * states)1233 bool RBBITableBuilder::findDuplicateSafeState(IntPair *states) {
1234     int32_t numStates = fSafeTable->size();
1235 
1236     for (; states->first<numStates-1; states->first++) {
1237         UnicodeString *firstRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->first));
1238         for (states->second=states->first+1; states->second<numStates; states->second++) {
1239             UnicodeString *duplRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->second));
1240             bool rowsMatch = true;
1241             int32_t numCols = firstRow->length();
1242             for (int32_t col=0; col < numCols; ++col) {
1243                 int32_t firstVal = firstRow->charAt(col);
1244                 int32_t duplVal = duplRow->charAt(col);
1245                 if (!((firstVal == duplVal) ||
1246                         ((firstVal == states->first || firstVal == states->second) &&
1247                         (duplVal  == states->first || duplVal  == states->second)))) {
1248                     rowsMatch = false;
1249                     break;
1250                 }
1251             }
1252             if (rowsMatch) {
1253                 return true;
1254             }
1255         }
1256     }
1257     return false;
1258 }
1259 
1260 
removeState(IntPair duplStates)1261 void RBBITableBuilder::removeState(IntPair duplStates) {
1262     const int32_t keepState = duplStates.first;
1263     const int32_t duplState = duplStates.second;
1264     U_ASSERT(keepState < duplState);
1265     U_ASSERT(duplState < fDStates->size());
1266 
1267     RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(duplState);
1268     fDStates->removeElementAt(duplState);
1269     delete duplSD;
1270 
1271     int32_t numStates = fDStates->size();
1272     int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1273     for (int32_t state=0; state<numStates; ++state) {
1274         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1275         for (int32_t col=0; col<numCols; col++) {
1276             int32_t existingVal = sd->fDtran->elementAti(col);
1277             int32_t newVal = existingVal;
1278             if (existingVal == duplState) {
1279                 newVal = keepState;
1280             } else if (existingVal > duplState) {
1281                 newVal = existingVal - 1;
1282             }
1283             sd->fDtran->setElementAt(newVal, col);
1284         }
1285     }
1286 }
1287 
removeSafeState(IntPair duplStates)1288 void RBBITableBuilder::removeSafeState(IntPair duplStates) {
1289     const int32_t keepState = duplStates.first;
1290     const int32_t duplState = duplStates.second;
1291     U_ASSERT(keepState < duplState);
1292     U_ASSERT(duplState < fSafeTable->size());
1293 
1294     fSafeTable->removeElementAt(duplState);   // Note that fSafeTable has a deleter function
1295                                               // and will auto-delete the removed element.
1296     int32_t numStates = fSafeTable->size();
1297     for (int32_t state=0; state<numStates; ++state) {
1298         UnicodeString *sd = (UnicodeString *)fSafeTable->elementAt(state);
1299         int32_t numCols = sd->length();
1300         for (int32_t col=0; col<numCols; col++) {
1301             int32_t existingVal = sd->charAt(col);
1302             int32_t newVal = existingVal;
1303             if (existingVal == duplState) {
1304                 newVal = keepState;
1305             } else if (existingVal > duplState) {
1306                 newVal = existingVal - 1;
1307             }
1308             sd->setCharAt(col, static_cast<char16_t>(newVal));
1309         }
1310     }
1311 }
1312 
1313 
1314 /*
1315  * RemoveDuplicateStates
1316  */
removeDuplicateStates()1317 int32_t RBBITableBuilder::removeDuplicateStates() {
1318     IntPair dupls = {3, 0};
1319     int32_t numStatesRemoved = 0;
1320 
1321     while (findDuplicateState(&dupls)) {
1322         // printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
1323         removeState(dupls);
1324         ++numStatesRemoved;
1325     }
1326     return numStatesRemoved;
1327 }
1328 
1329 
1330 //-----------------------------------------------------------------------------
1331 //
1332 //   getTableSize()    Calculate the size of the runtime form of this
1333 //                     state transition table.
1334 //
1335 //-----------------------------------------------------------------------------
getTableSize() const1336 int32_t  RBBITableBuilder::getTableSize() const {
1337     int32_t    size = 0;
1338     int32_t    numRows;
1339     int32_t    numCols;
1340     int32_t    rowSize;
1341 
1342     if (fTree == NULL) {
1343         return 0;
1344     }
1345 
1346     size    = offsetof(RBBIStateTable, fTableData);    // The header, with no rows to the table.
1347 
1348     numRows = fDStates->size();
1349     numCols = fRB->fSetBuilder->getNumCharCategories();
1350 
1351     if (use8BitsForTable()) {
1352         rowSize = offsetof(RBBIStateTableRow8, fNextState) + sizeof(int8_t)*numCols;
1353     } else {
1354         rowSize = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t)*numCols;
1355     }
1356     size   += numRows * rowSize;
1357     return size;
1358 }
1359 
use8BitsForTable() const1360 bool RBBITableBuilder::use8BitsForTable() const {
1361     return fDStates->size() <= kMaxStateFor8BitsTable;
1362 }
1363 
1364 //-----------------------------------------------------------------------------
1365 //
1366 //   exportTable()    export the state transition table in the format required
1367 //                    by the runtime engine.  getTableSize() bytes of memory
1368 //                    must be available at the output address "where".
1369 //
1370 //-----------------------------------------------------------------------------
exportTable(void * where)1371 void RBBITableBuilder::exportTable(void *where) {
1372     RBBIStateTable    *table = (RBBIStateTable *)where;
1373     uint32_t           state;
1374     int                col;
1375 
1376     if (U_FAILURE(*fStatus) || fTree == NULL) {
1377         return;
1378     }
1379 
1380     int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1381     if (catCount > 0x7fff ||
1382         fDStates->size() > 0x7fff) {
1383         *fStatus = U_BRK_INTERNAL_ERROR;
1384         return;
1385     }
1386 
1387     table->fNumStates = fDStates->size();
1388     table->fDictCategoriesStart = fRB->fSetBuilder->getDictCategoriesStart();
1389     table->fLookAheadResultsSize = fLASlotsInUse == ACCEPTING_UNCONDITIONAL ? 0 : fLASlotsInUse + 1;
1390     table->fFlags     = 0;
1391     if (use8BitsForTable()) {
1392         table->fRowLen    = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount;
1393         table->fFlags  |= RBBI_8BITS_ROWS;
1394     } else {
1395         table->fRowLen    = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t) * catCount;
1396     }
1397     if (fRB->fLookAheadHardBreak) {
1398         table->fFlags  |= RBBI_LOOKAHEAD_HARD_BREAK;
1399     }
1400     if (fRB->fSetBuilder->sawBOF()) {
1401         table->fFlags  |= RBBI_BOF_REQUIRED;
1402     }
1403 
1404     for (state=0; state<table->fNumStates; state++) {
1405         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1406         RBBIStateTableRow   *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1407         if (use8BitsForTable()) {
1408             U_ASSERT (sd->fAccepting <= 255);
1409             U_ASSERT (sd->fLookAhead <= 255);
1410             U_ASSERT (0 <= sd->fTagsIdx && sd->fTagsIdx <= 255);
1411             RBBIStateTableRow8 *r8 = (RBBIStateTableRow8*)row;
1412             r8->fAccepting = sd->fAccepting;
1413             r8->fLookAhead = sd->fLookAhead;
1414             r8->fTagsIdx   = sd->fTagsIdx;
1415             for (col=0; col<catCount; col++) {
1416                 U_ASSERT (sd->fDtran->elementAti(col) <= kMaxStateFor8BitsTable);
1417                 r8->fNextState[col] = sd->fDtran->elementAti(col);
1418             }
1419         } else {
1420             U_ASSERT (sd->fAccepting <= 0xffff);
1421             U_ASSERT (sd->fLookAhead <= 0xffff);
1422             U_ASSERT (0 <= sd->fTagsIdx && sd->fTagsIdx <= 0xffff);
1423             row->r16.fAccepting = sd->fAccepting;
1424             row->r16.fLookAhead = sd->fLookAhead;
1425             row->r16.fTagsIdx   = sd->fTagsIdx;
1426             for (col=0; col<catCount; col++) {
1427                 row->r16.fNextState[col] = sd->fDtran->elementAti(col);
1428             }
1429         }
1430     }
1431 }
1432 
1433 
1434 /**
1435  *   Synthesize a safe state table from the main state table.
1436  */
buildSafeReverseTable(UErrorCode & status)1437 void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) {
1438     // The safe table creation has three steps:
1439 
1440     // 1. Identify pairs of character classes that are "safe." Safe means that boundaries
1441     // following the pair do not depend on context or state before the pair. To test
1442     // whether a pair is safe, run it through the main forward state table, starting
1443     // from each state. If the the final state is the same, no matter what the starting state,
1444     // the pair is safe.
1445     //
1446     // 2. Build a state table that recognizes the safe pairs. It's similar to their
1447     // forward table, with a column for each input character [class], and a row for
1448     // each state. Row 1 is the start state, and row 0 is the stop state. Initially
1449     // create an additional state for each input character category; being in
1450     // one of these states means that the character has been seen, and is potentially
1451     // the first of a pair. In each of these rows, the entry for the second character
1452     // of a safe pair is set to the stop state (0), indicating that a match was found.
1453     // All other table entries are set to the state corresponding the current input
1454     // character, allowing that character to be the of a start following pair.
1455     //
1456     // Because the safe rules are to be run in reverse, moving backwards in the text,
1457     // the first and second pair categories are swapped when building the table.
1458     //
1459     // 3. Compress the table. There are typically many rows (states) that are
1460     // equivalent - that have zeroes (match completed) in the same columns -
1461     // and can be folded together.
1462 
1463     // Each safe pair is stored as two UChars in the safePair string.
1464     UnicodeString safePairs;
1465 
1466     int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories();
1467     int32_t numStates = fDStates->size();
1468 
1469     for (int32_t c1=0; c1<numCharClasses; ++c1) {
1470         for (int32_t c2=0; c2 < numCharClasses; ++c2) {
1471             int32_t wantedEndState = -1;
1472             int32_t endState = 0;
1473             for (int32_t startState = 1; startState < numStates; ++startState) {
1474                 RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState));
1475                 int32_t s2 = startStateD->fDtran->elementAti(c1);
1476                 RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2));
1477                 endState = s2StateD->fDtran->elementAti(c2);
1478                 if (wantedEndState < 0) {
1479                     wantedEndState = endState;
1480                 } else {
1481                     if (wantedEndState != endState) {
1482                         break;
1483                     }
1484                 }
1485             }
1486             if (wantedEndState == endState) {
1487                 safePairs.append((char16_t)c1);
1488                 safePairs.append((char16_t)c2);
1489                 // printf("(%d, %d) ", c1, c2);
1490             }
1491         }
1492         // printf("\n");
1493     }
1494 
1495     // Populate the initial safe table.
1496     // The table as a whole is UVector<UnicodeString>
1497     // Each row is represented by a UnicodeString, being used as a Vector<int16>.
1498     // Row 0 is the stop state.
1499     // Row 1 is the start state.
1500     // Row 2 and beyond are other states, initially one per char class, but
1501     //   after initial construction, many of the states will be combined, compacting the table.
1502     // The String holds the nextState data only. The four leading fields of a row, fAccepting,
1503     // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
1504 
1505     U_ASSERT(fSafeTable == nullptr);
1506     LocalPointer<UVector> lpSafeTable(
1507         new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status), status);
1508     if (U_FAILURE(status)) {
1509         return;
1510     }
1511     fSafeTable = lpSafeTable.orphan();
1512     for (int32_t row=0; row<numCharClasses + 2; ++row) {
1513         LocalPointer<UnicodeString> lpString(new UnicodeString(numCharClasses, 0, numCharClasses+4), status);
1514         fSafeTable->adoptElement(lpString.orphan(), status);
1515     }
1516     if (U_FAILURE(status)) {
1517         return;
1518     }
1519 
1520     // From the start state, each input char class transitions to the state for that input.
1521     UnicodeString &startState = *static_cast<UnicodeString *>(fSafeTable->elementAt(1));
1522     for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
1523         // Note: +2 for the start & stop state.
1524         startState.setCharAt(charClass, static_cast<char16_t>(charClass+2));
1525     }
1526 
1527     // Initially make every other state table row look like the start state row,
1528     for (int32_t row=2; row<numCharClasses+2; ++row) {
1529         UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(row));
1530         rowState = startState;   // UnicodeString assignment, copies contents.
1531     }
1532 
1533     // Run through the safe pairs, set the next state to zero when pair has been seen.
1534     // Zero being the stop state, meaning we found a safe point.
1535     for (int32_t pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) {
1536         int32_t c1 = safePairs.charAt(pairIdx);
1537         int32_t c2 = safePairs.charAt(pairIdx + 1);
1538 
1539         UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(c2 + 2));
1540         rowState.setCharAt(c1, 0);
1541     }
1542 
1543     // Remove duplicate or redundant rows from the table.
1544     IntPair states = {1, 0};
1545     while (findDuplicateSafeState(&states)) {
1546         // printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second);
1547         removeSafeState(states);
1548     }
1549 }
1550 
1551 
1552 //-----------------------------------------------------------------------------
1553 //
1554 //   getSafeTableSize()    Calculate the size of the runtime form of this
1555 //                         safe state table.
1556 //
1557 //-----------------------------------------------------------------------------
getSafeTableSize() const1558 int32_t  RBBITableBuilder::getSafeTableSize() const {
1559     int32_t    size = 0;
1560     int32_t    numRows;
1561     int32_t    numCols;
1562     int32_t    rowSize;
1563 
1564     if (fSafeTable == nullptr) {
1565         return 0;
1566     }
1567 
1568     size    = offsetof(RBBIStateTable, fTableData);    // The header, with no rows to the table.
1569 
1570     numRows = fSafeTable->size();
1571     numCols = fRB->fSetBuilder->getNumCharCategories();
1572 
1573     if (use8BitsForSafeTable()) {
1574         rowSize = offsetof(RBBIStateTableRow8, fNextState) + sizeof(int8_t)*numCols;
1575     } else {
1576         rowSize = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t)*numCols;
1577     }
1578     size   += numRows * rowSize;
1579     return size;
1580 }
1581 
use8BitsForSafeTable() const1582 bool RBBITableBuilder::use8BitsForSafeTable() const {
1583     return fSafeTable->size() <= kMaxStateFor8BitsTable;
1584 }
1585 
1586 //-----------------------------------------------------------------------------
1587 //
1588 //   exportSafeTable()   export the state transition table in the format required
1589 //                       by the runtime engine.  getSafeTableSize() bytes of memory
1590 //                       must be available at the output address "where".
1591 //
1592 //-----------------------------------------------------------------------------
exportSafeTable(void * where)1593 void RBBITableBuilder::exportSafeTable(void *where) {
1594     RBBIStateTable    *table = (RBBIStateTable *)where;
1595     uint32_t           state;
1596     int                col;
1597 
1598     if (U_FAILURE(*fStatus) || fSafeTable == nullptr) {
1599         return;
1600     }
1601 
1602     int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1603     if (catCount > 0x7fff ||
1604             fSafeTable->size() > 0x7fff) {
1605         *fStatus = U_BRK_INTERNAL_ERROR;
1606         return;
1607     }
1608 
1609     table->fNumStates = fSafeTable->size();
1610     table->fFlags     = 0;
1611     if (use8BitsForSafeTable()) {
1612         table->fRowLen    = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount;
1613         table->fFlags  |= RBBI_8BITS_ROWS;
1614     } else {
1615         table->fRowLen    = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t) * catCount;
1616     }
1617 
1618     for (state=0; state<table->fNumStates; state++) {
1619         UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state);
1620         RBBIStateTableRow   *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1621         if (use8BitsForSafeTable()) {
1622             RBBIStateTableRow8 *r8 = (RBBIStateTableRow8*)row;
1623             r8->fAccepting = 0;
1624             r8->fLookAhead = 0;
1625             r8->fTagsIdx    = 0;
1626             for (col=0; col<catCount; col++) {
1627                 U_ASSERT(rowString->charAt(col) <= kMaxStateFor8BitsTable);
1628                 r8->fNextState[col] = static_cast<uint8_t>(rowString->charAt(col));
1629             }
1630         } else {
1631             row->r16.fAccepting = 0;
1632             row->r16.fLookAhead = 0;
1633             row->r16.fTagsIdx    = 0;
1634             for (col=0; col<catCount; col++) {
1635                 row->r16.fNextState[col] = rowString->charAt(col);
1636             }
1637         }
1638     }
1639 }
1640 
1641 
1642 
1643 
1644 //-----------------------------------------------------------------------------
1645 //
1646 //   printSet    Debug function.   Print the contents of a UVector
1647 //
1648 //-----------------------------------------------------------------------------
1649 #ifdef RBBI_DEBUG
printSet(UVector * s)1650 void RBBITableBuilder::printSet(UVector *s) {
1651     int32_t  i;
1652     for (i=0; i<s->size(); i++) {
1653         const RBBINode *v = static_cast<const RBBINode *>(s->elementAt(i));
1654         RBBIDebugPrintf("%5d", v==NULL? -1 : v->fSerialNum);
1655     }
1656     RBBIDebugPrintf("\n");
1657 }
1658 #endif
1659 
1660 
1661 //-----------------------------------------------------------------------------
1662 //
1663 //   printStates    Debug Function.  Dump the fully constructed state transition table.
1664 //
1665 //-----------------------------------------------------------------------------
1666 #ifdef RBBI_DEBUG
printStates()1667 void RBBITableBuilder::printStates() {
1668     int     c;    // input "character"
1669     int     n;    // state number
1670 
1671     RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
1672     RBBIDebugPrintf("      | Acc  LA    Tag");
1673     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1674         RBBIDebugPrintf(" %3d", c);
1675     }
1676     RBBIDebugPrintf("\n");
1677     RBBIDebugPrintf("      |---------------");
1678     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1679         RBBIDebugPrintf("----");
1680     }
1681     RBBIDebugPrintf("\n");
1682 
1683     for (n=0; n<fDStates->size(); n++) {
1684         RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
1685         RBBIDebugPrintf("  %3d | " , n);
1686         RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
1687         for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1688             RBBIDebugPrintf(" %3d", sd->fDtran->elementAti(c));
1689         }
1690         RBBIDebugPrintf("\n");
1691     }
1692     RBBIDebugPrintf("\n\n");
1693 }
1694 #endif
1695 
1696 
1697 //-----------------------------------------------------------------------------
1698 //
1699 //   printReverseTable    Debug Function.  Dump the fully constructed safe table.
1700 //
1701 //-----------------------------------------------------------------------------
1702 #ifdef RBBI_DEBUG
printReverseTable()1703 void RBBITableBuilder::printReverseTable() {
1704     int     c;    // input "character"
1705     int     n;    // state number
1706 
1707     RBBIDebugPrintf("    Safe Reverse Table \n");
1708     if (fSafeTable == nullptr) {
1709         RBBIDebugPrintf("   --- nullptr ---\n");
1710         return;
1711     }
1712     RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
1713     RBBIDebugPrintf("      | Acc  LA    Tag");
1714     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1715         RBBIDebugPrintf(" %2d", c);
1716     }
1717     RBBIDebugPrintf("\n");
1718     RBBIDebugPrintf("      |---------------");
1719     for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1720         RBBIDebugPrintf("---");
1721     }
1722     RBBIDebugPrintf("\n");
1723 
1724     for (n=0; n<fSafeTable->size(); n++) {
1725         UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(n);
1726         RBBIDebugPrintf("  %3d | " , n);
1727         RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0);  // Accepting, LookAhead, Tags
1728         for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1729             RBBIDebugPrintf(" %2d", rowString->charAt(c));
1730         }
1731         RBBIDebugPrintf("\n");
1732     }
1733     RBBIDebugPrintf("\n\n");
1734 }
1735 #endif
1736 
1737 
1738 
1739 //-----------------------------------------------------------------------------
1740 //
1741 //   printRuleStatusTable    Debug Function.  Dump the common rule status table
1742 //
1743 //-----------------------------------------------------------------------------
1744 #ifdef RBBI_DEBUG
printRuleStatusTable()1745 void RBBITableBuilder::printRuleStatusTable() {
1746     int32_t  thisRecord = 0;
1747     int32_t  nextRecord = 0;
1748     int      i;
1749     UVector  *tbl = fRB->fRuleStatusVals;
1750 
1751     RBBIDebugPrintf("index |  tags \n");
1752     RBBIDebugPrintf("-------------------\n");
1753 
1754     while (nextRecord < tbl->size()) {
1755         thisRecord = nextRecord;
1756         nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
1757         RBBIDebugPrintf("%4d   ", thisRecord);
1758         for (i=thisRecord+1; i<nextRecord; i++) {
1759             RBBIDebugPrintf("  %5d", tbl->elementAti(i));
1760         }
1761         RBBIDebugPrintf("\n");
1762     }
1763     RBBIDebugPrintf("\n\n");
1764 }
1765 #endif
1766 
1767 
1768 //-----------------------------------------------------------------------------
1769 //
1770 //   RBBIStateDescriptor     Methods.  This is a very struct-like class
1771 //                           Most access is directly to the fields.
1772 //
1773 //-----------------------------------------------------------------------------
1774 
RBBIStateDescriptor(int lastInputSymbol,UErrorCode * fStatus)1775 RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
1776     fMarked    = FALSE;
1777     fAccepting = 0;
1778     fLookAhead = 0;
1779     fTagsIdx   = 0;
1780     fTagVals   = NULL;
1781     fPositions = NULL;
1782     fDtran     = NULL;
1783 
1784     fDtran     = new UVector32(lastInputSymbol+1, *fStatus);
1785     if (U_FAILURE(*fStatus)) {
1786         return;
1787     }
1788     if (fDtran == NULL) {
1789         *fStatus = U_MEMORY_ALLOCATION_ERROR;
1790         return;
1791     }
1792     fDtran->setSize(lastInputSymbol+1);    // fDtran needs to be pre-sized.
1793                                            //   It is indexed by input symbols, and will
1794                                            //   hold  the next state number for each
1795                                            //   symbol.
1796 }
1797 
1798 
~RBBIStateDescriptor()1799 RBBIStateDescriptor::~RBBIStateDescriptor() {
1800     delete       fPositions;
1801     delete       fDtran;
1802     delete       fTagVals;
1803     fPositions = NULL;
1804     fDtran     = NULL;
1805     fTagVals   = NULL;
1806 }
1807 
1808 U_NAMESPACE_END
1809 
1810 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1811